## Package Imports

In [104]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk import pos_tag
from nltk.corpus import wordnet

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Format Data

In [105]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [106]:
def corpus_to_df(corpus):

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## DF Level Cleaning

In [107]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)
URL_RE = re.compile(r"http\S+|www\.\S+")
HAS_LETTER_RE = re.compile(r"[A-Za-z]")

## Cleaning

In [108]:
def clean_text(text):

    # remove urls
    text = URL_RE.sub("", text)

    return text

In [109]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [119]:
def get_wordnet_pos(treebank_tag):
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [112]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    tagged = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in tagged]

    return lemmatized_tokens

In [113]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation and numbers
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''
    
    # clean text
    text = clean_text(text)

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue

        # only keep alphabetic tokens
        if tok.isalpha():
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Preprocess Function

In [114]:
def preprocess_df(df):
    # remove deleted/removed utterances
    df = df[~df["text"].str.lower().isin({"[deleted]", "[removed]"})]

    # remove bot authored utterances
    df = df[~df["text"].str.contains(BOT_TEXT_RE)]

    # remove utterances without a letter
    df = df[df["text"].str.contains(HAS_LETTER_RE)]

    # tokenize
    df["tokens"] = df["text"].apply(tokenize)

    # lemmatize
    df["lemmas"] = df["tokens"].apply(lemmatize)

    # final
    df["final"] = df["text"].apply(clean_tokens_lexical)

    return df

## Testing

In [120]:
import random

In [121]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 100)

In [122]:
pd.set_option('display.max_rows', 100)

In [123]:
df = corpus_to_df(corpus)
df = preprocess_df(df)

df

  df = df[~df["text"].str.contains(BOT_TEXT_RE)]


Unnamed: 0,utterance_id,speaker_id,text,timestamp,tokens,lemmas,final
0,nyx4d,reddmau5,I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?,2012-01-01 16:18:18,"[I, was, just, reading, about, the, Princeton, Mic-Check, and, it's, getting, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interesting, coming, up, ?]","[I, be, just, read, about, the, Princeton, Mic-Check, and, it's, get, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interest, come, up, ?]","[i, be, just, read, about, the, princeton, and, get, national, press, i, want, to, get, a, sense, of, what, people, felt, like, around, campus, anything, interesting, happen, anything, interest, come, up]"
1,o0145,shtylman,"I have added support for Cornell to courseoff.com (https://cornell.courseoff.com). Courseoff is a free web app to help you plan your semester schedules. It is very popular with students at some of the other schools I support.\n\nNo signup is required to use it so feel free to try it out! You can create an account which allows multiple schedules, saving schedules, and sharing schedules.\n\nLet me know what you guys think! Any feedback is always appreciated. If you like it, tell your friends :) If you find a problem, let me know as well.",2012-01-02 13:57:15,"[I, have, added, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, is, a, free, web, app, to, help, you, plan, your, semester, schedules, ., It, is, very, popular, with, students, at, some, of, the, other, schools, I, support, ., No, signup, is, required, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allows, multiple, schedules, ,, saving, schedules, ,, and, sharing, schedules, ., Let, me, know, what, you, guys, think, !, Any, feedback, is, always, appreciated, ., If, you, like, it, ,, tell, your, friends, :), If, you, find, a, ...]","[I, have, add, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, ., It, be, very, popular, with, student, at, some, of, the, other, school, I, support, ., No, signup, be, require, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allow, multiple, schedule, ,, save, schedule, ,, and, share, schedule, ., Let, me, know, what, you, guy, think, !, Any, feedback, be, always, appreciate, ., If, you, like, it, ,, tell, your, friend, :), If, you, find, a, ...]","[i, have, add, support, for, cornell, to, courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, it, be, very, popular, with, student, at, some, of, the, other, school, i, support, no, signup, be, require, to, use, it, so, feel, free, to, try, it, out, you, can, create, an, account, which, allow, multiple, schedule, save, schedule, and, share, schedule, let, me, know, what, you, guy, think, any, feedback, be, always, appreciate, if, you, like, it, tell, your, friend, if, you, find, a, problem, let, me, know, as, well]"
2,o1gca,moon_river,"i don't have a facebook, so we'd need a volunteer.. just someone to let cornell on facebook know that we have a presence on reddit.. perhaps a small explanation of what reddit is? now that we are almost beautiful and such.. we need more redditors!",2012-01-03 14:55:06,"[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, is, ?, now, that, we, are, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, be, ?, now, that, we, be, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, have, a, facebook, so, need, a, volunteer, just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, perhaps, a, small, explanation, of, what, reddit, be, now, that, we, be, almost, beautiful, and, such, we, need, more, redditors]"
3,o0ss4,moon_river,"so, i'm starting to mess with some of the css on our lovely subreddit.. anyone have any fun suggestions about our little envelope? or up/downvote things? GO NUTS.",2012-01-03 01:16:17,"[so, ,, i'm, starting, to, mess, with, some, of, the, css, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestions, about, our, little, envelope, ?, or, up, /, downvote, things, ?, GO, NUTS, .]","[so, ,, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestion, about, our, little, envelope, ?, or, up, /, downvote, thing, ?, GO, NUTS, .]","[so, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, anyone, have, any, fun, suggestion, about, our, little, envelope, or, up, downvote, thing, go, nuts]"
4,o4ipd,reddmau5,"Ever since SOPA put fear into the hearts of everyone that loves the internet, it looks like [The DarkNet Plan](http://www.reddit.com/r/darknetplan) has grown by the thousands and even got [national media attention](http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/).\n\nWhat is the feasibility of doing that for our big red campus?\n\nRelevant: I miss DC++",2012-01-05 17:08:06,"[Ever, since, SOPA, put, fear, into, the, hearts, of, everyone, that, loves, the, internet, ,, it, looks, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), has, grown, by, the, thousands, and, even, got, [, national, media, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, is, the, feasibility, of, doing, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[Ever, since, SOPA, put, fear, into, the, heart, of, everyone, that, love, the, internet, ,, it, look, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), have, grow, by, the, thousand, and, even, get, [, national, medium, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, be, the, feasibility, of, do, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[ever, since, sopa, put, fear, into, the, heart, of, everyone, that, love, the, internet, it, look, like, the, darknet, plan, have, grow, by, the, thousand, and, even, get, national, medium, attention, what, be, the, feasibility, of, do, that, for, our, big, red, campus, relevant, i, miss, dc]"
...,...,...,...,...,...,...,...
72981,e8tj9ig,JCsurfing,"If your ECs are good, maybe.",2018-10-31 19:39:29,"[If, your, ECs, are, good, ,, maybe, .]","[If, your, ECs, be, good, ,, maybe, .]","[if, your, ec, be, good, maybe]"
72983,e8tjyg1,ultimatefishlover,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼ ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ**,2018-10-31 19:50:51,"[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the]"
72984,e8tkb66,KickAssEmployee,Agreed,2018-10-31 19:56:45,[Agreed],[Agreed],[agree]
72985,e8tkctl,dasfsadf123,Why did this make me laugh so hard ahaahha,2018-10-31 19:57:30,"[Why, did, this, make, me, laugh, so, hard, ahaahha]","[Why, do, this, make, me, laugh, so, hard, ahaahha]","[why, do, this, make, me, laugh, so, hard, ahaahha]"


In [124]:
df_text = df[["text", "tokens", "lemmas", "final"]]
df_text

Unnamed: 0,text,tokens,lemmas,final
0,I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?,"[I, was, just, reading, about, the, Princeton, Mic-Check, and, it's, getting, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interesting, coming, up, ?]","[I, be, just, read, about, the, Princeton, Mic-Check, and, it's, get, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interest, come, up, ?]","[i, be, just, read, about, the, princeton, and, get, national, press, i, want, to, get, a, sense, of, what, people, felt, like, around, campus, anything, interesting, happen, anything, interest, come, up]"
1,"I have added support for Cornell to courseoff.com (https://cornell.courseoff.com). Courseoff is a free web app to help you plan your semester schedules. It is very popular with students at some of the other schools I support.\n\nNo signup is required to use it so feel free to try it out! You can create an account which allows multiple schedules, saving schedules, and sharing schedules.\n\nLet me know what you guys think! Any feedback is always appreciated. If you like it, tell your friends :) If you find a problem, let me know as well.","[I, have, added, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, is, a, free, web, app, to, help, you, plan, your, semester, schedules, ., It, is, very, popular, with, students, at, some, of, the, other, schools, I, support, ., No, signup, is, required, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allows, multiple, schedules, ,, saving, schedules, ,, and, sharing, schedules, ., Let, me, know, what, you, guys, think, !, Any, feedback, is, always, appreciated, ., If, you, like, it, ,, tell, your, friends, :), If, you, find, a, ...]","[I, have, add, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, ., It, be, very, popular, with, student, at, some, of, the, other, school, I, support, ., No, signup, be, require, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allow, multiple, schedule, ,, save, schedule, ,, and, share, schedule, ., Let, me, know, what, you, guy, think, !, Any, feedback, be, always, appreciate, ., If, you, like, it, ,, tell, your, friend, :), If, you, find, a, ...]","[i, have, add, support, for, cornell, to, courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, it, be, very, popular, with, student, at, some, of, the, other, school, i, support, no, signup, be, require, to, use, it, so, feel, free, to, try, it, out, you, can, create, an, account, which, allow, multiple, schedule, save, schedule, and, share, schedule, let, me, know, what, you, guy, think, any, feedback, be, always, appreciate, if, you, like, it, tell, your, friend, if, you, find, a, problem, let, me, know, as, well]"
2,"i don't have a facebook, so we'd need a volunteer.. just someone to let cornell on facebook know that we have a presence on reddit.. perhaps a small explanation of what reddit is? now that we are almost beautiful and such.. we need more redditors!","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, is, ?, now, that, we, are, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, be, ?, now, that, we, be, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, have, a, facebook, so, need, a, volunteer, just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, perhaps, a, small, explanation, of, what, reddit, be, now, that, we, be, almost, beautiful, and, such, we, need, more, redditors]"
3,"so, i'm starting to mess with some of the css on our lovely subreddit.. anyone have any fun suggestions about our little envelope? or up/downvote things? GO NUTS.","[so, ,, i'm, starting, to, mess, with, some, of, the, css, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestions, about, our, little, envelope, ?, or, up, /, downvote, things, ?, GO, NUTS, .]","[so, ,, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestion, about, our, little, envelope, ?, or, up, /, downvote, thing, ?, GO, NUTS, .]","[so, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, anyone, have, any, fun, suggestion, about, our, little, envelope, or, up, downvote, thing, go, nuts]"
4,"Ever since SOPA put fear into the hearts of everyone that loves the internet, it looks like [The DarkNet Plan](http://www.reddit.com/r/darknetplan) has grown by the thousands and even got [national media attention](http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/).\n\nWhat is the feasibility of doing that for our big red campus?\n\nRelevant: I miss DC++","[Ever, since, SOPA, put, fear, into, the, hearts, of, everyone, that, loves, the, internet, ,, it, looks, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), has, grown, by, the, thousands, and, even, got, [, national, media, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, is, the, feasibility, of, doing, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[Ever, since, SOPA, put, fear, into, the, heart, of, everyone, that, love, the, internet, ,, it, look, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), have, grow, by, the, thousand, and, even, get, [, national, medium, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, be, the, feasibility, of, do, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[ever, since, sopa, put, fear, into, the, heart, of, everyone, that, love, the, internet, it, look, like, the, darknet, plan, have, grow, by, the, thousand, and, even, get, national, medium, attention, what, be, the, feasibility, of, do, that, for, our, big, red, campus, relevant, i, miss, dc]"
...,...,...,...,...
72981,"If your ECs are good, maybe.","[If, your, ECs, are, good, ,, maybe, .]","[If, your, ECs, be, good, ,, maybe, .]","[if, your, ec, be, good, maybe]"
72983,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼ ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ**,"[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the]"
72984,Agreed,[Agreed],[Agreed],[agree]
72985,Why did this make me laugh so hard ahaahha,"[Why, did, this, make, me, laugh, so, hard, ahaahha]","[Why, do, this, make, me, laugh, so, hard, ahaahha]","[why, do, this, make, me, laugh, so, hard, ahaahha]"


In [125]:
pd.set_option('display.max_colwidth', None)
df_text.sample(50)

Unnamed: 0,text,tokens,lemmas,final
25789,"Don't know if a younger premed can chime in, but at least when I took psych 1101, it didn't correspond well with the experimental psych section (I took the old MCAT). I think I heard that you will need to do some extra work on your own for MCAT psych.","[Don't, know, if, a, younger, premed, can, chime, in, ,, but, at, least, when, I, took, psych, 1101, ,, it, didn't, correspond, well, with, the, experimental, psych, section, (, I, took, the, old, MCAT, ), ., I, think, I, heard, that, you, will, need, to, do, some, extra, work, on, your, own, for, MCAT, psych, .]","[Don't, know, if, a, young, premed, can, chime, in, ,, but, at, least, when, I, take, psych, 1101, ,, it, didn't, correspond, well, with, the, experimental, psych, section, (, I, take, the, old, MCAT, ), ., I, think, I, hear, that, you, will, need, to, do, some, extra, work, on, your, own, for, MCAT, psych, .]","[know, if, a, young, premed, can, chime, in, but, at, least, when, i, take, psych, it, correspond, well, with, the, experimental, psych, section, i, take, the, old, mcat, i, think, i, hear, that, you, will, need, to, do, some, extra, work, on, your, own, for, mcat, psych]"
38749,Professor White just sent out an email offering people consultant positions so I would check your cornell email.,"[Professor, White, just, sent, out, an, email, offering, people, consultant, positions, so, I, would, check, your, cornell, email, .]","[Professor, White, just, send, out, an, email, offering, people, consultant, position, so, I, would, check, your, cornell, email, .]","[professor, white, just, send, out, an, email, offering, people, consultant, position, so, i, would, check, your, cornell, email]"
21156,"It'll vary by class whether or not an older edition is fine, so I'd ask the professor for each class. And I don't know if this is the best place for it, but people buy sell textbooks on the official Facebook groups fairly often, so you could try there.","[It'll, vary, by, class, whether, or, not, an, older, edition, is, fine, ,, so, I'd, ask, the, professor, for, each, class, ., And, I, don't, know, if, this, is, the, best, place, for, it, ,, but, people, buy, sell, textbooks, on, the, official, Facebook, groups, fairly, often, ,, so, you, could, try, there, .]","[It'll, vary, by, class, whether, or, not, an, old, edition, be, fine, ,, so, I'd, ask, the, professor, for, each, class, ., And, I, don't, know, if, this, be, the, best, place, for, it, ,, but, people, buy, sell, textbook, on, the, official, Facebook, group, fairly, often, ,, so, you, could, try, there, .]","[vary, by, class, whether, or, not, an, old, edition, be, fine, so, ask, the, professor, for, each, class, and, i, know, if, this, be, the, best, place, for, it, but, people, buy, sell, textbook, on, the, official, facebook, group, fairly, often, so, you, could, try, there]"
36291,"Current ChemE rising junior that loves research here, I'd say to start as soon as you can! Research is such a valuable thing to spend your time on at Cornell, not just for the benefits mentioned above but for the satisfaction of contributing to something greater than yourself! Find a professor in the ChemE department (or any department really) whose research interests you and email them saying:\n\nWho you are\nWhy their research interests you\nCan I come by your office to talk about it more\n\nThis is your best bet because if they do have positions open, you can have a face to face way of asking about it which puts your face in their head making your chances better. If they don't have any positions open, you'll still have made that connection by talking in person and they can point you to another faculty member that might have an open spot or might even better suit your interests! Good luck finding a lab, shoot me a message if you want more personalized help!","[Current, ChemE, rising, junior, that, loves, research, here, ,, I'd, say, to, start, as, soon, as, you, can, !, Research, is, such, a, valuable, thing, to, spend, your, time, on, at, Cornell, ,, not, just, for, the, benefits, mentioned, above, but, for, the, satisfaction, of, contributing, to, something, greater, than, yourself, !, Find, a, professor, in, the, ChemE, department, (, or, any, department, really, ), whose, research, interests, you, and, email, them, saying, :, Who, you, are, Why, their, research, interests, you, Can, I, come, by, your, office, to, talk, about, it, more, This, is, your, best, bet, because, if, ...]","[Current, ChemE, rise, junior, that, love, research, here, ,, I'd, say, to, start, as, soon, a, you, can, !, Research, be, such, a, valuable, thing, to, spend, your, time, on, at, Cornell, ,, not, just, for, the, benefit, mention, above, but, for, the, satisfaction, of, contribute, to, something, great, than, yourself, !, Find, a, professor, in, the, ChemE, department, (, or, any, department, really, ), whose, research, interest, you, and, email, them, say, :, Who, you, be, Why, their, research, interest, you, Can, I, come, by, your, office, to, talk, about, it, more, This, be, your, best, bet, because, if, ...]","[current, cheme, rise, junior, that, love, research, here, say, to, start, as, soon, a, you, can, research, be, such, a, valuable, thing, to, spend, your, time, on, at, cornell, not, just, for, the, benefit, mention, above, but, for, the, satisfaction, of, contribute, to, something, great, than, yourself, find, a, professor, in, the, cheme, department, or, any, department, really, whose, research, interest, you, and, email, them, say, who, you, be, why, their, research, interest, you, can, i, come, by, your, office, to, talk, about, it, more, this, be, your, best, bet, because, if, they, do, have, position, open, you, can, have, ...]"
33411,"They call me Seymour Butts, cause I get mo' ass than most\n\nThey say I'm next and got that butter love, and get too close\n\nFollow the leader cause I'm meaner than medula oblongota\n\nMy Tribe's on more Quests than Midnight Marauders","[They, call, me, Seymour, Butts, ,, cause, I, get, mo, ', ass, than, most, They, say, I'm, next, and, got, that, butter, love, ,, and, get, too, close, Follow, the, leader, cause, I'm, meaner, than, medula, oblongota, My, Tribe's, on, more, Quests, than, Midnight, Marauders]","[They, call, me, Seymour, Butts, ,, cause, I, get, mo, ', as, than, most, They, say, I'm, next, and, get, that, butter, love, ,, and, get, too, close, Follow, the, leader, cause, I'm, meaner, than, medula, oblongota, My, Tribe's, on, more, Quests, than, Midnight, Marauders]","[they, call, me, seymour, butt, cause, i, get, mo, as, than, most, they, say, next, and, get, that, butter, love, and, get, too, close, follow, the, leader, cause, meaner, than, medula, oblongota, my, on, more, quest, than, midnight, marauder]"
30745,"It might? It won't hurt at least. Cornell has a very very generous drop policy (drop without question up until October 18th, and a fairly easy petition for drop up until November 10th). For reference, your second prelim will typically be a week before November 10th, and finals are on December 10th.\n\nI'd sign up for the class if you're interested at all, and if you find that you can't/are not willing to handle the 15 hour a week homework assignments, drop it. I'd actually suggest that for most classes at Cornell.\n\nOne more thing. When I say 15 hour a week homework assignments, I mean it. I don't know what kind of math background you have, but in the past, I've always disregarded these kinds of warnings. Calc teacher said the class would take at least an hour a day (it didn't), Lin Alg professor at community college said the class would take 2 hours a day (it didn't), etc. Even at Cornell, I've found classes like CS 2800 and CS 3110 to take significantly less time than most people quote. It's only MATH 2230 that's lived up to the billed workload.\n\nIf you have any more questions, feel free to send me a PM or add me on Facebook or something like that. I had a lot of questions when I first got to Cornell, and some very helpful people really helped me get my orientation.","[It, might, ?, It, won't, hurt, at, least, ., Cornell, has, a, very, very, generous, drop, policy, (, drop, without, question, up, until, October, 18th, ,, and, a, fairly, easy, petition, for, drop, up, until, November, 10th, ), ., For, reference, ,, your, second, prelim, will, typically, be, a, week, before, November, 10th, ,, and, finals, are, on, December, 10th, ., I'd, sign, up, for, the, class, if, you're, interested, at, all, ,, and, if, you, find, that, you, can't, /, are, not, willing, to, handle, the, 15, hour, a, week, homework, assignments, ,, drop, it, ., I'd, actually, suggest, ...]","[It, might, ?, It, won't, hurt, at, least, ., Cornell, have, a, very, very, generous, drop, policy, (, drop, without, question, up, until, October, 18th, ,, and, a, fairly, easy, petition, for, drop, up, until, November, 10th, ), ., For, reference, ,, your, second, prelim, will, typically, be, a, week, before, November, 10th, ,, and, final, be, on, December, 10th, ., I'd, sign, up, for, the, class, if, you're, interested, at, all, ,, and, if, you, find, that, you, can't, /, be, not, willing, to, handle, the, 15, hour, a, week, homework, assignment, ,, drop, it, ., I'd, actually, suggest, ...]","[it, might, it, hurt, at, least, cornell, have, a, very, very, generous, drop, policy, drop, without, question, up, until, october, and, a, fairly, easy, petition, for, drop, up, until, november, for, reference, your, second, prelim, will, typically, be, a, week, before, november, and, final, be, on, december, sign, up, for, the, class, if, interested, at, all, and, if, you, find, that, you, be, not, willing, to, handle, the, hour, a, week, homework, assignment, drop, it, actually, suggest, that, for, most, class, at, cornell, one, more, thing, when, i, say, hour, a, week, homework, assignment, i, mean, it, i, know, what, ...]"
54740,"There’s parking permits for North Campus - they’re not cheap (~$750 per year I think). You can also rent parking spaces from houses in the area, although that’s more prevalent on West and in Collegetown.","[There, ’, s, parking, permits, for, North, Campus, -, they, ’, re, not, cheap, (, ~, $, 750, per, year, I, think, ), ., You, can, also, rent, parking, spaces, from, houses, in, the, area, ,, although, that, ’, s, more, prevalent, on, West, and, in, Collegetown, .]","[There, ’, s, park, permit, for, North, Campus, -, they, ’, re, not, cheap, (, ~, $, 750, per, year, I, think, ), ., You, can, also, rent, park, space, from, house, in, the, area, ,, although, that, ’, s, more, prevalent, on, West, and, in, Collegetown, .]","[there, s, parking, permit, for, north, campus, they, re, not, cheap, per, year, i, think, you, can, also, rent, park, space, from, house, in, the, area, although, that, s, more, prevalent, on, west, and, in, collegetown]"
7671,"Short of cooking, what are some cheap and filling options for lunch around campus/ctown? Right now the best I've found is $6 pasta at the little cafeteria style place in Statler.","[Short, of, cooking, ,, what, are, some, cheap, and, filling, options, for, lunch, around, campus, /, ctown, ?, Right, now, the, best, I've, found, is, $, 6, pasta, at, the, little, cafeteria, style, place, in, Statler, .]","[Short, of, cooking, ,, what, be, some, cheap, and, filling, option, for, lunch, around, campus, /, ctown, ?, Right, now, the, best, I've, find, be, $, 6, pasta, at, the, little, cafeteria, style, place, in, Statler, .]","[short, of, cook, what, be, some, cheap, and, filling, option, for, lunch, around, campus, ctown, right, now, the, best, found, be, pasta, at, the, little, cafeteria, style, place, in, statler]"
54364,"I'm sorry you feel that way?\n\nLife can be good here, but generally people who are having a good time don't have any reason to post. Go talk to some graduating seniors and you might get a different impression. \n\nIf you asked me why I'm here, it's because I love the things I'm learning and the amazing people around me. I do the things I do because they interest me or make me happy. You're not wrong that life isn't all about grades. Maybe try to go out and enjoy it a bit more if you can?","[I'm, sorry, you, feel, that, way, ?, Life, can, be, good, here, ,, but, generally, people, who, are, having, a, good, time, don't, have, any, reason, to, post, ., Go, talk, to, some, graduating, seniors, and, you, might, get, a, different, impression, ., If, you, asked, me, why, I'm, here, ,, it's, because, I, love, the, things, I'm, learning, and, the, amazing, people, around, me, ., I, do, the, things, I, do, because, they, interest, me, or, make, me, happy, ., You're, not, wrong, that, life, isn't, all, about, grades, ., Maybe, try, to, go, out, and, enjoy, it, a, ...]","[I'm, sorry, you, feel, that, way, ?, Life, can, be, good, here, ,, but, generally, people, who, be, have, a, good, time, don't, have, any, reason, to, post, ., Go, talk, to, some, graduating, senior, and, you, might, get, a, different, impression, ., If, you, ask, me, why, I'm, here, ,, it's, because, I, love, the, thing, I'm, learning, and, the, amazing, people, around, me, ., I, do, the, thing, I, do, because, they, interest, me, or, make, me, happy, ., You're, not, wrong, that, life, isn't, all, about, grade, ., Maybe, try, to, go, out, and, enjoy, it, a, ...]","[sorry, you, feel, that, way, life, can, be, good, here, but, generally, people, who, be, have, a, good, time, have, any, reason, to, post, go, talk, to, some, graduating, senior, and, you, might, get, a, different, impression, if, you, ask, me, why, here, because, i, love, the, thing, learning, and, the, amazing, people, around, me, i, do, the, thing, i, do, because, they, interest, me, or, make, me, happy, not, wrong, that, life, all, about, grade, maybe, try, to, go, out, and, enjoy, it, a, bit, more, if, you, can]"
71738,I'm actually involved in Cornell Minds Matter and Women in Computing so I don't want to try new clubs at this point but I don't know how to introduce myself to new people because I get very nervous.,"[I'm, actually, involved, in, Cornell, Minds, Matter, and, Women, in, Computing, so, I, don't, want, to, try, new, clubs, at, this, point, but, I, don't, know, how, to, introduce, myself, to, new, people, because, I, get, very, nervous, .]","[I'm, actually, involve, in, Cornell, Minds, Matter, and, Women, in, Computing, so, I, don't, want, to, try, new, club, at, this, point, but, I, don't, know, how, to, introduce, myself, to, new, people, because, I, get, very, nervous, .]","[actually, involve, in, cornell, mind, matter, and, woman, in, compute, so, i, want, to, try, new, club, at, this, point, but, i, know, how, to, introduce, myself, to, new, people, because, i, get, very, nervous]"
