## Package Imports

In [104]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk import pos_tag
from nltk.corpus import wordnet

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Format Data

In [105]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [106]:
def corpus_to_df(corpus):
    '''Function to convert the convokit corpus to a pandas dataframe structure.'''

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## DF Level Cleaning

In [107]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)
URL_RE = re.compile(r"http\S+|www\.\S+")
HAS_LETTER_RE = re.compile(r"[A-Za-z]")

## Cleaning

In [108]:
def clean_text(text):
    '''Helper function to clean text by removing urls and other undesirable features.'''

    # remove urls
    text = URL_RE.sub("", text)

    return text

In [109]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [119]:
def get_wordnet_pos(treebank_tag):
    '''Helper function to map treebank-based POS tags to wordnet POS tags.'''
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    
    # otherwise default to noun
    else:
        return wordnet.NOUN

In [112]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    tagged = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in tagged]

    return lemmatized_tokens

In [127]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation and numbers
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''
    
    # clean text
    text = clean_text(text)

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue

        # only keep alphabetic tokens, including contractions
        if re.fullmatch(r"[A-Za-z]+(?:['’][A-Za-z]+)*", tok):
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Preprocess Function

In [114]:
def preprocess_df(df):
    # remove deleted/removed utterances
    df = df[~df["text"].str.lower().isin({"[deleted]", "[removed]"})]

    # remove bot authored utterances
    df = df[~df["text"].str.contains(BOT_TEXT_RE)]

    # remove utterances without a letter
    df = df[df["text"].str.contains(HAS_LETTER_RE)]

    # tokenize
    df["tokens"] = df["text"].apply(tokenize)

    # lemmatize
    df["lemmas"] = df["tokens"].apply(lemmatize)

    # final
    df["final"] = df["text"].apply(clean_tokens_lexical)

    return df

## Testing

In [128]:
import random

In [129]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 100)

In [130]:
pd.set_option('display.max_rows', 100)

In [131]:
df = corpus_to_df(corpus)
df = preprocess_df(df)

df

  df = df[~df["text"].str.contains(BOT_TEXT_RE)]


Unnamed: 0,utterance_id,speaker_id,text,timestamp,tokens,lemmas,final
0,nyx4d,reddmau5,I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?,2012-01-01 16:18:18,"[I, was, just, reading, about, the, Princeton, Mic-Check, and, it's, getting, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interesting, coming, up, ?]","[I, be, just, read, about, the, Princeton, Mic-Check, and, it's, get, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interest, come, up, ?]","[i, be, just, read, about, the, princeton, and, it's, get, national, press, i, want, to, get, a, sense, of, what, people, felt, like, around, campus, anything, interesting, happen, anything, interest, come, up]"
1,o0145,shtylman,"I have added support for Cornell to courseoff.com (https://cornell.courseoff.com). Courseoff is a free web app to help you plan your semester schedules. It is very popular with students at some of the other schools I support.\n\nNo signup is required to use it so feel free to try it out! You can create an account which allows multiple schedules, saving schedules, and sharing schedules.\n\nLet me know what you guys think! Any feedback is always appreciated. If you like it, tell your friends :) If you find a problem, let me know as well.",2012-01-02 13:57:15,"[I, have, added, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, is, a, free, web, app, to, help, you, plan, your, semester, schedules, ., It, is, very, popular, with, students, at, some, of, the, other, schools, I, support, ., No, signup, is, required, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allows, multiple, schedules, ,, saving, schedules, ,, and, sharing, schedules, ., Let, me, know, what, you, guys, think, !, Any, feedback, is, always, appreciated, ., If, you, like, it, ,, tell, your, friends, :), If, you, find, a, ...]","[I, have, add, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, ., It, be, very, popular, with, student, at, some, of, the, other, school, I, support, ., No, signup, be, require, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allow, multiple, schedule, ,, save, schedule, ,, and, share, schedule, ., Let, me, know, what, you, guy, think, !, Any, feedback, be, always, appreciate, ., If, you, like, it, ,, tell, your, friend, :), If, you, find, a, ...]","[i, have, add, support, for, cornell, to, courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, it, be, very, popular, with, student, at, some, of, the, other, school, i, support, no, signup, be, require, to, use, it, so, feel, free, to, try, it, out, you, can, create, an, account, which, allow, multiple, schedule, save, schedule, and, share, schedule, let, me, know, what, you, guy, think, any, feedback, be, always, appreciate, if, you, like, it, tell, your, friend, if, you, find, a, problem, let, me, know, as, well]"
2,o1gca,moon_river,"i don't have a facebook, so we'd need a volunteer.. just someone to let cornell on facebook know that we have a presence on reddit.. perhaps a small explanation of what reddit is? now that we are almost beautiful and such.. we need more redditors!",2012-01-03 14:55:06,"[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, is, ?, now, that, we, are, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, be, ?, now, that, we, be, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, so, we'd, need, a, volunteer, just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, perhaps, a, small, explanation, of, what, reddit, be, now, that, we, be, almost, beautiful, and, such, we, need, more, redditors]"
3,o0ss4,moon_river,"so, i'm starting to mess with some of the css on our lovely subreddit.. anyone have any fun suggestions about our little envelope? or up/downvote things? GO NUTS.",2012-01-03 01:16:17,"[so, ,, i'm, starting, to, mess, with, some, of, the, css, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestions, about, our, little, envelope, ?, or, up, /, downvote, things, ?, GO, NUTS, .]","[so, ,, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestion, about, our, little, envelope, ?, or, up, /, downvote, thing, ?, GO, NUTS, .]","[so, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, anyone, have, any, fun, suggestion, about, our, little, envelope, or, up, downvote, thing, go, nuts]"
4,o4ipd,reddmau5,"Ever since SOPA put fear into the hearts of everyone that loves the internet, it looks like [The DarkNet Plan](http://www.reddit.com/r/darknetplan) has grown by the thousands and even got [national media attention](http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/).\n\nWhat is the feasibility of doing that for our big red campus?\n\nRelevant: I miss DC++",2012-01-05 17:08:06,"[Ever, since, SOPA, put, fear, into, the, hearts, of, everyone, that, loves, the, internet, ,, it, looks, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), has, grown, by, the, thousands, and, even, got, [, national, media, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, is, the, feasibility, of, doing, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[Ever, since, SOPA, put, fear, into, the, heart, of, everyone, that, love, the, internet, ,, it, look, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), have, grow, by, the, thousand, and, even, get, [, national, medium, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, be, the, feasibility, of, do, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[ever, since, sopa, put, fear, into, the, heart, of, everyone, that, love, the, internet, it, look, like, the, darknet, plan, have, grow, by, the, thousand, and, even, get, national, medium, attention, what, be, the, feasibility, of, do, that, for, our, big, red, campus, relevant, i, miss, dc]"
...,...,...,...,...,...,...,...
72981,e8tj9ig,JCsurfing,"If your ECs are good, maybe.",2018-10-31 19:39:29,"[If, your, ECs, are, good, ,, maybe, .]","[If, your, ECs, be, good, ,, maybe, .]","[if, your, ec, be, good, maybe]"
72983,e8tjyg1,ultimatefishlover,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼ ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ**,2018-10-31 19:50:51,"[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the]"
72984,e8tkb66,KickAssEmployee,Agreed,2018-10-31 19:56:45,[Agreed],[Agreed],[agree]
72985,e8tkctl,dasfsadf123,Why did this make me laugh so hard ahaahha,2018-10-31 19:57:30,"[Why, did, this, make, me, laugh, so, hard, ahaahha]","[Why, do, this, make, me, laugh, so, hard, ahaahha]","[why, do, this, make, me, laugh, so, hard, ahaahha]"


In [132]:
df_text = df[["text", "tokens", "lemmas", "final"]]
df_text

Unnamed: 0,text,tokens,lemmas,final
0,I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?,"[I, was, just, reading, about, the, Princeton, Mic-Check, and, it's, getting, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interesting, coming, up, ?]","[I, be, just, read, about, the, Princeton, Mic-Check, and, it's, get, [, national, press, ], (, http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html, ), ., I, want, to, get, a, sense, of, what, people, felt, like, around, campus, ., Anything, interesting, happen, ?, Anything, interest, come, up, ?]","[i, be, just, read, about, the, princeton, and, it's, get, national, press, i, want, to, get, a, sense, of, what, people, felt, like, around, campus, anything, interesting, happen, anything, interest, come, up]"
1,"I have added support for Cornell to courseoff.com (https://cornell.courseoff.com). Courseoff is a free web app to help you plan your semester schedules. It is very popular with students at some of the other schools I support.\n\nNo signup is required to use it so feel free to try it out! You can create an account which allows multiple schedules, saving schedules, and sharing schedules.\n\nLet me know what you guys think! Any feedback is always appreciated. If you like it, tell your friends :) If you find a problem, let me know as well.","[I, have, added, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, is, a, free, web, app, to, help, you, plan, your, semester, schedules, ., It, is, very, popular, with, students, at, some, of, the, other, schools, I, support, ., No, signup, is, required, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allows, multiple, schedules, ,, saving, schedules, ,, and, sharing, schedules, ., Let, me, know, what, you, guys, think, !, Any, feedback, is, always, appreciated, ., If, you, like, it, ,, tell, your, friends, :), If, you, find, a, ...]","[I, have, add, support, for, Cornell, to, courseoff.com, (, https://cornell.courseoff.com, ), ., Courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, ., It, be, very, popular, with, student, at, some, of, the, other, school, I, support, ., No, signup, be, require, to, use, it, so, feel, free, to, try, it, out, !, You, can, create, an, account, which, allow, multiple, schedule, ,, save, schedule, ,, and, share, schedule, ., Let, me, know, what, you, guy, think, !, Any, feedback, be, always, appreciate, ., If, you, like, it, ,, tell, your, friend, :), If, you, find, a, ...]","[i, have, add, support, for, cornell, to, courseoff, be, a, free, web, app, to, help, you, plan, your, semester, schedule, it, be, very, popular, with, student, at, some, of, the, other, school, i, support, no, signup, be, require, to, use, it, so, feel, free, to, try, it, out, you, can, create, an, account, which, allow, multiple, schedule, save, schedule, and, share, schedule, let, me, know, what, you, guy, think, any, feedback, be, always, appreciate, if, you, like, it, tell, your, friend, if, you, find, a, problem, let, me, know, as, well]"
2,"i don't have a facebook, so we'd need a volunteer.. just someone to let cornell on facebook know that we have a presence on reddit.. perhaps a small explanation of what reddit is? now that we are almost beautiful and such.. we need more redditors!","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, is, ?, now, that, we, are, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, ,, so, we'd, need, a, volunteer, .., just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, .., perhaps, a, small, explanation, of, what, reddit, be, ?, now, that, we, be, almost, beautiful, and, such, .., we, need, more, redditors, !]","[i, don't, have, a, facebook, so, we'd, need, a, volunteer, just, someone, to, let, cornell, on, facebook, know, that, we, have, a, presence, on, reddit, perhaps, a, small, explanation, of, what, reddit, be, now, that, we, be, almost, beautiful, and, such, we, need, more, redditors]"
3,"so, i'm starting to mess with some of the css on our lovely subreddit.. anyone have any fun suggestions about our little envelope? or up/downvote things? GO NUTS.","[so, ,, i'm, starting, to, mess, with, some, of, the, css, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestions, about, our, little, envelope, ?, or, up, /, downvote, things, ?, GO, NUTS, .]","[so, ,, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, .., anyone, have, any, fun, suggestion, about, our, little, envelope, ?, or, up, /, downvote, thing, ?, GO, NUTS, .]","[so, i'm, start, to, mess, with, some, of, the, cs, on, our, lovely, subreddit, anyone, have, any, fun, suggestion, about, our, little, envelope, or, up, downvote, thing, go, nuts]"
4,"Ever since SOPA put fear into the hearts of everyone that loves the internet, it looks like [The DarkNet Plan](http://www.reddit.com/r/darknetplan) has grown by the thousands and even got [national media attention](http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/).\n\nWhat is the feasibility of doing that for our big red campus?\n\nRelevant: I miss DC++","[Ever, since, SOPA, put, fear, into, the, hearts, of, everyone, that, loves, the, internet, ,, it, looks, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), has, grown, by, the, thousands, and, even, got, [, national, media, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, is, the, feasibility, of, doing, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[Ever, since, SOPA, put, fear, into, the, heart, of, everyone, that, love, the, internet, ,, it, look, like, [, The, DarkNet, Plan, ], (, http://www.reddit.com/r/darknetplan, ), have, grow, by, the, thousand, and, even, get, [, national, medium, attention, ], (, http://www.forbes.com/sites/andygreenberg/2011/11/23/wary-of-sopa-reddit-users-aim-to-build-a-new-censorship-free-internet/, ), ., What, be, the, feasibility, of, do, that, for, our, big, red, campus, ?, Relevant, :, I, miss, DC, +, +]","[ever, since, sopa, put, fear, into, the, heart, of, everyone, that, love, the, internet, it, look, like, the, darknet, plan, have, grow, by, the, thousand, and, even, get, national, medium, attention, what, be, the, feasibility, of, do, that, for, our, big, red, campus, relevant, i, miss, dc]"
...,...,...,...,...
72981,"If your ECs are good, maybe.","[If, your, ECs, are, good, ,, maybe, .]","[If, your, ECs, be, good, ,, maybe, .]","[if, your, ec, be, good, maybe]"
72983,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼ ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ**,"[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩r̸̞̺̪̗̮̭͉ͅį͏̙͇͚͉̜͙͈̬o̤̭̜̝̜̤̟̬r҉͈̱̼, ̨͓̻̜r̠͖̲̮e͇͚̥͙̜̗̕d͔̻͍͟͟ͅ, *, *]","[harvard, be, the]"
72984,Agreed,[Agreed],[Agreed],[agree]
72985,Why did this make me laugh so hard ahaahha,"[Why, did, this, make, me, laugh, so, hard, ahaahha]","[Why, do, this, make, me, laugh, so, hard, ahaahha]","[why, do, this, make, me, laugh, so, hard, ahaahha]"


In [133]:
pd.set_option('display.max_colwidth', None)
df_text.sample(50)

Unnamed: 0,text,tokens,lemmas,final
52133,"Hey, AAP rarely gets to have any fun ^^So ^^pls ^^buy ^^merch","[Hey, ,, AAP, rarely, gets, to, have, any, fun, ^, ^, So, ^, ^, pls, ^, ^, buy, ^, ^, merch]","[Hey, ,, AAP, rarely, get, to, have, any, fun, ^, ^, So, ^, ^, pls, ^, ^, buy, ^, ^, merch]","[hey, aap, rarely, get, to, have, any, fun, so, pls, buy, merch]"
64038,You're certainly not gonna be able to hang one from the ceiling if that's what you're getting at. As far as the ones that are on like a stand (those are a thing right?) You could probably have it but the people in the rooms next to you might complaining about all the banging and noise,"[You're, certainly, not, gonna, be, able, to, hang, one, from, the, ceiling, if, that's, what, you're, getting, at, ., As, far, as, the, ones, that, are, on, like, a, stand, (, those, are, a, thing, right, ?, ), You, could, probably, have, it, but, the, people, in, the, rooms, next, to, you, might, complaining, about, all, the, banging, and, noise]","[You're, certainly, not, gonna, be, able, to, hang, one, from, the, ceiling, if, that's, what, you're, get, at, ., As, far, a, the, one, that, be, on, like, a, stand, (, those, be, a, thing, right, ?, ), You, could, probably, have, it, but, the, people, in, the, room, next, to, you, might, complain, about, all, the, banging, and, noise]","[you're, certainly, not, gonna, be, able, to, hang, one, from, the, ceiling, if, that's, what, you're, get, at, as, far, a, the, one, that, be, on, like, a, stand, those, be, a, thing, right, you, could, probably, have, it, but, the, people, in, the, room, next, to, you, might, complain, about, all, the, banging, and, noise]"
15985,"Searched long and hard for the Loudon book on DC++ last year; couldn't ever find it. Managed to not buy the textbook the whole year by (1) buying an older version for a few bucks (2) using the reserves at the library and (3) taking photographs of the problem sets off of my friends' books.\n\nI remember seeing a lot of other orgo review materials on DC++ though, just no Loudon :(","[Searched, long, and, hard, for, the, Loudon, book, on, DC, +, +, last, year, ;, couldn't, ever, find, it, ., Managed, to, not, buy, the, textbook, the, whole, year, by, (, 1, ), buying, an, older, version, for, a, few, bucks, (, 2, ), using, the, reserves, at, the, library, and, (, 3, ), taking, photographs, of, the, problem, sets, off, of, my, friends, ', books, ., I, remember, seeing, a, lot, of, other, orgo, review, materials, on, DC, +, +, though, ,, just, no, Loudon, :(]","[Searched, long, and, hard, for, the, Loudon, book, on, DC, +, +, last, year, ;, couldn't, ever, find, it, ., Managed, to, not, buy, the, textbook, the, whole, year, by, (, 1, ), buy, an, old, version, for, a, few, buck, (, 2, ), use, the, reserve, at, the, library, and, (, 3, ), take, photograph, of, the, problem, set, off, of, my, friend, ', book, ., I, remember, see, a, lot, of, other, orgo, review, material, on, DC, +, +, though, ,, just, no, Loudon, :(]","[search, long, and, hard, for, the, loudon, book, on, dc, last, year, couldn't, ever, find, it, manage, to, not, buy, the, textbook, the, whole, year, by, buy, an, old, version, for, a, few, buck, use, the, reserve, at, the, library, and, take, photograph, of, the, problem, set, off, of, my, friend, book, i, remember, see, a, lot, of, other, orgo, review, material, on, dc, though, just, no, loudon]"
39915,There's an entrance near cook house on gothics way where you can drive in. Usually you need the code but I'm sure you won't need one from now until classes start. \n\nYou can also park your car between rose and cook if you can't find a parking spot right next to the dorms. There's also parking on West Ave. and you can also park on Libe if there's little room elsewhere.,"[There's, an, entrance, near, cook, house, on, gothics, way, where, you, can, drive, in, ., Usually, you, need, the, code, but, I'm, sure, you, won't, need, one, from, now, until, classes, start, ., You, can, also, park, your, car, between, rose, and, cook, if, you, can't, find, a, parking, spot, right, next, to, the, dorms, ., There's, also, parking, on, West, Ave, ., and, you, can, also, park, on, Libe, if, there's, little, room, elsewhere, .]","[There's, an, entrance, near, cook, house, on, gothic, way, where, you, can, drive, in, ., Usually, you, need, the, code, but, I'm, sure, you, won't, need, one, from, now, until, class, start, ., You, can, also, park, your, car, between, rise, and, cook, if, you, can't, find, a, parking, spot, right, next, to, the, dorm, ., There's, also, park, on, West, Ave, ., and, you, can, also, park, on, Libe, if, there's, little, room, elsewhere, .]","[there's, an, entrance, near, cook, house, on, gothic, way, where, you, can, drive, in, usually, you, need, the, code, but, i'm, sure, you, won't, need, one, from, now, until, class, start, you, can, also, park, your, car, between, rise, and, cook, if, you, can't, find, a, parking, spot, right, next, to, the, dorm, there's, also, park, on, west, ave, and, you, can, also, park, on, libe, if, there's, little, room, elsewhere]"
55165,"Holy shit does Davis teach everything? I had him for waves and again for quantum (3316, didn't take 3317 though). Now it sounds like he teaches 3323 as well!\n\nThat man is amazing. I feel like I really understood physics at a deep level when I had him for waves. Matthias ""Bullet Train"" Liepe was great whenever he substituted for Davis in waves, too. He just went really fast through the material, unfortunately. Not so great for freshman me but the whole course was just so mind-blowing. It's a pity that not all of the profs in the physics department are like that (looking at you, the profs who rotate on 2214).\n\nI also like Davis' stories/history anecdotes in his physics classes. Gives you a greater appreciation and broader context for the study of the field, even if you don't end up going into physics (I certainly didn't).","[Holy, shit, does, Davis, teach, everything, ?, I, had, him, for, waves, and, again, for, quantum, (, 3316, ,, didn't, take, 3317, though, ), ., Now, it, sounds, like, he, teaches, 3323, as, well, !, That, man, is, amazing, ., I, feel, like, I, really, understood, physics, at, a, deep, level, when, I, had, him, for, waves, ., Matthias, "", Bullet, Train, "", Liepe, was, great, whenever, he, substituted, for, Davis, in, waves, ,, too, ., He, just, went, really, fast, through, the, material, ,, unfortunately, ., Not, so, great, for, freshman, me, but, the, whole, course, was, just, so, ...]","[Holy, shit, do, Davis, teach, everything, ?, I, have, him, for, wave, and, again, for, quantum, (, 3316, ,, didn't, take, 3317, though, ), ., Now, it, sound, like, he, teach, 3323, a, well, !, That, man, be, amazing, ., I, feel, like, I, really, understood, physic, at, a, deep, level, when, I, have, him, for, wave, ., Matthias, "", Bullet, Train, "", Liepe, be, great, whenever, he, substitute, for, Davis, in, wave, ,, too, ., He, just, go, really, fast, through, the, material, ,, unfortunately, ., Not, so, great, for, freshman, me, but, the, whole, course, be, just, so, ...]","[holy, shit, do, davis, teach, everything, i, have, him, for, wave, and, again, for, quantum, didn't, take, though, now, it, sound, like, he, teach, as, well, that, man, be, amaze, i, feel, like, i, really, understood, physic, at, a, deep, level, when, i, have, him, for, wave, matthias, bullet, train, liepe, be, great, whenever, he, substitute, for, davis, in, wave, too, he, just, go, really, fast, through, the, material, unfortunately, not, so, great, for, freshman, me, but, the, whole, course, be, just, so, it's, a, pity, that, not, all, of, the, prof, in, the, physic, department, be, like, that, look, ...]"
36877,"Just an opinion, if you're not superb at Spanish and you get placed in 1230 or lower, its not worth going through the Spanish track. Its a lot of homework and double the usual number of exams.","[Just, an, opinion, ,, if, you're, not, superb, at, Spanish, and, you, get, placed, in, 1230, or, lower, ,, its, not, worth, going, through, the, Spanish, track, ., Its, a, lot, of, homework, and, double, the, usual, number, of, exams, .]","[Just, an, opinion, ,, if, you're, not, superb, at, Spanish, and, you, get, place, in, 1230, or, low, ,, it, not, worth, go, through, the, Spanish, track, ., Its, a, lot, of, homework, and, double, the, usual, number, of, exam, .]","[just, an, opinion, if, you're, not, superb, at, spanish, and, you, get, place, in, or, lower, it, not, worth, go, through, the, spanish, track, it, a, lot, of, homework, and, double, the, usual, number, of, exam]"
55994,See Rule 6,"[See, Rule, 6]","[See, Rule, 6]","[see, rule]"
67767,Carpenter,[Carpenter],[Carpenter],[carpenter]
8171,"Just ended my first year here, and I am worried about my grades. I am an engineering CS student who took 1110 first semester and got a B- and ended 1910 with a B. This semester I took 2110, got B- again and in 2800, I am about to get a C+, if lucky a B-. I did get a B+ in 1920, and because I did well in other classes, my GPA isn't too terrible, but the low grades in these CS classes are worrying.\n\nI am looking for advice as I am searching for opportunities to get involved with CS in the campus to build my resume, but my grades are too low to join any project team or clubs.","[Just, ended, my, first, year, here, ,, and, I, am, worried, about, my, grades, ., I, am, an, engineering, CS, student, who, took, 1110, first, semester, and, got, a, B, -, and, ended, 1910, with, a, B, ., This, semester, I, took, 2110, ,, got, B, -, again, and, in, 2800, ,, I, am, about, to, get, a, C, +, ,, if, lucky, a, B, -, ., I, did, get, a, B, +, in, 1920, ,, and, because, I, did, well, in, other, classes, ,, my, GPA, isn't, too, terrible, ,, but, the, low, grades, in, these, CS, classes, are, ...]","[Just, end, my, first, year, here, ,, and, I, be, worried, about, my, grade, ., I, be, an, engineering, CS, student, who, take, 1110, first, semester, and, get, a, B, -, and, end, 1910, with, a, B, ., This, semester, I, take, 2110, ,, get, B, -, again, and, in, 2800, ,, I, be, about, to, get, a, C, +, ,, if, lucky, a, B, -, ., I, do, get, a, B, +, in, 1920, ,, and, because, I, do, well, in, other, class, ,, my, GPA, isn't, too, terrible, ,, but, the, low, grade, in, these, CS, class, be, ...]","[just, end, my, first, year, here, and, i, be, worried, about, my, grade, i, be, an, engineering, c, student, who, take, first, semester, and, get, a, b, and, end, with, a, b, this, semester, i, take, get, b, again, and, in, i, be, about, to, get, a, c, if, lucky, a, b, i, do, get, a, b, in, and, because, i, do, well, in, other, class, my, gpa, isn't, too, terrible, but, the, low, grade, in, these, c, class, be, worry, i, be, look, for, advice, a, i, be, search, for, opportunity, to, get, involve, with, c, in, the, campus, ...]"
71018,"I mean nearly all of these points are critiques the application process itself instead of the actual project team experience once you get in. Considering that the author claims that (s)he was on a team for 3 years, it's a rather telling omission.\n\n&gt; And, in fact, this is the image project teams project, not just to the naive freshmen they prey on but also to the naive high schoolers applying to Cornell and the naive alumni donating to Cornell. Project teams are the darling of the College of Engineering because they lend it an additional sheen of selectivity and prestige.\n\nPeople should know that project teams aren't something unique to Cornell. Most engineering programs in the US have them, and it is to be expected for Cornell to also have them if it wants to be considered a top engineering program.\n\n&gt; The premise of the interviews itself is a joke. What qualifies one student to interview another? Is it the extra two semesters of classes that does it? It is only with a huge helping of arrogance that it is possible to convince oneself of one’s qualification to judge a peer.\n\nWhat do you think will happen once you enter industry? You will get interviewed and evaluated for promotions/raises by more senior engineers. And part of being an upperclassman in a team will be learning how to assess applicants and leadership/management skills. \n\nConsidering that a project team aims to be a microcosm of a professional engineering organization, I only see this as a good thing. Underclassmen get practice on how to be interviewed, and upperclassmen get practice on how to interview.\n\n&gt; How can we seriously cast project teams as some golden, educational force when they admit only a vanishingly small number of those interested?\n\nThe irony of saying this, while also being a student at one of the most selective universities in the country, must be lost on the author.\n\n&gt; The glorification of stress culture has a home like no other in the project team. Working yourself to the bone, fueled by Mattin’s and Mountain Dew, is the norm. A badge of pride is having spent the night in the lab — and leaving early is a mark of weakness. Caught up in this competitive machismo, real intellectualism is lost and intellectual-signaling is what’s left.\n\nNone of this is unique to project teams, or even to the college of engineering as a whole. It's a cultural problem across the whole of Cornell, and pinning it on project teams is unreasonable.\n\nAnd as long as we're only going by anecdotal evidence: As an alum who was on a team from sophmore year to graduation, I can count on one hand the amount of times I've had to work past midnight, and none of those times were project-team related.\n","[I, mean, nearly, all, of, these, points, are, critiques, the, application, process, itself, instead, of, the, actual, project, team, experience, once, you, get, in, ., Considering, that, the, author, claims, that, (, s, ), he, was, on, a, team, for, 3, years, ,, it's, a, rather, telling, omission, ., >, And, ,, in, fact, ,, this, is, the, image, project, teams, project, ,, not, just, to, the, naive, freshmen, they, prey, on, but, also, to, the, naive, high, schoolers, applying, to, Cornell, and, the, naive, alumni, donating, to, Cornell, ., Project, teams, are, the, darling, of, the, College, of, Engineering, ...]","[I, mean, nearly, all, of, these, point, be, critique, the, application, process, itself, instead, of, the, actual, project, team, experience, once, you, get, in, ., Considering, that, the, author, claim, that, (, s, ), he, be, on, a, team, for, 3, year, ,, it's, a, rather, telling, omission, ., >, And, ,, in, fact, ,, this, be, the, image, project, team, project, ,, not, just, to, the, naive, freshman, they, prey, on, but, also, to, the, naive, high, schoolers, apply, to, Cornell, and, the, naive, alumnus, donate, to, Cornell, ., Project, team, be, the, darling, of, the, College, of, Engineering, ...]","[i, mean, nearly, all, of, these, point, be, critique, the, application, process, itself, instead, of, the, actual, project, team, experience, once, you, get, in, consider, that, the, author, claim, that, s, he, be, on, a, team, for, year, it's, a, rather, telling, omission, and, in, fact, this, be, the, image, project, team, project, not, just, to, the, naive, freshman, they, prey, on, but, also, to, the, naive, high, schoolers, apply, to, cornell, and, the, naive, alumnus, donate, to, cornell, project, team, be, the, darling, of, the, college, of, engineering, because, they, lend, it, an, additional, sheen, of, selectivity, and, prestige, ...]"
