## Package Imports

In [25]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk
import random

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk import pos_tag
from nltk.corpus import wordnet

# imports specific to lexical cleaning
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter

# imports specific to syntactic cleaning
from nltk import sent_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Format Data

In [26]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [27]:
def corpus_to_df(corpus):
    '''Function to convert the convokit corpus to a pandas dataframe structure.'''

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## DF Level Cleaning

In [83]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)
URL_RE = re.compile(r"(https?://[^\s\)\]\}]+|www\.[^\s\)\]\}]+)")
HAS_LETTER_RE = re.compile(r"[A-Za-z]")

EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251" 
    "]+",
    flags=re.UNICODE,
)

## Lexical Cleaning

In [84]:
def clean_text(text):
    '''Helper function to clean text by removing urls and other undesirable features.'''

    # remove urls
    text = URL_RE.sub("", text)

    # remove emojis
    text = EMOJI_RE.sub("", text)

    return text

In [30]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [31]:
def get_wordnet_pos(treebank_tag):
    '''Helper function to map treebank-based POS tags to wordnet POS tags.'''
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    
    # otherwise default to noun
    else:
        return wordnet.NOUN

In [32]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    tagged = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in tagged]

    return lemmatized_tokens

In [33]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation and numbers
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''
    
    # clean text
    text = clean_text(text)

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue

        # only keep alphabetic tokens, including contractions
        if re.fullmatch(r"[A-Za-z]+(?:['‚Äô][A-Za-z]+)*", tok):
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Syntactic Cleaning Functions

In [34]:
def split_sentences(text):
    '''Helper function to split a given utterance into separate sentences'''

    sentence_tokens = sent_tokenize(text)

    return sentence_tokens

In [72]:
def strip_markdown_emphasis(text):
    '''Helper function that remove markdown-style emphasis: *word*, **word**, 
    _word_, __word__'''

    # replace *word* or **word** with word
    text = re.sub(r"(\*{1,2}|_{1,2})(\S.*?\S)\1", r"\2", text)
    
    return text

In [35]:
# define relevant sets of tags and words
FINITE_VERB_TAGS = {"VB", "VBD", "VBN", "VBP", "VBZ"}
SUBJECT_TAGS = {"NN", "NNS", "NNP", "NNPS", "PRP"}
SUBORDINATING_CONJ = {"IN"} # tag for subordinating conjunction
COORDINATING_CONJ = {"CC"} # tag for coordinating conjunction

PUNCT = '?!.({[]})-‚Äì‚Äî"\''
CLOSING_PUNCT = '.!?‚Ä¶'
TRAILING_CLOSERS = set(['"', "'", ')', ']', '}', '‚Äù', '‚Äô'])

# normalize curly quotes and fancy punctuation
FANCY_TO_ASCII = {
                '‚Äú': '"', '‚Äù': '"',
                '‚Äò': "'", '‚Äô': "'",
                '‚Äî': '-', '‚Äì': '-',
                '‚Ä¶': '...'
                }

In [36]:
def is_complete_sentence(sentence):
    '''Helper function to determine whether a sentence is complete. Recall that a complete sentence follows these rules:
    -contains at least one subject 
    -contains at least one finite verb
    -ends with appropriate punctuation (.?!) 
    -if it begins with a subordinator, has an independent clause after
    -does not end with a conjunction
    '''

    cleaned = sentence.strip() # removing trailing/leading whitespace
    # account for differences in straight vs. smart quotes
    for f, a in FANCY_TO_ASCII.items():
        cleaned = cleaned.replace(f, a)
    # remove leading/trailing quotes
    cleaned = cleaned.strip('\"')
    cleaned = cleaned.strip('\'')

    # empty string
    if not cleaned:
        return False
    
    # tokenize sentence and tag tokens
    tokens = tokenize(cleaned)
    tags = pos_tag(tokens)

    # ensure length is appropriate
    if len(tokens) < 2:
        return False

    # first letter should be capital
    j = 0
    while j < len(cleaned) and cleaned[j] in PUNCT:
        j += 1
    if j >= len(cleaned):
        return False
    if not cleaned[j].isalpha() or not cleaned[j].isupper():
        return False
        
    # last relevant char must end with proper punctuation
    i = len(cleaned) - 1
    while i > 0 and cleaned[i] in TRAILING_CLOSERS:
        i -= 1
    if i <= 0 or cleaned[i] not in CLOSING_PUNCT:
        return False
    
    # find the first words tag
    first_word = None
    first_tag = None
    for word, tag in tags:
        if word.isalpha():
            first_word = word
            first_tag = tag
            break
    # if first word is subordinating conjunction (including "when"), need independent clause after
    if first_tag in SUBORDINATING_CONJ or first_word == "When":
        if ',' in tokens: # indepdent clause will start after a comma
            comma_index = tokens.index(',')
            post_sub_tags = tags[comma_index+1:]
            # check if independent clause is a complete thought
            has_finite_verb_post_sub = any(tag in FINITE_VERB_TAGS for _, tag in post_sub_tags)
            has_subject_post_sub = any(tag in SUBJECT_TAGS for _, tag in tags)
            if not (has_finite_verb_post_sub and has_subject_post_sub):
                return False
        # if no comma separating clauses
        else:
            noun_count = sum(1 for _, tag in tags if tag in SUBJECT_TAGS)
            verb_count = sum(1 for _, tag in tags if tag in FINITE_VERB_TAGS)
            # edge case for when first word is if
            if first_word == "If" and verb_count < 2:
                return False
            # check for two nouns, if not assume fragment
            if noun_count < 2:
                return False

    # find the last words tag
    last_tag = None
    for word, tag in reversed(tags):
        if word.isalpha():
            last_tag = tag
            break
    # last word cannot be conjunction
    if last_tag in COORDINATING_CONJ:
        return False

    # check if it has finite verb and subject
    has_finite_verb = any(tag in FINITE_VERB_TAGS for _, tag in tags)
    has_subject = any(tag in SUBJECT_TAGS for _, tag in tags)

    return has_finite_verb and has_subject

In [88]:
def clean_tokens_syntactic(text):

    # replace URLs with "URL" in sentences
    text = URL_RE.sub("URL", text)

    # strip markdown emphasis
    text = strip_markdown_emphasis(text)

    # remove emojis
    text = EMOJI_RE.sub("", text)
    
    # sentence tokenize text
    sentences = split_sentences(text)

    # remove sentences that do not contain a single letter
    sentences = [s for s in sentences if re.search(HAS_LETTER_RE, s)]

    return sentences

## Preprocess Functions

In [55]:
def filter_df(df):
    '''Helper function that pre-processes a dataframe containing textual social 
    media data by removing deleted/removed utterances, bot utterances, and 
    utterances not containing letters.'''

    # remove deleted/removed utterances
    df = df[~df["text"].str.lower().isin({"[deleted]", "[removed]"})]

    # remove bot authored utterances
    df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]

    # remove utterances without a letter
    df = df[df["text"].str.contains(HAS_LETTER_RE, regex=True)]

    return df


In [38]:
def lexical_preprocessing_df(df):
    '''Function that pre-processes the data in a given dataframe by removing
    deleted/removed utterances, bot utterances, and utterances not containing letters.
    Then, the remaining textual data is tokenized, lemmatized, and cleaned for lexical 
    analysis.'''
    
    # filter utterances
    df = filter_df(df)

    # final tokenized, lemmatized, and cleaned set
    df["final"] = df["text"].apply(clean_tokens_lexical)

    return df

In [61]:
def syntactic_preprocessing_df(df):
    '''Function that pre-processes the data in a given dataframe by removing
    deleted/removed utterances, bot utterances, and utterances not containing letters.
    Then, the remaining textual data is tokenized and cleaned for syntactic analysis.'''

    # filter utterances
    df = filter_df(df)

    # final tokenized and cleaned set
    df["final"] = df["text"].apply(clean_tokens_syntactic)

    return df

## Lexical Testing

In [40]:
pd.set_option('display.max_rows', 100)

In [41]:
df = corpus_to_df(corpus)
df_lexical = lexical_preprocessing_df(df)

df_lexical

  df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]


Unnamed: 0,utterance_id,speaker_id,text,timestamp,final
0,nyx4d,reddmau5,I was just reading about the Princeton Mic-Che...,2012-01-01 16:18:18,"[i, be, just, read, about, the, princeton, and..."
1,o0145,shtylman,I have added support for Cornell to courseoff....,2012-01-02 13:57:15,"[i, have, add, support, for, cornell, to, cour..."
2,o1gca,moon_river,"i don't have a facebook, so we'd need a volunt...",2012-01-03 14:55:06,"[i, don't, have, a, facebook, so, we'd, need, ..."
3,o0ss4,moon_river,"so, i'm starting to mess with some of the css ...",2012-01-03 01:16:17,"[so, i'm, start, to, mess, with, some, of, the..."
4,o4ipd,reddmau5,Ever since SOPA put fear into the hearts of ev...,2012-01-05 17:08:06,"[ever, since, sopa, put, fear, into, the, hear..."
...,...,...,...,...,...
72981,e8tj9ig,JCsurfing,"If your ECs are good, maybe.",2018-10-31 19:39:29,"[if, your, ec, be, good, maybe]"
72983,e8tjyg1,ultimatefishlover,harvard is the **iÕüÃºÕïÃªÕìÃ©ÃònÕôÃûÃôfÕûÃ∂Ã¢ÃôÃªÃ∫ÕçÃüeÕÄÃ∂ÕöÕôÃ≥Ã©...,2018-10-31 19:50:51,"[harvard, be, the]"
72984,e8tkb66,KickAssEmployee,Agreed,2018-10-31 19:56:45,[agree]
72985,e8tkctl,dasfsadf123,Why did this make me laugh so hard ahaahha,2018-10-31 19:57:30,"[why, do, this, make, me, laugh, so, hard, aha..."


In [42]:
df_text = df_lexical[["text", "final"]]
df_text

Unnamed: 0,text,final
0,I was just reading about the Princeton Mic-Che...,"[i, be, just, read, about, the, princeton, and..."
1,I have added support for Cornell to courseoff....,"[i, have, add, support, for, cornell, to, cour..."
2,"i don't have a facebook, so we'd need a volunt...","[i, don't, have, a, facebook, so, we'd, need, ..."
3,"so, i'm starting to mess with some of the css ...","[so, i'm, start, to, mess, with, some, of, the..."
4,Ever since SOPA put fear into the hearts of ev...,"[ever, since, sopa, put, fear, into, the, hear..."
...,...,...
72981,"If your ECs are good, maybe.","[if, your, ec, be, good, maybe]"
72983,harvard is the **iÕüÃºÕïÃªÕìÃ©ÃònÕôÃûÃôfÕûÃ∂Ã¢ÃôÃªÃ∫ÕçÃüeÕÄÃ∂ÕöÕôÃ≥Ã©...,"[harvard, be, the]"
72984,Agreed,[agree]
72985,Why did this make me laugh so hard ahaahha,"[why, do, this, make, me, laugh, so, hard, aha..."


In [43]:
pd.set_option('display.max_colwidth', None)
df_text.sample(50)

Unnamed: 0,text,final
15663,"It is going to sound really cliche, but choose one that you will actually be interested in. One of mine ended up being almost painful to go to and since much of the grade was attendance/participation based I had to go.\n\nJust make sure it fits into you schedule. (Even though they are really easy to add/drop)","[it, be, go, to, sound, really, cliche, but, choose, one, that, you, will, actually, be, interested, in, one, of, mine, end, up, be, almost, painful, to, go, to, and, since, much, of, the, grade, be, attendance, participation, base, i, have, to, go, just, make, sure, it, fit, into, you, schedule, even, though, they, be, really, easy, to, add, drop]"
14794,"Mt Crested Butte, CO, skiing with my friend for the week!","[mt, crest, butte, co, ski, with, my, friend, for, the, week]"
1775,"My friend and I are looking to cross this of the 161; we went to the Uris stacks but there's no secluded spot where you're not very exposed. Is there a spot people recommend on campus, maybe somewhere in Olin?","[my, friend, and, i, be, look, to, cross, this, of, the, we, go, to, the, uris, stack, but, there's, no, secluded, spot, where, you're, not, very, expose, be, there, a, spot, people, recommend, on, campus, maybe, somewhere, in, olin]"
43543,Sorry I don‚Äôt know... maybe you could have a try during pre enroll to see if there‚Äôs any limitation,"[sorry, i, don, t, know, maybe, you, could, have, a, try, during, pre, enroll, to, see, if, there, s, any, limitation]"
19276,"President of CPS (Cornell Photo Society) here. first of all sign up for our [email list here!](https://orgsync.com/join/73399/photo-society-cornell)\n\n\nIn regards to equipment rental; when /u/jmabeshaus was EManager (which was probably a while ago) the EQ must have been relatively new then. We dont exactly have cameras to give you(funding hiccup), Canon lenses;yes. If you really need a camera tho, PM me and i could lend you mine.","[president, of, cps, cornell, photo, society, here, first, of, all, sign, up, for, our, email, list, here, in, regard, to, equipment, rental, when, u, jmabeshaus, be, emanager, which, be, probably, a, while, ago, the, eq, must, have, be, relatively, new, then, we, dont, exactly, have, camera, to, give, you, fund, hiccup, canon, lenses, yes, if, you, really, need, a, camera, tho, pm, me, and, i, could, lend, you, mine]"
9150,"I'm an engineer enrolled in NS 1150 right now and have a prelim coming up. I have more important classes to study for and I don't want to let NS1150 affect my grade. However I need to stay above the 12 credit mark, and NS 1150 keeps me at 14.\n\nIs it too late to change it to S/U? Also if I do change it to S/U, would it look bad on my transcript?","[i'm, an, engineer, enrol, in, ns, right, now, and, have, a, prelim, come, up, i, have, more, important, class, to, study, for, and, i, don't, want, to, let, affect, my, grade, however, i, need, to, stay, above, the, credit, mark, and, ns, keep, me, at, be, it, too, late, to, change, it, to, s, u, also, if, i, do, change, it, to, s, u, would, it, look, bad, on, my, transcript]"
23828,"Ooh, what encouraging responses, lol. Well, I found a bus to where I needed to go. Thanks guys.","[ooh, what, encouraging, response, lol, well, i, find, a, bus, to, where, i, need, to, go, thanks, guy]"
70849,"Everything requires 2110 as a prereq, except 2800. Take it anyways if you have a strong grasp of python. If you want to get up to speed in the field it's expected you'll have to do some things like this and read ahead since you're 2 years behind. Try and find the classes that you can do in python or whatever languages you know or are willing to learn.","[everything, require, a, a, prereq, except, take, it, anyways, if, you, have, a, strong, grasp, of, python, if, you, want, to, get, up, to, speed, in, the, field, it's, expect, you'll, have, to, do, some, thing, like, this, and, read, ahead, since, you're, year, behind, try, and, find, the, class, that, you, can, do, in, python, or, whatever, languages, you, know, or, be, willing, to, learn]"
10602,"I began today at 7 and finished at 9:45, this ""JTF"" thing doesn't sound too bad...","[i, begin, today, at, and, finish, at, this, jtf, thing, doesn't, sound, too, bad]"
30737,"Okay, but what about systemic issues that guarantee that minorities generally have worse stats? Are minorities just supposed to never be able to attend elite institutions? Some policy has to be taken somehow. Affirmative action has flaws, but it's better than nothing. Unless you have a better idea, no sense in complaining.","[okay, but, what, about, systemic, issue, that, guarantee, that, minority, generally, have, bad, stats, be, minority, just, suppose, to, never, be, able, to, attend, elite, institution, some, policy, have, to, be, take, somehow, affirmative, action, have, flaw, but, it's, good, than, nothing, unless, you, have, a, good, idea, no, sense, in, complain]"


## Syntactic Testing

In [73]:
df = corpus_to_df(corpus)

In [89]:
df_syntactic = syntactic_preprocessing_df(df)

df_text_syn = df_syntactic[["text", "final"]]
pd.set_option('display.max_colwidth', None)
df_text_syn.sample(100)

  df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]


Unnamed: 0,text,final
16399,Obviously this guy was at Barton Hall watching Kesha.,[Obviously this guy was at Barton Hall watching Kesha.]
50446,It‚Äôs normally there during the warmer months. I saw it last semester a lot in August and September.,"[It‚Äôs normally there during the warmer months., I saw it last semester a lot in August and September.]"
35552,"http://as.cornell.edu/2015-2016-credit-and-placement-students-college-0\n\nI'm in CALS but my AP Calc score allowed me to get 4 credits in Calculus I and if I chose to, I could take Calculus II the first semester but didn't have to because my math requirement was filled. Also, the distribution requirements for CAS is different but for CALS I placed out of all but one of the liberal arts classes.","[URL\n\nI'm in CALS but my AP Calc score allowed me to get 4 credits in Calculus I and if I chose to, I could take Calculus II the first semester but didn't have to because my math requirement was filled., Also, the distribution requirements for CAS is different but for CALS I placed out of all but one of the liberal arts classes.]"
62836,Fuck. Its been a long day,"[Fuck., Its been a long day]"
9227,like there‚Äôs 39 people on this subreddit rn where y‚Äôall at,[like there‚Äôs 39 people on this subreddit rn where y‚Äôall at]
14214,"It was about $90 round trip from what I remember, so probably something like $50 one way. I believe there's a student discount.","[It was about $90 round trip from what I remember, so probably something like $50 one way., I believe there's a student discount.]"
19376,Grad students aren't happy anywhere.,[Grad students aren't happy anywhere.]
14028,"Graduated with a ChemE here and now a grad student at Cornell doing ChemE like research. ChemEs do not really do chemistry. You will have to take some chemistries courses like Orgo and physical chemistry but that is about it. Your job is to design the process that carries out the chemical reaction in bulk. It is one of the highest paying majors out there and very well respected because you know a lot of shit.\n\nDo the ChemE major and business minor is my best suggestion. You will learn basic economics in your ChemE classes that deal with design because you will have to do cost analysis of your processes. My senior design class was taught by head of the department and he taught us some econ in the class and always ripped on the business majors because that one semester taught us what they learned in several years.\n\nI have friends right now getting their MBA after graduating with a ChemE major (some did not even have a business minor). So really, get the ChemE major, maybe take some business classes, and then when you graduate a get a job, get your MBA while working. Safest, least debt, high paying route possible\n\ncheers!","[Graduated with a ChemE here and now a grad student at Cornell doing ChemE like research., ChemEs do not really do chemistry., You will have to take some chemistries courses like Orgo and physical chemistry but that is about it., Your job is to design the process that carries out the chemical reaction in bulk., It is one of the highest paying majors out there and very well respected because you know a lot of shit., Do the ChemE major and business minor is my best suggestion., You will learn basic economics in your ChemE classes that deal with design because you will have to do cost analysis of your processes., My senior design class was taught by head of the department and he taught us some econ in the class and always ripped on the business majors because that one semester taught us what they learned in several years., I have friends right now getting their MBA after graduating with a ChemE major (some did not even have a business minor)., So really, get the ChemE major, maybe take some business classes, and then when you graduate a get a job, get your MBA while working., Safest, least debt, high paying route possible\n\ncheers!]"
38092,The guy is joking. Take 2230-2240 and if you like it then take 4330 later.,"[The guy is joking., Take 2230-2240 and if you like it then take 4330 later.]"
25646,"At one time, most folks on this subreddit were HS juniors/seniors investigating Cornell and other colleges. It's a huge transition, and does tend to make people very anxious.\n\nI suppose it can be annoying to be bombarded with WAMC postings. On the other hand, those of us who went through the process already, kind of owe one to the newbies. Think of it as the first contact from people who will eventually calm down and be future community members. How would you have wanted to be treated, back when you were in that position?\n\nOf course, the questions are usually about the same, and maybe a FAQ on the right bar would help. Often, people just need to be reassured (?) that admission is mostly random, and there is only so much you can do to get in to any given school.","[At one time, most folks on this subreddit were HS juniors/seniors investigating Cornell and other colleges., It's a huge transition, and does tend to make people very anxious., I suppose it can be annoying to be bombarded with WAMC postings., On the other hand, those of us who went through the process already, kind of owe one to the newbies., Think of it as the first contact from people who will eventually calm down and be future community members., How would you have wanted to be treated, back when you were in that position?, Of course, the questions are usually about the same, and maybe a FAQ on the right bar would help., Often, people just need to be reassured (?), that admission is mostly random, and there is only so much you can do to get in to any given school.]"
