## Package Imports

In [5]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk import pos_tag
from nltk.corpus import wordnet

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  import pkg_resources


## Format Data

In [6]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [7]:
def corpus_to_df(corpus):
    '''Function to convert the convokit corpus to a pandas dataframe structure.'''

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## DF Level Cleaning

In [8]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)
URL_RE = re.compile(r"http\S+|www\.\S+")
HAS_LETTER_RE = re.compile(r"[A-Za-z]")

## Cleaning

In [9]:
def clean_text(text):
    '''Helper function to clean text by removing urls and other undesirable features.'''

    # remove urls
    text = URL_RE.sub("", text)

    return text

In [10]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [11]:
def get_wordnet_pos(treebank_tag):
    '''Helper function to map treebank-based POS tags to wordnet POS tags.'''
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    
    # otherwise default to noun
    else:
        return wordnet.NOUN

In [12]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    tagged = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in tagged]

    return lemmatized_tokens

In [13]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation and numbers
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''
    
    # clean text
    text = clean_text(text)

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue

        # only keep alphabetic tokens, including contractions
        if re.fullmatch(r"[A-Za-z]+(?:['’][A-Za-z]+)*", tok):
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Preprocess Function

In [14]:
def preprocess_df(df):
    '''Function that pre-processes the data in a given dataframe by removing
    deleted/removed utterances, bot utterances, and utterances not containing letters.
    Then, the remaining textual data is tokenized, lemmatized, and cleaned.'''
    
    # remove deleted/removed utterances
    df = df[~df["text"].str.lower().isin({"[deleted]", "[removed]"})]

    # remove bot authored utterances
    df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]

    # remove utterances without a letter
    df = df[df["text"].str.contains(HAS_LETTER_RE, regex=True)]

    # final tokenized, lemmatized, and cleaned set
    df["final"] = df["text"].apply(clean_tokens_lexical)

    return df

## Testing

In [15]:
import random

In [16]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 100)

In [17]:
pd.set_option('display.max_rows', 100)

In [18]:
df = corpus_to_df(corpus)
df = preprocess_df(df)

df

  df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]


Unnamed: 0,utterance_id,speaker_id,text,timestamp,final
0,nyx4d,reddmau5,I was just reading about the Princeton Mic-Che...,2012-01-01 16:18:18,"[i, be, just, read, about, the, princeton, and..."
1,o0145,shtylman,I have added support for Cornell to courseoff....,2012-01-02 13:57:15,"[i, have, add, support, for, cornell, to, cour..."
2,o1gca,moon_river,"i don't have a facebook, so we'd need a volunt...",2012-01-03 14:55:06,"[i, don't, have, a, facebook, so, we'd, need, ..."
3,o0ss4,moon_river,"so, i'm starting to mess with some of the css ...",2012-01-03 01:16:17,"[so, i'm, start, to, mess, with, some, of, the..."
4,o4ipd,reddmau5,Ever since SOPA put fear into the hearts of ev...,2012-01-05 17:08:06,"[ever, since, sopa, put, fear, into, the, hear..."
...,...,...,...,...,...
72981,e8tj9ig,JCsurfing,"If your ECs are good, maybe.",2018-10-31 19:39:29,"[if, your, ec, be, good, maybe]"
72983,e8tjyg1,ultimatefishlover,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩...,2018-10-31 19:50:51,"[harvard, be, the]"
72984,e8tkb66,KickAssEmployee,Agreed,2018-10-31 19:56:45,[agree]
72985,e8tkctl,dasfsadf123,Why did this make me laugh so hard ahaahha,2018-10-31 19:57:30,"[why, do, this, make, me, laugh, so, hard, aha..."


In [20]:
df_text = df[["text", "final"]]
df_text

Unnamed: 0,text,final
0,I was just reading about the Princeton Mic-Che...,"[i, be, just, read, about, the, princeton, and..."
1,I have added support for Cornell to courseoff....,"[i, have, add, support, for, cornell, to, cour..."
2,"i don't have a facebook, so we'd need a volunt...","[i, don't, have, a, facebook, so, we'd, need, ..."
3,"so, i'm starting to mess with some of the css ...","[so, i'm, start, to, mess, with, some, of, the..."
4,Ever since SOPA put fear into the hearts of ev...,"[ever, since, sopa, put, fear, into, the, hear..."
...,...,...
72981,"If your ECs are good, maybe.","[if, your, ec, be, good, maybe]"
72983,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩...,"[harvard, be, the]"
72984,Agreed,[agree]
72985,Why did this make me laugh so hard ahaahha,"[why, do, this, make, me, laugh, so, hard, aha..."


In [21]:
pd.set_option('display.max_colwidth', None)
df_text.sample(50)

Unnamed: 0,text,final
40076,"During o-week, parties are pretty open so you can find a group of freshman and walk around college town and some parties will just let you guys in. It's not until a little later that parties will become more exclusive.","[during, party, be, pretty, open, so, you, can, find, a, group, of, freshman, and, walk, around, college, town, and, some, party, will, just, let, you, guy, in, it's, not, until, a, little, later, that, party, will, become, more, exclusive]"
25333,"Really? Thanks for the advice! \n\nI thought Collegetown would be a great choice because I kinda like the youthful environment, that and the ""cool places to eat and have drinks"" thing I mentioned before\n\nOn the other hand, I do need to study and all of that noise would be nasty.\n\nWhere would you recommend me to live? Maybe slightly west of Collegetown, perhaps? Or fall creek, I don't know.","[really, thanks, for, the, advice, i, think, collegetown, would, be, a, great, choice, because, i, kinda, like, the, youthful, environment, that, and, the, cool, place, to, eat, and, have, drink, thing, i, mention, before, on, the, other, hand, i, do, need, to, study, and, all, of, that, noise, would, be, nasty, where, would, you, recommend, me, to, live, maybe, slightly, west, of, collegetown, perhaps, or, fall, creek, i, don't, know]"
28735,Ithaca has become the new Detroit with all the stabbings this year.,"[ithaca, have, become, the, new, detroit, with, all, the, stabbings, this, year]"
50293,"If the math library is open, you can enter from the Bailey Hall entrance.\n\nBut yes.","[if, the, math, library, be, open, you, can, enter, from, the, bailey, hall, entrance, but, yes]"
32900,What's the closest place to Baker to get it?,"[what's, the, close, place, to, baker, to, get, it]"
28112,"You could sit in or talk to the professor (I think it meets at the same time as 2110), but you'll want to do it soon, because I think the add deadline is 6 September. (You can still add by petition afterwards, IIRC.)","[you, could, sit, in, or, talk, to, the, professor, i, think, it, meet, at, the, same, time, a, but, you'll, want, to, do, it, soon, because, i, think, the, add, deadline, be, september, you, can, still, add, by, petition, afterwards, iirc]"
33784,"Otherwise than the college requirements, there is no difference. The major is exactly the same in both colleges. It is also very easy to switch between, from what I heard","[otherwise, than, the, college, requirement, there, be, no, difference, the, major, be, exactly, the, same, in, both, college, it, be, also, very, easy, to, switch, between, from, what, i, heard]"
35646,Hey whats up,"[hey, whats, up]"
65026,"stop being introverted, for like a moment. You just need to meet people which is the biggest hump. Afterwards hang out 1-1 and be as introverted as you want.","[stop, be, introvert, for, like, a, moment, you, just, need, to, meet, people, which, be, the, big, hump, afterwards, hang, out, and, be, as, introvert, a, you, want]"
25284,"The ones in Teagle are open, so no privacy. I'm not sure about Helen Newman or any other place.","[the, one, in, teagle, be, open, so, no, privacy, i'm, not, sure, about, helen, newman, or, any, other, place]"
