## Package Imports

In [2]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Format Data

In [6]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [67]:
def corpus_to_df(corpus):

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## DF Level Cleaning

In [76]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)
URL_RE = re.compile(r"http\S+|www\.\S+")
HAS_LETTER_RE = re.compile(r"[A-Za-z]")

## Cleaning

In [79]:
def clean_text_lexical(text):

    # remove emojis

    # remove urls
    text = URL_RE.sub("", text)

    return text

In [17]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [4]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized_tokens

In [5]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation, numbers, and emojis
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue
        # skip over emojis

        # only keep alphabetic tokens
        if tok.isalpha():
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Preprocess Function

In [81]:
def preprocess_df(df):
    # remove deleted/removed utterances
    df = df[~df["text"].str.lower().isin({"[deleted]", "[removed]"})]

    # remove bot authored utterances
    df = df[~df["text"].str.contains(BOT_TEXT_RE)]

    # remove utterances without a letter
    df = df[df["text"].str.contains(HAS_LETTER_RE)]

    # clean text
    df["clean_text"] = df["text"].apply(clean_text_lexical)

    # tokenize
    df["tokens"] = df["clean_text"].apply(tokenize)

    # lemmatize
    df["lemmas"] = df["tokens"].apply(lemmatize)

    return df

## Testing

In [7]:
import random

In [31]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 100)

In [33]:
pd.set_option('display.max_rows', 100)

In [82]:
df = corpus_to_df(corpus)
df = preprocess_df(df)

df

  df = df[~df["text"].str.contains(BOT_TEXT_RE)]


Unnamed: 0,utterance_id,speaker_id,text,timestamp,clean_text,tokens,lemmas
0,nyx4d,reddmau5,I was just reading about the Princeton Mic-Che...,2012-01-01 16:18:18,I was just reading about the Princeton Mic-Che...,"[I, was, just, reading, about, the, Princeton,...","[I, wa, just, reading, about, the, Princeton, ..."
1,o0145,shtylman,I have added support for Cornell to courseoff....,2012-01-02 13:57:15,I have added support for Cornell to courseoff....,"[I, have, added, support, for, Cornell, to, co...","[I, have, added, support, for, Cornell, to, co..."
2,o1gca,moon_river,"i don't have a facebook, so we'd need a volunt...",2012-01-03 14:55:06,"i don't have a facebook, so we'd need a volunt...","[i, don't, have, a, facebook, ,, so, we'd, nee...","[i, don't, have, a, facebook, ,, so, we'd, nee..."
3,o0ss4,moon_river,"so, i'm starting to mess with some of the css ...",2012-01-03 01:16:17,"so, i'm starting to mess with some of the css ...","[so, ,, i'm, starting, to, mess, with, some, o...","[so, ,, i'm, starting, to, mess, with, some, o..."
4,o4ipd,reddmau5,Ever since SOPA put fear into the hearts of ev...,2012-01-05 17:08:06,Ever since SOPA put fear into the hearts of ev...,"[Ever, since, SOPA, put, fear, into, the, hear...","[Ever, since, SOPA, put, fear, into, the, hear..."
...,...,...,...,...,...,...,...
72981,e8tj9ig,JCsurfing,"If your ECs are good, maybe.",2018-10-31 19:39:29,"If your ECs are good, maybe.","[If, your, ECs, are, good, ,, maybe, .]","[If, your, ECs, are, good, ,, maybe, .]"
72983,e8tjyg1,ultimatefishlover,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩...,2018-10-31 19:50:51,harvard is the **i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞è̶͚͙̳̩...,"[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞...","[harvard, is, the, *, *, i̼͕̻͓̩̘͟n͙̞̙f̶̢̙̻̺͍̟͞..."
72984,e8tkb66,KickAssEmployee,Agreed,2018-10-31 19:56:45,Agreed,[Agreed],[Agreed]
72985,e8tkctl,dasfsadf123,Why did this make me laugh so hard ahaahha,2018-10-31 19:57:30,Why did this make me laugh so hard ahaahha,"[Why, did, this, make, me, laugh, so, hard, ah...","[Why, did, this, make, me, laugh, so, hard, ah..."
