## Package Imports

In [2]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


## Cleaning

In [10]:
REDDIT_PLACEHOLDERS = {"[deleted]", "[removed]", "deleted", "removed"}

In [11]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    # handle empty text input
    if text is None:
        return []
    
    # return empty list if text is a deleted or removed post
    if text in REDDIT_PLACEHOLDERS:
        return []


    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [4]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized_tokens

In [5]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation, numbers, and emojis
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue
        # skip over emojis

        # only keep alphabetic tokens
        if tok.isalpha():
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Testing

In [7]:
import random

In [8]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 20)

In [15]:
rows = []

for utt in sample_utts:
    text = utt.text
    rows.append({
        "raw_text": text,
        "tokens": tokenize(text),
        "lemmatized": lemmatize(tokenize(text)),
        "cleaned": clean_tokens_lexical(text)
    })

df = pd.DataFrame(rows)
df

Unnamed: 0,raw_text,tokens,lemmatized,cleaned
0,"Called earlier, they just told me to post the ...","[Called, earlier, ,, they, just, told, me, to,...","[Called, earlier, ,, they, just, told, me, to,...","[called, earlier, they, just, told, me, to, po..."
1,It's if you don't pass (in your case you passe...,"[It's, if, you, don't, pass, (, in, your, case...","[It's, if, you, don't, pas, (, in, your, case,...","[if, you, pas, in, your, case, you, passed, th..."
2,Another difficulty is adjusting to the fact th...,"[Another, difficulty, is, adjusting, to, the, ...","[Another, difficulty, is, adjusting, to, the, ...","[another, difficulty, is, adjusting, to, the, ..."
3,I assume this is in reference to racist commen...,"[I, assume, this, is, in, reference, to, racis...","[I, assume, this, is, in, reference, to, racis...","[i, assume, this, is, in, reference, to, racis..."
4,"yeah, unfortunately i'm not sure how to check ...","[yeah, ,, unfortunately, i'm, not, sure, how, ...","[yeah, ,, unfortunately, i'm, not, sure, how, ...","[yeah, unfortunately, not, sure, how, to, chec..."
5,"Lol yes many engineers, including myself, had ...","[Lol, yes, many, engineers, ,, including, myse...","[Lol, yes, many, engineer, ,, including, mysel...","[lol, yes, many, engineer, including, myself, ..."
6,\+1,"[\, +, 1]","[\, +, 1]",[]
7,"Valid point, but I have read that to be even c...","[Valid, point, ,, but, I, have, read, that, to...","[Valid, point, ,, but, I, have, read, that, to...","[valid, point, but, i, have, read, that, to, b..."
8,I liked Gia's Elite at Triphammer Mall (via TC...,"[I, liked, Gia's, Elite, at, Triphammer, Mall,...","[I, liked, Gia's, Elite, at, Triphammer, Mall,...","[i, liked, elite, at, triphammer, mall, via, t..."
9,[deleted],[],[],[]
