## Package Imports

In [2]:
# general imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# imports specific to lexical measures
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld
from collections import Counter


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Format Data

In [6]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [67]:
def corpus_to_df(corpus):

    data = []
    for utt in corpus.iter_utterances():
        # only consider utterances with timestamps and text
        if hasattr(utt, "timestamp") and utt.text:
            # convert timestamp from seconds since 1/1/1970 to datetime
            t = datetime.fromtimestamp(int(utt.timestamp))

            data.append({
                "utterance_id": utt.id,
                "speaker_id": utt.speaker.id,
                "text": utt.text,
                "timestamp": t
            })

    df = pd.DataFrame(data)
    return df

## Corpus Level Cleaning

In [40]:
BOT_TEXT_PATTERNS = [
    r"\bi am a bot\b",
    r"\bthis (comment|post) was (posted|left by) a bot",
    r"\bthis reply was generated automatically",
    r"[\^*]*beep(?:\s+beep)?[\^*]*\s+[\^*]*boop(?:\s+boop)?[\^*]*"
]

BOT_TEXT_RE = re.compile("|".join(BOT_TEXT_PATTERNS), flags=re.IGNORECASE)

In [46]:
def filter_deleted_utterances(corpus):
    for utt in corpus.iter_utterances():
        text = utt.text
        if text in {"[deleted]", "[removed]"}:
            corpus.delete_utterance(utt.id)

In [45]:
def filter_bot_utterances(corpus):
    for utt in corpus.iter_utterances():
        text = utt.text
        if BOT_TEXT_RE.search(text):
            corpus.delete_utterance(utt.id)

In [47]:
filter_deleted_utterances(corpus)
filter_bot_utterances(corpus)

AttributeError: 'Corpus' object has no attribute 'delete_utterance'

## Cleaning

In [27]:
REDDIT_PLACEHOLDERS = {"[deleted]", "[removed]", "deleted", "removed"}
_HAS_ALPHANUM = re.compile(r"[A-Za-z]")

BOTS = {}


In [None]:
def clean_text(text):
    '''Helper function to remove deleted/removed utterances, those authored by bots, urls'''


    # remove deleted/removed utterances


    # removes emojis


    # remove URLs


    # remove utterances posted by bots


    return False

In [17]:
def tokenize(text):
    '''Helper function to tokenize social media text. Note that the TweetTokenizer 
    preserves mentions, contractions, and other social media-specific structures'''

    # handle empty text input
    if text is None:
        return []
    
    # return empty list if text is a deleted or removed post
    if text in REDDIT_PLACEHOLDERS:
        return []
    
    # return empty list if text is purely punctuation or symbols
    if not _HAS_ALPHANUM.search(text):
        return []


    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    return tokens

In [4]:
def lemmatize(tokens):
    '''Helper function to lemmatize tokens.'''

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized_tokens

In [5]:
def clean_tokens_lexical(text):
    '''Helper function that tokenizes text, cleans tokens by removing punctuation, numbers, and emojis
    for purely lexical analysis, and returns the cleaned, lemmatized tokens.'''

    # tokenize text
    tokens = tokenize(text)

    # clean tokens
    cleaned = []
    for tok in tokens:
        # skip over punctuation
        if re.match(r'^\W+$', tok):
            continue
        # skip over emojis

        # only keep alphabetic tokens
        if tok.isalpha():
            cleaned.append(tok.lower())

    # lemmatize clean tokens
    lemmatized = lemmatize(cleaned)

    return lemmatized

## Testing

In [7]:
import random

In [31]:
utterance = list(corpus.iter_utterances())
sample_utts = random.sample(utterance, 100)

In [33]:
pd.set_option('display.max_rows', 100)

In [34]:
rows = []

for utt in sample_utts:
    text = utt.text
    rows.append({
        "raw_text": text,
        "tokens": tokenize(text),
        "lemmatized": lemmatize(tokenize(text)),
        "cleaned": clean_tokens_lexical(text)
    })

df = pd.DataFrame(rows)
df

Unnamed: 0,raw_text,tokens,lemmatized,cleaned
0,Welcome to the family. Do you know which major?,"[Welcome, to, the, family, ., Do, you, know, w...","[Welcome, to, the, family, ., Do, you, know, w...","[welcome, to, the, family, do, you, know, whic..."
1,The only person that can give you access to yo...,"[The, only, person, that, can, give, you, acce...","[The, only, person, that, can, give, you, acce...","[the, only, person, that, can, give, you, acce..."
2,[http://courses.cornell.edu/content.php?catoid...,"[[, http://courses.cornell.edu/content.php?cat...","[[, http://courses.cornell.edu/content.php?cat...","[here, tl, r, they, count, for, your, gpa, at,..."
3,Any pros/cons to living in an apartment buildi...,"[Any, pros, /, cons, to, living, in, an, apart...","[Any, pro, /, con, to, living, in, an, apartme...","[any, pro, con, to, living, in, an, apartment,..."
4,[deleted],[],[],[]
5,I'm an incoming engineering student interested...,"[I'm, an, incoming, engineering, student, inte...","[I'm, an, incoming, engineering, student, inte...","[an, incoming, engineering, student, intereste..."
6,If you are a CS major then talk to the Departm...,"[If, you, are, a, CS, major, then, talk, to, t...","[If, you, are, a, CS, major, then, talk, to, t...","[if, you, are, a, c, major, then, talk, to, th..."
7,"AEP Major here (engineering physics in CoE), s...","[AEP, Major, here, (, engineering, physics, in...","[AEP, Major, here, (, engineering, physic, in,...","[aep, major, here, engineering, physic, in, co..."
8,"Yes, it is possible. I got in with an even low...","[Yes, ,, it, is, possible, ., I, got, in, with...","[Yes, ,, it, is, possible, ., I, got, in, with...","[yes, it, is, possible, i, got, in, with, an, ..."
9,Genuinely not trying to begrudge you.. BUT REA...,"[Genuinely, not, trying, to, begrudge, you, .....","[Genuinely, not, trying, to, begrudge, you, .....","[genuinely, not, trying, to, begrudge, you, bu..."


In [37]:
with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None
):
    display(df.loc[2])

raw_text                          [http://courses.cornell.edu/content.php?catoid=26&amp;navoid=6728](here)\n\nTL;DR they don't count for your GPA at all.
tokens        [[, http://courses.cornell.edu/content.php?catoid=26&navoid=6728, ], (, here, ), TL, ;D, R, they, don't, count, for, your, GPA, at, all, .]
lemmatized    [[, http://courses.cornell.edu/content.php?catoid=26&navoid=6728, ], (, here, ), TL, ;D, R, they, don't, count, for, your, GPA, at, all, .]
cleaned                                                                                               [here, tl, r, they, count, for, your, gpa, at, all]
Name: 2, dtype: object