## Package Imports and Set-Ups

In [14]:
import re
import numpy as np
import pandas as pd
from wordfreq import zipf_frequency
from convokit import Corpus, download
from tqdm import tqdm
import matplotlib.pyplot as plt

# set up nltk tokenizers
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Helper Functions

In [6]:
def tokenize(text):
    '''Split social media text into tokens using NLTK's TweetTokenizer.

    The TweetTokenizer is used (with its default settings) because it keeps
    mentions and contractions intact as single tokens.'''
    return TweetTokenizer().tokenize(text)

In [7]:
def clean_lexical_tokens(tokens):
    '''Reduce a token list to lowercased, purely alphabetic words.

    Punctuation-only tokens, numbers, emojis and any other token containing a
    non-alphabetic character are dropped so that only lexical items remain.'''

    def is_lexical(token):
        # punctuation-only tokens are rejected up front
        if re.match(r'^\W+$', token):
            return False
        # everything else must consist solely of alphabetic characters
        return token.isalpha()

    return [token.lower() for token in tokens if is_lexical(token)]

## Lexical Analysis Functions

In [8]:
def ttr(text):
    '''Compute the type-token ratio of a text.

    The ratio is the number of unique cleaned words divided by the total
    number of cleaned words. Returns 0.0 when no lexical tokens remain.'''

    words = clean_lexical_tokens(tokenize(text))

    # guard against division by zero for texts without lexical tokens
    if not words:
        return 0.0

    unique_count = len(set(words))
    total_count = len(words)

    return unique_count / total_count

In [9]:
def avg_word_length(text):
    '''Compute the mean number of characters per cleaned word in a text.

    Returns 0.0 when the text contains no lexical tokens.'''

    words = clean_lexical_tokens(tokenize(text))

    # nothing to average when cleaning removed every token
    if not words:
        return 0.0

    word_lengths = [len(w) for w in words]

    return np.mean(word_lengths)

In [10]:
# Load the age-of-acquisition table and build a lookup:
# word -> mean age-of-acquisition rating
aoa_df = pd.read_csv("Data/KupermanAoAData.csv")
aoa_dict = {
    word: rating
    for word, rating in zip(aoa_df["word"], aoa_df["rating_mean"])
}

def aoa_score(text, aoa_dict):
    '''Returns the average age of acquisition score for a given text.

    Parameters:
        text: raw text to score.
        aoa_dict: mapping of lowercase word -> mean age-of-acquisition rating.

    Returns:
        The mean rating over every cleaned token that has a usable rating in
        aoa_dict, or np.nan when no token has one.'''

    words = clean_lexical_tokens(tokenize(text))

    # only words present in the lookup contribute to the score
    ratings = [aoa_dict[word] for word in words if word in aoa_dict]

    # A lookup built from a CSV can hold missing (NaN) ratings; a single NaN
    # would turn the whole mean into NaN, so drop them before averaging.
    ratings = [rating for rating in ratings if pd.notna(rating)]

    # if there are no rated words, return a default value
    if not ratings:
        return np.nan

    return np.mean(ratings)

In [11]:
def zipf_score(text):
    '''Compute the mean Zipf-scale frequency of the cleaned words in a text.

    Higher values mean the words are more frequent in English. Returns np.nan
    when the text contains no lexical tokens.'''

    words = clean_lexical_tokens(tokenize(text))

    # no words -> no frequencies to average, so return a default value
    if not words:
        return np.nan

    frequencies = [zipf_frequency(w, 'en') for w in words]

    return np.mean(frequencies)

## Loading the Data

In [12]:
# Resolve the local path of the r/Cornell corpus, then load it
corpus_path = download("subreddit-Cornell")
corpus = Corpus(filename=corpus_path)

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


In [13]:
N = 1000 # number of utterances to start with

# Collect utterances that have both text and a usable timestamp.
# hasattr() is true even when the attribute exists but is None, so the value
# itself is checked; otherwise missing timestamps would slip through as NaT.
utterances_data = [
    {"id": utt.id, "text": utt.text, "timestamp": utt.timestamp}
    for utt in corpus.iter_utterances()
    if getattr(utt, "timestamp", None) is not None and utt.text
]

# convert list to dataframe for easier access; naming the columns keeps the
# frame well-formed even if no utterance passed the filter
df = pd.DataFrame(utterances_data, columns=["id", "text", "timestamp"])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') # convert Reddit timestamps to datetime
df = df.sort_values('timestamp').reset_index(drop=True)
# keep the N earliest utterances; copy so the metric columns below are added
# to an independent frame rather than to a slice of the sorted one
df = df.head(N).copy()

ttrs, avg_lengths, aoas, zipfs = [], [], [], []
# loop through dataframe and display progress
for text in tqdm(df['text'], desc="Computing linguistic measures"):
    ttrs.append(ttr(text))
    avg_lengths.append(avg_word_length(text))
    aoas.append(aoa_score(text, aoa_dict))
    zipfs.append(zipf_score(text))

# add metrics to dataframe
df['ttr'] = ttrs
df['avg_word_length'] = avg_lengths
df['aoa_score'] = aoas
df['zipf_score'] = zipfs

Computing linguistic measures: 100%|██████████| 1000/1000 [00:00<00:00, 2349.91it/s]


In [None]:


# Aggregate the per-utterance metrics into monthly means. monthly_metrics was
# not defined anywhere before this cell, so it is derived here from df to keep
# the notebook runnable from a fresh kernel.
metric_cols = ['ttr', 'avg_word_length', 'aoa_score', 'zipf_score']
monthly_metrics = df.groupby(df['timestamp'].dt.to_period('M'))[metric_cols].mean()
# convert the period index back to timestamps so matplotlib can plot it
monthly_metrics.index = monthly_metrics.index.to_timestamp()

plt.figure(figsize=(12,6))
for col in metric_cols:
    plt.plot(monthly_metrics.index, monthly_metrics[col], label=col)
plt.xlabel("Time")
plt.ylabel("Metric")
plt.title("Linguistic Quality Trends Over Time")
plt.legend()
plt.show()
