## Quantifying Linguistic Degradation

To quantify lexical quality, I will rely on the following metrics, the rationale for which can be found in my thesis report:
<br>

1. Measure of Textual Lexical Diversity (MTLD)
2. Yule's Characteristic Constant (K)
3. Zipf Distribution's Z Parameter (Z Score)
4. Average Age of Acquisition (Mean AoA)
5. Ratio of Words in the New Academic Word List (NAWL Ratio)

In choosing these metrics, I successfully capture lexical diversity, frequency distributions, word probability distributions, and word sophistication.

## Package Imports

In [13]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# set up nltk tokenizers
# NOTE: the nltk.download calls in this cell run every time the cell executes;
# they report "already up-to-date" when the resource is already installed.
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
nltk.download('punkt_tab')

# set up nltk lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
# single module-level lemmatizer instance, used by the lemmatize() helper
lemmatizer = WordNetLemmatizer()

# imports specific to lexical measures
# (zipf_frequency is used by zipf_score, ld by mtld_score)
import re
from wordfreq import zipf_frequency
from lexical_diversity import lex_div as ld


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  import pkg_resources


## Lexical Helper Functions

In [9]:
def tokenize(text):
    '''Split social media text into a list of tokens.

    A TweetTokenizer with default settings is created on each call and the
    result of its tokenize() method on `text` is returned as-is. It is used
    here because it is meant for social media text (mentions, contractions,
    and similar structures).
    '''
    return TweetTokenizer().tokenize(text)

In [10]:
def lemmatize(tokens):
    '''Return a new list holding the lemma of every token, in input order.

    Each token is passed to the module-level WordNet `lemmatizer` with its
    default arguments.
    '''
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))

    return lemmas

In [11]:
def clean_tokens_lexical(text):
    '''Tokenize `text` and return lowercased, lemmatized, purely alphabetic tokens.

    Tokens are produced by tokenize(); any token that is not entirely
    alphabetic (punctuation, numbers, emojis, or tokens mixing letters with
    other characters) is discarded. The survivors are lowercased and passed
    to lemmatize().
    '''
    # str.isalpha() is False for any token containing a non-letter character,
    # so this single filter drops punctuation-only tokens, numbers and emojis
    alphabetic = [token.lower() for token in tokenize(text) if token.isalpha()]

    return lemmatize(alphabetic)

## Lexical Analysis Functions

In [14]:
def mtld_score(clean_tokens):
    '''Return the MTLD (Measure of Textual Lexical Diversity) score for a set
    of cleaned, lemmatized tokens, as computed by lexical_diversity's mtld()'''
    return ld.mtld(clean_tokens)

In [15]:
def yules_K(clean_tokens):
    '''Function that returns Yule's characteristic constant K for a given set of
    cleaned, lemmatized tokens.

    K = 10^4 * (sum_i f_i^2 - N) / N^2, where f_i is the number of occurrences
    of the i-th distinct token and N is the total number of tokens. This is
    equivalent to the textbook form sum_m m^2 * V(m), with V(m) the number of
    distinct tokens that occur exactly m times. Higher K means more
    repetition (lower lexical diversity); a text in which every token is
    unique has K = 0.

    Returns np.nan when there are no tokens, matching the default value used
    by the other lexical measures in this file.
    '''
    # local import keeps this block self-contained
    from collections import Counter

    # frequency of every distinct token
    frequencies = Counter(clean_tokens)

    # total token count N (taken from the counts so any iterable is accepted)
    n_tokens = sum(frequencies.values())

    # if there are no words, return a default value
    if n_tokens == 0:
        return np.nan

    # sum of squared frequencies, i.e. sum over m of m^2 * V(m)
    sum_squared = sum(freq * freq for freq in frequencies.values())

    K = 10_000 * (sum_squared - n_tokens) / (n_tokens * n_tokens)

    return K

In [16]:
def zipf_score(clean_tokens):
    '''Average Zipf-scale frequency (higher -> more frequent) of a set of
    cleaned, lemmatized tokens, looked up with wordfreq's English data.

    Returns np.nan when there are no tokens to score.
    '''
    # look up the Zipf-scale frequency of every token
    frequencies = []
    for token in clean_tokens:
        frequencies.append(zipf_frequency(token, 'en'))

    # nothing to average -> default value
    if not frequencies:
        return np.nan

    return np.mean(frequencies)

In [None]:
# build aoa_dict: word -> average age of acquisition
# Reads the CSV at the relative path below into a DataFrame, then pairs the
# "word" column with the "rating_mean" column row by row. If a word appears on
# more than one row, dict() keeps the rating from the last such row.
# NOTE(review): presumably these are the Kuperman et al. AoA norms with
# rating_mean expressed in years -- confirm against the data file. Rows with a
# missing rating_mean would carry NaN into aoa_dict; verify whether any exist.
aoa_df = pd.read_csv("Data/KupermanAoAData.csv")
aoa_dict = dict(zip(aoa_df["word"], aoa_df["rating_mean"]))

def aoa_score(clean_tokens, aoa_dict):
    '''Mean age-of-acquisition rating over the tokens that appear in `aoa_dict`.

    Tokens missing from `aoa_dict` are ignored; repeated tokens contribute
    once per occurrence. Returns np.nan when no token has a rating.
    '''
    ratings = []
    for token in clean_tokens:
        # words without an AoA entry do not contribute to the average
        if token not in aoa_dict:
            continue
        ratings.append(aoa_dict[token])

    # no rated words -> default value
    if not ratings:
        return np.nan

    return np.mean(ratings)

In [None]:
def nawl_ratio(clean_tokens):
    '''Returns '''