# Data Processing

### Speech Type Decoding

1: Partisan rallies

2: Formal congressional floor speeches

3: Bipartisan events

In [None]:
# required packages
import os 
import pandas as pd
import string
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')


In [21]:
def load_speeches_from_directory(base_dir="data"):
    data = []

    # loop through each folder in base_dir
    for person_folder in os.listdir(base_dir):
        person_path = os.path.join(base_dir, person_folder)

        if not os.path.isdir(person_path) or "_" not in person_folder:
            continue

        # loop through all text files
        for filename in os.listdir(person_path):
            if filename.startswith(".") or not filename.endswith(".txt"):
                continue

            # extract speech type
            speech_type_str = filename.split("_")[0]
            try:
                speech_type = int(speech_type_str)
            except ValueError:
                continue 

            file_path = os.path.join(person_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            data.append({
                "person": person_folder,
                "speech_type": speech_type,
                "text": text
            })

    return pd.DataFrame(data)

In [29]:
df = load_speeches_from_directory("data")
df.head()

Unnamed: 0,person,speech_type,text
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...
1,desantis_ron,1,Governor DeSantis (00:07):\nThank you. Thanks ...
2,desantis_ron,1,Gov. Ron DeSantis (00:00):\n\nMy fellow Republ...
3,desantis_ron,1,Ron DeSantis: (00:03) Thank you. Thank you so ...
4,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo..."


### Data Cleaning

In [23]:
# stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text_simple(text):
    # lowercase, remove punctuation, tokenize, and remove stopwords
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = re.findall(r'\b\w+\b', text)
    words = [w for w in words if w not in stop_words]
    # lemmatize
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# clean the df
df["clean_text"] = df["text"].apply(clean_text_simple)
df.head()

Unnamed: 0,person,speech_type,text,clean_text
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...,cpac let welcome freest state united state luc...
1,desantis_ron,1,Governor DeSantis (00:07):\nThank you. Thanks ...,governor desantis 0007 thank thanks much god b...
2,desantis_ron,1,Gov. Ron DeSantis (00:00):\n\nMy fellow Republ...,gov ron desantis 0000 fellow republican let se...
3,desantis_ron,1,Ron DeSantis: (00:03) Thank you. Thank you so ...,ron desantis 0003 thank thank much always wond...
4,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo...",thank thank much well thank much past four yea...


In [31]:
def remove_timestamps(text):
    # removes timestamps from video transcripts (ie. 0700)
    text = re.sub(r"\(\s*\d{1,2}\s*[: ]\s*\d{2}\s*\)", "", text)
    text = re.sub(r"\b\d{1,2}:\d{2}\b", "", text)
    text = re.sub(r"\b\d{4}\b", "", text)
    text = re.sub(r"\b\d{1,2}\s+\d{2}\b", "", text)

    # collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [32]:
df["clean_text"] = df["clean_text"].apply(remove_timestamps)

df.head()

Unnamed: 0,person,speech_type,text,clean_text
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...,cpac let welcome freest state united state luc...
1,desantis_ron,1,Governor DeSantis (00:07):\nThank you. Thanks ...,governor desantis thank thanks much god bless ...
2,desantis_ron,1,Gov. Ron DeSantis (00:00):\n\nMy fellow Republ...,gov ron desantis fellow republican let send jo...
3,desantis_ron,1,Ron DeSantis: (00:03) Thank you. Thank you so ...,ron desantis thank thank much always wonderful...
4,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo...",thank thank much well thank much past four yea...


# Linguistic Analysis

In [34]:
# sentence lengths / number of sentences
def avg_sentence_length(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return 0
    lengths = [len(word_tokenize(s)) for s in sentences]
    return sum(lengths) / len(lengths)

In [45]:
df["sentence_length"] = df["text"].apply(avg_sentence_length)
df.head()

Unnamed: 0,person,speech_type,text,clean_text,length,unique,sentence_length
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...,cpac let welcome freest state united state luc...,18.416667,0.250528,18.416667
1,desantis_ron,1,Governor DeSantis (00:07):\nThank you. Thanks ...,governor desantis thank thanks much god bless ...,21.833333,0.197943,21.833333
2,desantis_ron,1,Gov. Ron DeSantis (00:00):\n\nMy fellow Republ...,gov ron desantis fellow republican let send jo...,17.724138,0.388132,17.724138
3,desantis_ron,1,Ron DeSantis: (00:03) Thank you. Thank you so ...,ron desantis thank thank much always wonderful...,30.036364,0.230932,30.036364
4,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo...",thank thank much well thank much past four yea...,13.369863,0.353846,13.369863


In [35]:
# unique words / total words
def lexical_diversity(text):
    words = word_tokenize(text.lower())
    if len(words) == 0:
        return 0
    return len(set(words)) / len(words)

In [46]:
df["lexical_diversity"] = df["text"].apply(lexical_diversity)
df.head()

Unnamed: 0,person,speech_type,text,clean_text,length,unique,sentence_length,lexical_diversity
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...,cpac let welcome freest state united state luc...,18.416667,0.250528,18.416667,0.250528
1,desantis_ron,1,Governor DeSantis (00:07):\nThank you. Thanks ...,governor desantis thank thanks much god bless ...,21.833333,0.197943,21.833333,0.197943
2,desantis_ron,1,Gov. Ron DeSantis (00:00):\n\nMy fellow Republ...,gov ron desantis fellow republican let send jo...,17.724138,0.388132,17.724138,0.388132
3,desantis_ron,1,Ron DeSantis: (00:03) Thank you. Thank you so ...,ron desantis thank thank much always wonderful...,30.036364,0.230932,30.036364,0.230932
4,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo...",thank thank much well thank much past four yea...,13.369863,0.353846,13.369863,0.353846


In [47]:
# pronoun counter
PRONOUN_SETS = {
    "first_singular": {"i", "me", "my", "mine"},
    "first_plural": {"we", "us", "our", "ours"},
    "second_person": {"you", "your", "yours"},
    "third_person": {"he", "him", "his", "she", "her", "hers",
                     "they", "them", "their", "theirs"}
}

def pronoun_usage(text):
    words = word_tokenize(text.lower())
    counts = Counter()
    for w in words:
        for label, group in PRONOUN_SETS.items():
            if w in group:
                counts[label] += 1
    return counts

In [49]:
pronoun_df = df["text"].apply(pronoun_usage).apply(pd.Series).fillna(0)

pronoun_df.head()

Unnamed: 0,first_singular,second_person,first_plural,third_person
0,50,45,129,94
1,94,112,193,120
2,4,5,41,28
3,29,50,104,77
4,15,22,53,11


**TODO:**

- Artificial neural network as implemented in the word2vec mode, including skip-gram and continuous bag-of-word mechanisms (Python’s Genism package)

- Computational semantic space modeling to identify whether recurring themes differ depending on audience

- TF-IDF analysis for each setting of speech by politician

- Embedding-based analysis (similarity, distance based, vector spaces, plane spaces) to measure a politician’s semantic speech space

- Clustering (Hierarchical clustering, GMM) or classification models (SVMs, Mixed-Effects Logistic Regression, Softmax Regression) trained on one setting (ie. partisan rallies) and tested on another (ie. formal congressional floor speeches) to evaluate speech style