# Data Processing

### Speech Type Decoding

1: Partisan rallies

2: Formal congressional floor speeches

3: Bipartisan events

In [15]:
# required packages
import os 
import pandas as pd
import string
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
def load_speeches_from_directory(base_dir="data"):
    """Load every labelled speech transcript under ``base_dir`` into a DataFrame.

    Expected layout: ``base_dir/<person_folder>/<speech_type>_<rest>.txt``.
    Only sub-directories whose name contains an underscore are scanned.
    Inside them, hidden files, non-``.txt`` files, and files whose prefix
    before the first underscore is not an integer are skipped.

    Parameters
    ----------
    base_dir : str or path-like, default "data"
        Root directory holding one sub-directory per person.

    Returns
    -------
    pandas.DataFrame
        One row per transcript with columns ``person`` (folder name),
        ``speech_type`` (integer prefix of the file name) and ``text``
        (file contents decoded as UTF-8). Rows are ordered by folder name,
        then file name. The three columns are present even when no
        transcript is found.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when ``base_dir`` does not exist.
    """
    columns = ["person", "speech_type", "text"]
    records = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the index) reproducible across machines and runs.
    for person_folder in sorted(os.listdir(base_dir)):
        person_path = os.path.join(base_dir, person_folder)

        if "_" not in person_folder or not os.path.isdir(person_path):
            continue

        for filename in sorted(os.listdir(person_path)):
            if filename.startswith(".") or not filename.endswith(".txt"):
                continue

            # The token before the first underscore is the numeric speech-type code.
            try:
                speech_type = int(filename.split("_")[0])
            except ValueError:
                # No integer prefix: not a labelled transcript, skip it.
                continue

            file_path = os.path.join(person_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            records.append({
                "person": person_folder,
                "speech_type": speech_type,
                "text": text
            })

    # Passing `columns` fixes the schema so downstream cells that index
    # df["text"] still work when the corpus is empty.
    return pd.DataFrame(records, columns=columns)

In [17]:
# Build the speech corpus from the per-person folders under "data"
# (one row per transcript) and preview the first rows.
df = load_speeches_from_directory(base_dir="data")
df.head()

Unnamed: 0,person,speech_type,text
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...
1,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo..."
2,desantis_ron,2,"Mr. Speaker, Mr. President, members of the cab..."
3,desantis_ron,3,"Mr. Chief Justice, Senator and Mrs. Scott, mem..."


### Data Cleaning

In [16]:
# NLTK resources used by the cleaning step:
#   - "stopwords": the English stop-word list loaded below
#   - "wordnet" (+ "omw-1.4" on some NLTK releases): data WordNetLemmatizer
#     reads on its first lemmatize() call; without it a fresh environment
#     raises LookupError.
# quiet=True keeps the download log, which prints local paths, out of the
# saved notebook output.
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Translation table built once instead of on every call.
# The straight apostrophe maps to a space so contractions split into fragments
# ("don't" -> "don t") that the stop-word filter can drop; deleting it would
# produce "dont", which is not in the stop-word list. Every other ASCII
# punctuation character is deleted.
_PUNCT_TABLE = str.maketrans("'", " ", string.punctuation.replace("'", ""))


def clean_text_simple(text):
    """Normalize a transcript for bag-of-words style analysis.

    Steps: lowercase, strip ASCII punctuation (apostrophes become token
    separators), tokenize on word characters, drop tokens found in the
    module-level ``stop_words`` set, then lemmatize each remaining token with
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when nothing survives.
    """
    normalized = text.lower().translate(_PUNCT_TABLE)
    tokens = re.findall(r'\b\w+\b', normalized)
    kept = [tok for tok in tokens if tok not in stop_words]
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Store a normalized version of every transcript alongside the raw text,
# then preview the result.
df["clean_text"] = df["text"].map(clean_text_simple)
df.head()

Unnamed: 0,person,speech_type,text,clean_text
0,desantis_ron,1,CPAC. Let me welcome you to the freest state ...,cpac let welcome freest state united state luc...
1,desantis_ron,1,"Thank you. Thank you very much. Well, thank yo...",thank thank much well thank much past four yea...
2,desantis_ron,2,"Mr. Speaker, Mr. President, members of the cab...",mr speaker mr president member cabinet legisla...
3,desantis_ron,3,"Mr. Chief Justice, Senator and Mrs. Scott, mem...",mr chief justice senator mr scott member cabin...


**TODO:**

- Linguistic feature analysis (e.g., average sentence length, lexical diversity, pronoun use to capture shifts in formality or inclusivity, and the words used to conclude sentences)

- Artificial neural network as implemented in the word2vec model, including the skip-gram and continuous bag-of-words (CBOW) mechanisms (Python’s Gensim package)

- Computational semantic space modeling to identify whether recurring themes differ depending on audience

- TF-IDF analysis for each setting of speech by politician

- Embedding-based analysis (similarity, distance based, vector spaces, plane spaces) to measure a politician’s semantic speech space

- Clustering (Hierarchical clustering, GMM) or classification models (SVMs, Mixed-Effects Logistic Regression, Softmax Regression) trained on one setting (e.g., partisan rallies) and tested on another (e.g., formal congressional floor speeches) to evaluate speech style