# Sentiment Analysis

In [12]:
# import necessary packages
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"

import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import plotly.express as px

In [4]:
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

Device set to use mps:0


In [5]:
file_path = Path("data/processed")
politician = []
type_of_speech = []
file_name= []
text = []

for file in file_path.rglob("*.txt"):
    #separated_file_name = str(file).split("\\") # windows

    #politician.append(separated_file_name[2])
    #type_of_speech.append(separated_file_name[3])
    #file_name.append(separated_file_name[4])
    
    parts = file.parts # Safe for all OS
    politician.append(parts[-3])
    type_of_speech.append(parts[-2])
    file_name.append(parts[-1])
    
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        text.append(content)

df = pd.DataFrame({
    'politician': politician,
    'type_of_speech': type_of_speech,
    'file_name': file_name,
    'text': text
})
df

Unnamed: 0,politician,type_of_speech,file_name,text
0,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_senators_king_collins_celeb...,skip to content click here to sign up for the ...
1,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_experts_on_i...,skip to content click here to sign up for the ...
2,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_backed_legislation_wou...,skip to content click here to sign up for the ...
3,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_colleagues_call_on_maj...,skip to content click here to sign up for the ...
4,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_strategic_co...,skip to content click here to sign up for the ...
...,...,...,...,...
2023,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_urges_doj...,
2024,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_dhs_must_...,
2025,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_press_releases_congressman_...,
2026,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_applauds_...,


### Hugging Face Model Limitations

Only accepts 512 tokens at a time. . .

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/distilbert-base-uncased-emotion"
)

def chunk_text(text, max_tokens=512):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    # break tokens into lists of <=512
    chunked = [tokens[i:i+max_tokens] for i in range(0, len(tokens), max_tokens)]
    # decode each chunk back to text
    return [tokenizer.decode(chunk) for chunk in chunked]

In [8]:
def classify_long_text(text, classifier, tokenizer, max_tokens=512):
    if not isinstance(text, str) or text.strip() == "":
        return {label: 0.0 for label in ["sadness","joy","love","anger","fear","surprise"]}

    # Tokenize text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Split tokens into chunks <= max_tokens
    chunks = [tokens[i:i+max_tokens] for i in range(0, len(tokens), max_tokens)]
    
    # Decode each chunk and truncate to max_tokens just in case
    decoded_chunks = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

    # Run classifier with truncation
    results = classifier(decoded_chunks, truncation=True, max_length=max_tokens)

    # Extract labels
    labels = [d["label"] for d in results[0]]
    score_matrix = np.array([[d["score"] for d in chunk] for chunk in results])
    avg_scores = score_matrix.mean(axis=0)

    return dict(zip(labels, avg_scores))


In [9]:
# small batch test
classify_long_text(df.loc[0, "text"], classifier, tokenizer)

{'sadness': np.float64(0.023634135723114014),
 'joy': np.float64(0.7674825191497803),
 'love': np.float64(0.019324984401464462),
 'anger': np.float64(0.16767126321792603),
 'fear': np.float64(0.019324718043208122),
 'surprise': np.float64(0.002562319627031684)}

In [None]:
###WARNING LONG RUNTIME###

df["sentiment_scores"] = df["text"].apply(lambda t: classify_long_text(t, classifier, tokenizer))
sentiment_df = df["sentiment_scores"].apply(pd.Series)
df = pd.concat([df, sentiment_df], axis = 1) #Creating one df with each sentiment score as its own column

Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors


In [10]:
df.head()

Unnamed: 0,politician,type_of_speech,file_name,text
0,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_senators_king_collins_celeb...,skip to content click here to sign up for the ...
1,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_experts_on_i...,skip to content click here to sign up for the ...
2,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_backed_legislation_wou...,skip to content click here to sign up for the ...
3,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_colleagues_call_on_maj...,skip to content click here to sign up for the ...
4,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_strategic_co...,skip to content click here to sign up for the ...


In [None]:
#Saving new df with columns for each sentiment to output_sentiment

df.to_csv("output_sentiment.csv", index = False)
# df1, df2, df3 = np.array_split(df, 3)

In [None]:
#df1.to_csv("output1.csv", index=False)
#df2.to_csv("output2.csv", index=False)
#df3.to_csv("output3.csv", index=False)

## Perform Topic Modeling using output_sentiment.csv

### Functions to perform: topic modeling, semantic embeddings & clustering, deeper sentiment analysis, and analysis by group

In [2]:
#Load and clean dataframe
sentiment_df = pd.read_csv("output_sentiment.csv")
#sentiment_df
sentiment_df = sentiment_df.drop(columns="sentiment_scores")
        
sentiment_df = sentiment_df.dropna(subset=['text'])
sentiment_df

Unnamed: 0,politician,type_of_speech,file_name,text,sadness,joy,love,anger,fear,surprise
0,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_senators_king_collins_celeb...,skip to content click here to sign up for the ...,0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
1,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_experts_on_i...,skip to content click here to sign up for the ...,0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
2,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_backed_legislation_wou...,skip to content click here to sign up for the ...,0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
3,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_colleagues_call_on_maj...,skip to content click here to sign up for the ...,0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
4,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_strategic_co...,skip to content click here to sign up for the ...,0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
...,...,...,...,...,...,...,...,...,...,...
2013,biggs_andy,partisan_rally_speeches,"Rep._andy_biggs__rest_in_peace_charlie_kirk,_l...","today, we tragically lost a tremendous leader ...",0.175069,0.719658,0.078390,0.020801,0.004257,0.001824
2014,biggs_andy,partisan_rally_speeches,Newsmaker_sunday__andy_biggs_processed.txt,"thanks for joining us on fox 10 news, mick rou...",0.057725,0.588294,0.003355,0.271869,0.067209,0.011548
2015,biggs_andy,partisan_rally_speeches,Rep._-_elect_andy_biggs_&_rep.-elect_tom_o'hal...,"music coming up next on arizona horizon, a vis...",0.014660,0.732428,0.002297,0.225168,0.023354,0.002093
2016,biggs_andy,partisan_rally_speeches,Rep._andy_biggs_on_health_care__republicans_ne...,"house freedom caucus, femma or congressmen, an...",0.003175,0.499923,0.002003,0.388193,0.017581,0.089124


Helper function for performing modeling. Returns: topics with top words used, topic distributions, the NMF model used, and the TF-IDF Vectorizer used.
Uses tfidf vectorizer for NMF topic modeling

In [3]:
def perform_nmf_topic_modeling(texts, n_topics = 5, n_top_words = 10):
    #Perform NMF topic modeling
    if hasattr(texts, 'tolist'):
        texts_list = texts.tolist()
    else:
        texts_list = list(texts)
        
    texts_cleaned = []
    for text in texts_list:
        if pd.isna(text) or text is None:
            texts_cleaned.append("")
        elif isinstance(text, str):
            texts_cleaned.append(text)
        else:
            texts_cleaned.append(str(text))
    
    vectorizer = TfidfVectorizer(max_features = 1000, stop_words = 'english', max_df = 0.8, min_df = 2)
    tfidf_matrix = vectorizer.fit_transform(texts_cleaned)
    
    nmf = NMF(n_components=n_topics, random_state=42, max_iter = 200)
    nmf.fit(tfidf_matrix)
    
    #Handle both old and new scikit-learn versions
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    
    #get features
    feature_names = vectorizer.get_feature_names_out()
    
    #get topics
    topics = {}
    for topic_idx, topic in enumerate(nmf.components_):
        top_words_idx = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics[f"Topic_{topic_idx}"] = top_words
        
    topic_distributions = nmf.transform(tfidf_matrix)
    return topics, topic_distributions, nmf, vectorizer

In [4]:
#Perform topic modeling
topics_nmf, topic_dist_nmf, nmf_model, nmf_vectorizer = perform_nmf_topic_modeling(sentiment_df['text'], n_topics=7)

print("\nNMF Topics:")
for topic_name, words in topics_nmf.items():
    print(f"{topic_name}: {', '.join(words)}")

# Add dominant topic to dataframe
sentiment_df['dominant_topic_lda'] = topic_dist_nmf.argmax(axis=1)
for i in range(topic_dist_nmf.shape[1]):
    sentiment_df[f'topic_{i}_score'] = topic_dist_nmf[:, i]


NMF Topics:
Topic_0: people, going, know, think, want, thank, just, right, like, country
Topic_1: mr, speaker, congressional, record, house, office, page, online, january, number
Topic_2: senator, 2025, trump, washington, senate, senators, november, administration, statement, act
Topic_3: presiding, mr, officer, president, senate, clerk, motion, senator, yeas, nays
Topic_4: services, veterans, appropriations, press, agency, voting, committee, grants, capitol, student
Topic_5: health, care, tax, families, republicans, insurance, affordable, costs, people, healthcare
Topic_6: mr, speaker, act, shall, section, amendment, time, yield, states, gentleman


Helper function for semantic embeddings and clustering. Returns an array of embeddings.

In [5]:
def get_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=8):
    """Get embeddings safely"""
    
    # Clean texts
    texts_cleaned = [str(t).strip() if pd.notna(t) and isinstance(t, str) else "" 
                     for t in texts]
    
    valid_indices = [i for i, t in enumerate(texts_cleaned) if t]
    valid_texts = [texts_cleaned[i] for i in valid_indices]
    
    print(f"Encoding {len(valid_texts)} texts")
    
    # Load model (only once)
    model = SentenceTransformer(model_name)
    
    # Encode
    valid_embeddings = model.encode(
        valid_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=False
    )
    
    # Create full array
    embedding_dim = valid_embeddings.shape[1]
    all_embeddings = np.zeros((len(texts_cleaned), embedding_dim))
    all_embeddings[valid_indices] = valid_embeddings
    
    return all_embeddings

Helper function for performing k-means clustering. Returns the clusters, and k

In [6]:
#Get embeddings and perform clustering
#embeddings = get_embeddings(sentiment_df['text'].tolist())

def perform_clustering(embeddings, n_clusters=5):
    """Perform k-means clustering"""
    
    # Remove zero rows
    valid_mask = ~np.all(embeddings == 0, axis=1)
    valid_embeddings = embeddings[valid_mask]
    
    print(f"Clustering {len(valid_embeddings)} valid embeddings")
    
    kmeans = KMeans(
        n_clusters=min(n_clusters, len(valid_embeddings)),
        random_state=42,
        n_init=10
    )
    
    valid_clusters = kmeans.fit_predict(valid_embeddings)
    
    # Map back
    clusters = np.full(len(embeddings), -1)
    clusters[valid_mask] = valid_clusters
    
    return clusters, kmeans

In [None]:
#Get embeddings and perform clustering. Add cluster column to df
embeddings = get_embeddings(sentiment_df['text'].tolist())

clusters, kmeans_model = perform_clustering(embeddings, n_clusters=5)
sentiment_df['cluster'] = clusters

#Generate clusters using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(sentiment_df)-1))
embeddings_2d = tsne.fit_transform(embeddings)

#Add embeddings to df
sentiment_df['tsne_x'] = embeddings_2d[:,0]
sentiment_df['tsne_y'] = embeddings_2d[:,1]

Encoding 1762 texts


Batches:   0%|          | 0/221 [00:00<?, ?it/s]

Clustering 1762 valid embeddings


In [11]:
print(sentiment_df)

           politician                 type_of_speech  \
0     king_angus_s_jr  bipartisan_and_other_speeches   
1     king_angus_s_jr  bipartisan_and_other_speeches   
2     king_angus_s_jr  bipartisan_and_other_speeches   
3     king_angus_s_jr  bipartisan_and_other_speeches   
4     king_angus_s_jr  bipartisan_and_other_speeches   
...               ...                            ...   
2013       biggs_andy        partisan_rally_speeches   
2014       biggs_andy        partisan_rally_speeches   
2015       biggs_andy        partisan_rally_speeches   
2016       biggs_andy        partisan_rally_speeches   
2017       biggs_andy        partisan_rally_speeches   

                                              file_name  \
0     bipartisan_unknown_senators_king_collins_celeb...   
1     bipartisan_unknown_king_questions_experts_on_i...   
2     bipartisan_unknown_king_backed_legislation_wou...   
3     bipartisan_unknown_king_colleagues_call_on_maj...   
4     bipartisan_unknown_king_qu

In [16]:
#Visualize tsne clusters
fig = px.scatter(sentiment_df, x="tsne_x", y="tsne_y", hover_name="politician")
fig.show()