# Sentiment Analysis

In [33]:
# import necessary packages

import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
import torch
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

Device set to use mps:0


In [17]:
file_path = Path("data/processed")
politician = []
type_of_speech = []
file_name= []
text = []

for file in file_path.rglob("*.txt"):
    #separated_file_name = str(file).split("\\") # windows

    #politician.append(separated_file_name[2])
    #type_of_speech.append(separated_file_name[3])
    #file_name.append(separated_file_name[4])
    
    parts = file.parts # Safe for all OS
    politician.append(parts[-3])
    type_of_speech.append(parts[-2])
    file_name.append(parts[-1])
    
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        text.append(content)

df = pd.DataFrame({
    'politician': politician,
    'type_of_speech': type_of_speech,
    'file_name': file_name,
    'text': text
})
df

Unnamed: 0,politician,type_of_speech,file_name,text
0,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_senators_king_collins_celeb...,skip to content click here to sign up for the ...
1,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_experts_on_i...,skip to content click here to sign up for the ...
2,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_backed_legislation_wou...,skip to content click here to sign up for the ...
3,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_colleagues_call_on_maj...,skip to content click here to sign up for the ...
4,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_strategic_co...,skip to content click here to sign up for the ...
...,...,...,...,...
2023,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_urges_doj...,
2024,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_dhs_must_...,
2025,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_press_releases_congressman_...,
2026,biggs_andy,bipartisan_and_other_speeches,bipartisan_unknown_congressman_biggs_applauds_...,


### Hugging Face Model Limitations

Only accepts 512 tokens at a time. . .

In [18]:
tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/distilbert-base-uncased-emotion"
)

def chunk_text(text, max_tokens=512):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    # break tokens into lists of <=512
    chunked = [tokens[i:i+max_tokens] for i in range(0, len(tokens), max_tokens)]
    # decode each chunk back to text
    return [tokenizer.decode(chunk) for chunk in chunked]

In [25]:
def classify_long_text(text, classifier, tokenizer, max_tokens=512):
    if not isinstance(text, str) or text.strip() == "":
        return {label: 0.0 for label in ["sadness","joy","love","anger","fear","surprise"]}

    # Tokenize text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Split tokens into chunks <= max_tokens
    chunks = [tokens[i:i+max_tokens] for i in range(0, len(tokens), max_tokens)]
    
    # Decode each chunk and truncate to max_tokens just in case
    decoded_chunks = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

    # Run classifier with truncation
    results = classifier(decoded_chunks, truncation=True, max_length=max_tokens)

    # Extract labels
    labels = [d["label"] for d in results[0]]
    score_matrix = np.array([[d["score"] for d in chunk] for chunk in results])
    avg_scores = score_matrix.mean(axis=0)

    return dict(zip(labels, avg_scores))


In [26]:
# small batch test
classify_long_text(df.loc[0, "text"], classifier, tokenizer)

{'sadness': np.float64(0.023634135723114014),
 'joy': np.float64(0.7674825191497803),
 'love': np.float64(0.019324984401464462),
 'anger': np.float64(0.16767126321792603),
 'fear': np.float64(0.019324718043208122),
 'surprise': np.float64(0.002562319627031684)}

In [27]:
df["sentiment_scores"] = df["text"].apply(lambda t: classify_long_text(t, classifier, tokenizer))
sentiment_df = df["sentiment_scores"].apply(pd.Series)
df = pd.concat([df, sentiment_df], axis = 1) #Creating one df with each sentiment score as its own column

In [28]:
df.head()

Unnamed: 0,politician,type_of_speech,file_name,text,sentiment_scores,sadness,joy,love,anger,fear,surprise
0,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_senators_king_collins_celeb...,skip to content click here to sign up for the ...,"{'sadness': 0.023634135723114014, 'joy': 0.767...",0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
1,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_experts_on_i...,skip to content click here to sign up for the ...,"{'sadness': 0.023634135723114014, 'joy': 0.767...",0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
2,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_backed_legislation_wou...,skip to content click here to sign up for the ...,"{'sadness': 0.023634135723114014, 'joy': 0.767...",0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
3,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_colleagues_call_on_maj...,skip to content click here to sign up for the ...,"{'sadness': 0.023634135723114014, 'joy': 0.767...",0.023634,0.767483,0.019325,0.167671,0.019325,0.002562
4,king_angus_s_jr,bipartisan_and_other_speeches,bipartisan_unknown_king_questions_strategic_co...,skip to content click here to sign up for the ...,"{'sadness': 0.023634135723114014, 'joy': 0.767...",0.023634,0.767483,0.019325,0.167671,0.019325,0.002562


In [29]:
#Saving new df with columns for each sentiment to output_sentiment
df.to_csv("output_sentiment.csv", index = False)
# df1, df2, df3 = np.array_split(df, 3)

In [None]:
df1.to_csv("output1.csv", index=False)
df2.to_csv("output2.csv", index=False)
df3.to_csv("output3.csv", index=False)

## Perform Topic Modeling using output_sentiment.csv

### Functions to perform: topic modeling, semantic embeddings & clustering, deeper sentiment analysis, and analysis by group

Helper function for performing modeling. Returns: topics with top words used, topic distributions, the NMF model used, and the TF-IDF Vectorizer used.
Uses tfidf vectorizer for NMF topic modeling

In [37]:
sentiment_df = pd.read_csv("output_sentiment.csv")

def perform_nmf_topic_modeling(texts, n_topics = 5, n_top_words = 10):
    #Perform NMF topic modeling
    
    vectorizer = TfidfVectorizer(max_features = 1000, stop_words = 'english', max_df = 0.8, min_df = 2)
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    nmf = NMF(n_components=n_topics, random_state=42, max_iter = 200)
    nmf.fit(tfidf_matrix)
    
    #Handle both old and new scikit-learn versions
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    
    #get features
    feature_names = vectorizer.get_feature_names_out()
    
    #get topics
    topics = {}
    for topic_idx, topic in enumerate(nmf.components_):
        top_words_idx = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics[f"Topic_{topic_idx}"] = top_words
        
    topic_distributions = nmf.transform(tfidf_matrix)
    return topics, topic_distributions, nmf, vectorizer

In [38]:
#Perform topic modeling
topics_nmf, topic_dist_nmf, nmf_model, nmf_vectorizer = perform_nmf_topic_modeling(df['text'], n_topics=5)

print("\nNMF Topics:")
for topic_name, words in topics_nmf.items():
    print(f"{topic_name}: {', '.join(words)}")

# Add dominant topic to dataframe
sentiment_df['dominant_topic_lda'] = topic_dist_nmf.argmax(axis=1)
for i in range(topic_dist_nmf.shape[1]):
    sentiment_df[f'topic_{i}_score'] = topic_dist_nmf[:, i]


NMF Topics:
Topic_0: people, going, know, think, just, want, thank, right, like, country
Topic_1: mr, speaker, act, shall, section, time, gentleman, amendment, yield, states
Topic_2: senator, 2025, trump, senate, president, washington, senators, administration, act, read
Topic_3: mr, record, congressional, house, office, page, speaker, number, president, january
Topic_4: health, care, veterans, services, families, legislation, appropriations, insurance, help, committee


Helper function for semantic embeddings and clustering. Returns an array of embeddings.

In [42]:
def get_embeddings(texts, model_name = 'sentence-transformers/all-MiniLM-L6-v2'):
    #Get semantic embeddings using Hugging Face models
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    embeddings = []
    for text in texts:
        if not isinstance(text, str) or text.strip() == "":
            embedding_dim = model.config.hidden_size
            embeddings.append(np.zeros(embedding_dim))
            continue
        
        #Tokenize and truncate
        inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
        
        #Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            #Use mean pooling
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            embeddings.append(embedding)
            
    return np.array(embeddings)

Helper function for performing k-means clustering. Returns the clusters, and k

In [43]:
#Get embeddings and perform clustering
#embeddings = get_embeddings(sentiment_df['text'].tolist())

def perform_clustering(embeddings, n_clusters = 5):
    #Perform k-means clusttering on embeddings
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(embeddings)
    return clusters, kmeans

In [44]:
#Get embeddings and perform clustering. Add cluster column to df
embeddings = get_embeddings(sentiment_df['text'].tolist())

clusters, kmeans_model = perform_clustering(embeddings, n_clusters=5)
sentiment_df['cluster'] = clusters

#Visualize clusters using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df)-1))
embeddings_2d = tsne.fit_transform(embeddings)

#Add embeddings to df
df['tsne_x'] = embeddings[:,0]
df['tsne_y'] = embeddings[:,1]

python(70551) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


: 