# 1. Load Dataset

In [2]:
import pandas as pd

file_path = "../data/dataset.json"
df = pd.read_json(file_path)

df.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


# 2. Run Sentiment Analysis to Get Labels and Confidence Scores

In [3]:
from transformers import pipeline

# Load the sentiment analysis pipeline with the Twitter-specific model
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# Apply the pipeline to the 'text' column
results = df['text'].apply(sentiment_pipeline)

# Extract the label and score for each result and add them as new columns
df['sentiment_label'] = results.apply(lambda x: x[0]['label'])
df['sentiment_score'] = results.apply(lambda x: x[0]['score'])

# Display the updated DataFrame
df.head()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


Unnamed: 0,timestamp,text,text_id,user,user_id,sentiment_label,sentiment_score
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,positive,0.781216
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,negative,0.882852
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642,negative,0.551017
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,positive,0.747033
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936,negative,0.943839


In [4]:
# Save the DataFrame to a CSV file
output_csv_path = "../data/sentiment_analysis_results.csv"  # Specify your file name
df.to_csv(output_csv_path, index=False)

print(f"DataFrame saved to {output_csv_path}")


DataFrame saved to ../data/sentiment_analysis_results.csv


# 3. Separate by Sentiment Label

In [6]:
# Create separate lists for each sentiment
positive_scores = df.loc[df['sentiment_label'] == 'positive', 'sentiment_score'].tolist()
neutral_scores = df.loc[df['sentiment_label'] == 'neutral', 'sentiment_score'].tolist()
negative_scores = df.loc[df['sentiment_label'] == 'negative', 'sentiment_score'].tolist()

# Print the results for verification
print(f"Number of Positive Scores: {len(positive_scores)}")
print(f"Number of Neutral Scores: {len(neutral_scores)}")
print(f"Number of Negative Scores: {len(negative_scores)}")

Number of Positive Scores: 39298
Number of Neutral Scores: 17935
Number of Negative Scores: 13027


# 4. Apply K-Means to Each Group

In [7]:
from sklearn.cluster import KMeans
import numpy as np

# Define a function to apply K-Means clustering
def apply_kmeans(scores, n_clusters=3):
    scores_array = np.array(scores).reshape(-1, 1)  # Reshape for K-Means
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)  # Initialize K-Means
    clusters = kmeans.fit_predict(scores_array)  # Fit and predict clusters
    return clusters, kmeans.cluster_centers_

# Apply K-Means to each sentiment group
positive_clusters, positive_centers = apply_kmeans(positive_scores)
neutral_clusters, neutral_centers = apply_kmeans(neutral_scores)
negative_clusters, negative_centers = apply_kmeans(negative_scores)

# Display cluster centers for verification
print(f"Positive Cluster Centers: {sorted(positive_centers.flatten())}")
print(f"Neutral Cluster Centers: {sorted(neutral_centers.flatten())}")
print(f"Negative Cluster Centers: {sorted(negative_centers.flatten())}")


Positive Cluster Centers: [np.float64(0.5943230856781043), np.float64(0.8043295104286502), np.float64(0.9542473458008959)]
Neutral Cluster Centers: [np.float64(0.5408110465505729), np.float64(0.6979260955658485), np.float64(0.8720885476951947)]
Negative Cluster Centers: [np.float64(0.5576972346155221), np.float64(0.7310676325062735), np.float64(0.8803989664225644)]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# 5. Combine Sentiment Label with Strength

In [8]:
# Define a function to map clusters to Weak, Mid, Strong
def assign_cluster_labels(scores, clusters, centers):
    # Map each cluster to Weak, Mid, or Strong based on sorted cluster centers
    sorted_centers = sorted((val, idx) for idx, val in enumerate(centers.flatten()))
    cluster_mapping = {sorted_centers[0][1]: "Weak",
                       sorted_centers[1][1]: "Mid",
                       sorted_centers[2][1]: "Strong"}

    # Map cluster assignments to labels
    cluster_labels = [cluster_mapping[c] for c in clusters]
    return cluster_labels

# Assign labels for each sentiment group
positive_labels = assign_cluster_labels(positive_scores, positive_clusters, positive_centers)
neutral_labels = assign_cluster_labels(neutral_scores, neutral_clusters, neutral_centers)
negative_labels = assign_cluster_labels(negative_scores, negative_clusters, negative_centers)

# Add cluster labels back to the original DataFrame
df['strength'] = None  # Initialize the 'strength' column

df.loc[df['sentiment_label'] == 'positive', 'strength'] = positive_labels
df.loc[df['sentiment_label'] == 'neutral', 'strength'] = neutral_labels
df.loc[df['sentiment_label'] == 'negative', 'strength'] = negative_labels

# Display a few rows to verify
print("Updated DataFrame with strength labels:")
display(df[['text', 'sentiment_label', 'sentiment_score', 'strength']].head())


Updated DataFrame with strength labels:


Unnamed: 0,text,sentiment_label,sentiment_score,strength
0,Running a business means juggling countless ad...,positive,0.781216,Mid
1,Liz Truss is walking in the lingering shadow o...,negative,0.882852,Strong
2,The UK is bracing for war as government buildi...,negative,0.551017,Weak
3,Marrying a second or third cousin once removed...,positive,0.747033,Mid
4,It's truly disgraceful how the Indian National...,negative,0.943839,Strong


In [11]:
output_csv_path = "../data/strength_dataset.csv"  # Specify the desired file path

# Save the updated DataFrame to a CSV file
df.to_csv(output_csv_path, index=False)

## Classyfiying the texts into topics

In [12]:
df = pd.read_csv("../data/strength_dataset.csv")

# Initialize the topic classification pipeline
topic_pipeline = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", truncation=True)

# Classify tweets into topics using the topic classification pipeline
df['topic'] = df['text'].apply(lambda x: topic_pipeline(x)[0]['label'])

Device set to use mps:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
# Save the updated DataFrame with the topic labels to a CSV file
output_csv_path = "../data/sentiment_analysis_with_topics.csv"  # Specify the output file path

# Save the DataFrame to a CSV file
df.to_csv(output_csv_path, index=False)

# 6. Visualize the Results

# 7. Hypothesis

## Users with extreme sentiments (strongly positive or negative) occupy central positions in the network.

In [9]:
import pandas as pd
import networkx as nx

# Load the social network edge list
graph_df = pd.read_csv("../data/graph.csv")

# Create a graph using NetworkX
G = nx.from_pandas_edgelist(graph_df, source='source', target='target', edge_attr='weight', create_using=nx.Graph())

# Calculate centrality measures
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)

# Convert centrality dictionaries to DataFrame
centrality_df = pd.DataFrame({
    'user_id': degree_centrality.keys(),
    'degree_centrality': degree_centrality.values(),
    'betweenness_centrality': betweenness_centrality.values(),
    'eigenvector_centrality': eigenvector_centrality.values()
})

# Load the sentiment analysis results
sentiment_df = pd.read_csv("../data/sentiment_analysis_results.csv")

# Merge centrality data with sentiment data
combined_df = pd.merge(sentiment_df, centrality_df, on='user_id', how='inner')

# Display the combined dataset
print("Combined dataset for hypothesis testing:")
display(combined_df.head())


PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')

In [None]:
output_csv_path = "../data/combined_dataset.csv"

# Save the merged DataFrame to a CSV file
combined_df.to_csv(output_csv_path, index=False)

In [None]:
# Filter for strong sentiments
extreme_sentiments = combined_df[
    (combined_df['strength'] == 'Strong') &
    (combined_df['sentiment_label'].isin(['positive', 'negative']))
]

# Display the filtered dataset
print("Filtered dataset with extreme sentiments:")
display(extreme_sentiments.head())


In [None]:
from scipy.stats import pearsonr, spearmanr

# Correlate degree centrality with extreme sentiment
pearson_degree, _ = pearsonr(extreme_sentiments['degree_centrality'], extreme_sentiments['sentiment_score'])
spearman_degree, _ = spearmanr(extreme_sentiments['degree_centrality'], extreme_sentiments['sentiment_score'])

# Correlate eigenvector centrality with extreme sentiment
pearson_eigen, _ = pearsonr(extreme_sentiments['eigenvector_centrality'], extreme_sentiments['sentiment_score'])
spearman_eigen, _ = spearmanr(extreme_sentiments['eigenvector_centrality'], extreme_sentiments['sentiment_score'])

# Print correlation results
print(f"Pearson Correlation (Degree Centrality): {pearson_degree}")
print(f"Spearman Correlation (Degree Centrality): {spearman_degree}")
print(f"Pearson Correlation (Eigenvector Centrality): {pearson_eigen}")
print(f"Spearman Correlation (Eigenvector Centrality): {spearman_eigen}")
