In [1]:
import numpy as np
import pandas as pd
import time
import math

In [2]:
twitter_data = pd.read_csv("samp_twitter_sentiment2.csv")

In [3]:
# Group the data by the 'Search_Key-Words' column, and select the first 1000 rows of each group
subset_twitter_data = twitter_data.groupby('Search_Key-Words').head(1000)

# Select only the 'text' and 'Search_Key-Words' columns
subset_twitter_data['date'] = pd.to_datetime(twitter_data['created_at'], format='%Y-%m-%d %H:%M:%S')

subset_twitter_data = subset_twitter_data.loc[:, ['text', 'Search_Key-Words', 'date']]
subset_twitter_data = subset_twitter_data[~subset_twitter_data['Search_Key-Words'].str.contains('\+')]
subset_twitter_data = subset_twitter_data[~subset_twitter_data['Search_Key-Words'].str.contains('ukriane')]

# Convert the DataFrame to a list of tuples
data_list = [tuple(x) for x in subset_twitter_data.to_records(index=False)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_twitter_data['date'] = pd.to_datetime(twitter_data['created_at'], format='%Y-%m-%d %H:%M:%S')


In [4]:
len(data_list)

5000

In [5]:
from transformers import pipeline

# Load the sentiment analysis pipeline
classifier = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment-latest')

data_list_with_sentiment = []

start = time.time()

# Perform sentiment analysis on each text string
for triplet in data_list:
    result = classifier(triplet[0])
    
    data_list_with_sentiment.append((triplet[0], triplet[1], triplet[2], result[0]["label"], result[0]["score"]))
    
end = time.time()

print("The time to apply the RoBERTa model: " + str(math.ceil(end - start)) + " seconds")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The time to apply the RoBERTa model: 311 seconds


In [6]:
# Same process as before but with distilbert
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

data_list_with_sentiment_distilbert = []

start = time.time()

for triplet in data_list:
    result = classifier(triplet[0])
    
    data_list_with_sentiment_distilbert.append((triplet[0], triplet[1], triplet[2], result[0]["label"], result[0]["score"]))
    
end = time.time()

print("The time to apply the DistilBERT model: " + str(math.ceil(end - start)) + " seconds")

The time to apply the DistilBERT model: 153 seconds


In [7]:
import plotly.express as px

In [22]:
df = pd.DataFrame(data_list_with_sentiment, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

# Group the data by Search_Key-Words and sentiment, and count the number of occurrences for each group
grouped_data = df.groupby(['Search_Key-Words', 'sentiment']).size().reset_index(name='count')

# Plot a bar chart of the sentiment counts, grouped by Search_Key-Words
fig = px.bar(grouped_data, x='Search_Key-Words', y='count', color='sentiment',
             title="Sentiment Analysis Results using CardiffNLP's RoBERTa Twitter Sentiment Model")

fig.update_layout(xaxis_title='Key Words', yaxis_title='Count')

fig.update_traces(width=0.5)
fig.update_layout(xaxis={'tickangle': 45})

fig.show()

In [21]:
# Define a function to categorize sentiment scores into confidence levels
def categorize_score(score):
    if score >= 0.9:
        return '90% Confidence'
    elif score >= 0.8:
        return '80% Confidence'
    elif score >= 0.7:
        return '70% Confidence'
    else:
        return 'Low Confidence'

df = pd.DataFrame(data_list_with_sentiment, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

# Apply the function to create a new column 'confidence_level'
df['confidence_level'] = df['score'].apply(categorize_score)

# Group the data by Search_Key-Words and sentiment, and count the number of occurrences for each group
grouped_data = df.groupby(['sentiment', 'confidence_level']).size().reset_index(name='count')

# Plot a bar chart of the sentiment counts, grouped by Search_Key-Words
fig = px.bar(grouped_data, x='confidence_level', y='count', color='sentiment',
             title="CardiffNLP's RoBERTa Model Categorized by Confidence Level")

fig.update_layout(xaxis_title='Key Words', yaxis_title='Count')

fig.update_traces(width=0.5)
fig.update_layout(xaxis={'tickangle': 45})

fig.show()

In [20]:
df = pd.DataFrame(data_list_with_sentiment, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

grouped_data = df.groupby(['sentiment', pd.Grouper(key='date', freq='D')]).size().reset_index(name='count')
pivoted_data = grouped_data.pivot(index='date', columns='sentiment', values='count').fillna(0)

fig = px.line(pivoted_data, x=pivoted_data.index, y=['positive', 'negative', 'neutral'])
fig.update_layout(xaxis_title='Date', yaxis_title='Count', title="Sentiment Analysis Results over Time using CardiffNLP's RoBERTa Twitter Sentiment Model")

fig.show()


In [19]:
df = pd.DataFrame(data_list_with_sentiment_distilbert, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

# Group the data by Search_Key-Words and sentiment, and count the number of occurrences for each group
grouped_data = df.groupby(['Search_Key-Words', 'sentiment']).size().reset_index(name='count')

# Plot a bar chart of the sentiment counts, grouped by Search_Key-Words
fig2 = px.bar(grouped_data, x='Search_Key-Words', y='count', color='sentiment', 
             title="Sentiment Analysis Results using a DistilBERT model fine-tuned on the Stanford Sentiment Treebank (SST-2)")

fig2.update_layout(xaxis_title='Key Words', yaxis_title='Count')

fig2.update_traces(width=0.5)
fig2.update_layout(xaxis={'tickangle': 45})

fig2.show()

In [18]:
df = pd.DataFrame(data_list_with_sentiment_distilbert, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

# Apply the function to create a new column 'confidence_level'
df['confidence_level'] = df['score'].apply(categorize_score)

# Group the data by Search_Key-Words and sentiment, and count the number of occurrences for each group
grouped_data = df.groupby(['sentiment', 'confidence_level']).size().reset_index(name='count')

# Plot a bar chart of the sentiment counts, grouped by Search_Key-Words
fig = px.bar(grouped_data, x='confidence_level', y='count', color='sentiment',
             title="DistilBERT Model Categorized by Confidence Level")

fig.update_layout(xaxis_title='Key Words', yaxis_title='Count')

fig.update_traces(width=0.5)
fig.update_layout(xaxis={'tickangle': 45})


fig.show()

In [17]:
df = pd.DataFrame(data_list_with_sentiment_distilbert, columns=['text', 'Search_Key-Words', 'date', 'sentiment', 'score'])

grouped_data = df.groupby(['sentiment', pd.Grouper(key='date', freq='D')]).size().reset_index(name='count')
pivoted_data = grouped_data.pivot(index='date', columns='sentiment', values='count').fillna(0)

fig = px.line(pivoted_data, x=pivoted_data.index, y=['POSITIVE', 'NEGATIVE'])
fig.update_layout(xaxis_title='Date', yaxis_title='Count', title="Sentiment Analysis Results over Time using DistilBERT Sentiment Model")

fig.show()