In [None]:
import pandas as pd
import numpy as np
import seaborn as sb

In [None]:
# Display configuration: show every float in pandas output as a whole number
# (zero decimal places). This applies to all float columns that get displayed,
# not only id-like ones.
whole_number_format = '{:.0f}'.format
pd.set_option('display.float_format', whole_number_format)

In [None]:
# Location of the combined reply-tweet export, relative to the working directory.
REPLY_TWEETS_CSV = 'replyTweets_combined_output.csv'

# NOTE(review): with default dtype inference the `id` and `replyTo` columns come
# back as float64 (see the .info() output); very large integer ids may not be
# represented exactly as floats -- confirm whether exact ids matter downstream.
combined_df = pd.read_csv(REPLY_TWEETS_CSV)

In [None]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts per column and dtypes.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282523 entries, 0 to 282522
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   282522 non-null  float64
 1   createdAt            282522 non-null  object 
 2   fullName             282483 non-null  object 
 3   userName             282522 non-null  object 
 4   profileImage         282522 non-null  object 
 5   fullText             282522 non-null  object 
 6   replyTo              282522 non-null  float64
 7   lang                 282522 non-null  object 
 8   quoteCount           282522 non-null  float64
 9   retweetCount         282522 non-null  float64
 10  replyCount           282522 non-null  float64
 11  likeCount            282522 non-null  float64
 12  viewCount            103417 non-null  float64
 13  sentimentLabel1      1790 non-null    float64
 14  sentimentLabel2      1201 non-null    float64
 15  sentimentLabel3  

In [None]:
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Encode emotion labels as integer class ids.
# Only rows that actually carry a `sentimentDetail` value are handed to the
# encoder. Rows without a label stay <NA> (nullable Int64) instead of being
# passed through fit_transform, where a missing value would either fail or be
# counted as an extra class of its own.
label_encoder = LabelEncoder()
has_emotion = combined_df['sentimentDetail'].notna()
emotion_codes = pd.Series(pd.NA, index=combined_df.index, dtype='Int64')
if has_emotion.any():
    emotion_codes.loc[has_emotion] = label_encoder.fit_transform(
        combined_df.loc[has_emotion, 'sentimentDetail']
    )
combined_df['emotion_label_encoded'] = emotion_codes

In [None]:
# Count how many rows fall into each encoded emotion class (most frequent first).
combined_df['emotion_label_encoded'].value_counts()

emotion_label_encoded
4    20733
0     6788
5     3967
6     3073
3     2763
2     1829
1     1327
7        1
Name: count, dtype: int64

In [None]:
# Keep only the rows that have a value in `sentimentLabelFinal`,
# then print a structural summary of that subset.
has_final_label = combined_df['sentimentLabelFinal'].notna()
labeled_data = combined_df.loc[has_final_label]
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 996 entries, 152428 to 153426
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   996 non-null    float64
 1   createdAt            996 non-null    object 
 2   fullName             996 non-null    object 
 3   userName             996 non-null    object 
 4   profileImage         996 non-null    object 
 5   fullText             996 non-null    object 
 6   replyTo              996 non-null    float64
 7   lang                 996 non-null    object 
 8   quoteCount           996 non-null    float64
 9   retweetCount         996 non-null    float64
 10  replyCount           996 non-null    float64
 11  likeCount            996 non-null    float64
 12  viewCount            0 non-null      float64
 13  sentimentLabel1      996 non-null    float64
 14  sentimentLabel2      996 non-null    float64
 15  sentimentLabel3      996 non-null    

In [None]:
import warnings

# Work on the full combined frame. `df` is an alias (not a copy), so the
# `emotion_label` column added below is also added to `combined_df`.
df = combined_df

# Map emotion names to integer class ids
emotion_label_map = {"anger": 0, "fear": 1, "disgust": 2, "surprise": 3, "joy": 4, "neutral": 5}
df['emotion_label'] = df['sentimentDetail'].map(emotion_label_map)

# Series.map yields NaN for every value that is not a key of the map, which
# silently turns the column into floats. Surface such labels instead of
# letting them disappear unnoticed.
unmapped_emotions = df.loc[
    df['sentimentDetail'].notna() & df['emotion_label'].isna(), 'sentimentDetail'
].unique()
if len(unmapped_emotions) > 0:
    warnings.warn(
        "sentimentDetail values missing from emotion_label_map "
        f"(mapped to NaN): {sorted(map(str, unmapped_emotions))}"
    )

# Text inputs as a plain Python list (one entry per row of df)
text_list = df['cleaned_tweet'].tolist()

# Emotion class ids as a plain Python list; entries are NaN where the label
# was missing or not covered by emotion_label_map
emotion_labels = df['emotion_label'].tolist()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from sklearn.metrics import classification_report

# Evaluate the pre-trained sentiment checkpoint as-is (no fine-tuning happens
# in this cell) on a held-out part of the labeled tweets.
df = labeled_data

# Split the data into train and test sets.
# The explicit copies make both parts independent frames, so adding columns
# below does not write into a view of `labeled_data` (chained assignment).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.copy()
test_df = test_df.copy()

# Load pre-trained RoBERTa model and tokenizer for sentiment analysis
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Map emotion labels to numerical format (values missing from the map become NaN)
emotion_label_map = {"anger": 0, "fear": 1, "disgust": 2, "surprise": 3, "joy": 4, "neutral": 5}
train_df['emotion_label'] = train_df['sentimentDetail'].map(emotion_label_map)
test_df['emotion_label'] = test_df['sentimentDetail'].map(emotion_label_map)

# Tokenize inputs for both train and test sets.
# Without an explicit max_length the tokenizer reports that it has no
# predefined maximum and skips truncation entirely, so the limit is given here.
# 512 is the usual RoBERTa-base sequence limit.
MAX_TOKENS = 512
train_encodings = tokenizer(
    list(train_df['cleaned_tweet']),
    padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_df['cleaned_tweet']),
    padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="pt",
)

# Attach emotion and sarcasm labels to the encodings. These are target labels,
# not model inputs; the inference below does not read them.
train_encodings['labels'] = torch.tensor(train_df['emotion_label'].tolist())
train_encodings['sarcasm_labels'] = torch.tensor(train_df['sarcasm'].tolist())
test_encodings['labels'] = torch.tensor(test_df['emotion_label'].tolist())
test_encodings['sarcasm_labels'] = torch.tensor(test_df['sarcasm'].tolist())

# Perform inference on the test set (single forward pass, gradients disabled)
model.eval()
with torch.no_grad():
    input_ids = test_encodings['input_ids']
    attention_mask = test_encodings['attention_mask']
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Get predicted labels: index of the highest logit per tweet
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Generate classification report.
# NOTE(review): predicted class ids are compared directly against
# `sentimentLabelFinal`; this assumes both use the same numeric encoding -- confirm.
report = classification_report(test_df['sentimentLabelFinal'].tolist(), predicted_labels)

print("Classification Report:")
print(report)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Classification Report:
              precision    recall  f1-score   support

         0.0       0.43      0.73      0.54        64
         1.0       0.68      0.44      0.54       106
         2.0       0.86      0.60      0.71        30

    accuracy                           0.56       200
   macro avg       0.66      0.59      0.59       200
weighted avg       0.63      0.56      0.56       200



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from sklearn.metrics import classification_report

# Evaluate the pre-trained sentiment checkpoint as-is (no fine-tuning happens
# in this cell) on a held-out part of the labeled tweets.
df = labeled_data

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# An empty batch is tokenized into empty tensors, for which torch infers a
# float dtype; the embedding layer then rejects them with an obscure dtype
# error. Fail early with a readable message instead.
if test_df.empty:
    raise ValueError("test_df is empty: there is nothing to run inference on")

# Load pre-trained RoBERTa model and tokenizer for sentiment analysis
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Tokenize inputs for both train and test sets.
# Without an explicit max_length the tokenizer reports that it has no
# predefined maximum and skips truncation entirely, so the limit is given here.
# 512 is the usual RoBERTa-base sequence limit.
MAX_TOKENS = 512
train_encodings = tokenizer(
    list(train_df['cleaned_tweet']),
    padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_df['cleaned_tweet']),
    padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="pt",
)

# Attach the sarcasm labels to the encodings. These are target labels, not
# model inputs; the inference below does not read them.
train_encodings['labels'] = torch.tensor(train_df['sarcasm'].tolist())
test_encodings['labels'] = torch.tensor(test_df['sarcasm'].tolist())

# Perform inference on the test set (single forward pass, gradients disabled)
model.eval()
with torch.no_grad():
    # Embedding lookups require integer indices, so make the dtype explicit.
    input_ids = test_encodings['input_ids'].long()
    attention_mask = test_encodings['attention_mask']
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Get predicted labels: index of the highest logit per tweet
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Generate classification report.
# NOTE(review): predicted class ids are compared directly against
# `sentimentLabelFinal`; this assumes both use the same numeric encoding -- confirm.
report = classification_report(test_df['sentimentLabelFinal'].tolist(), predicted_labels)

print("Classification Report:")
print(report)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)