In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import torch.nn as nn
from transformers import RobertaModel
from tqdm import tqdm


In [2]:
# Set display options
pd.set_option('display.float_format', '{:.0f}'.format)

In [3]:
combined_df = pd.read_csv('replyTweets_combined_output.csv')

In [4]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287188 entries, 0 to 287187
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   287187 non-null  float64
 1   createdAt            287188 non-null  object 
 2   fullName             287149 non-null  object 
 3   userName             287188 non-null  object 
 4   profileImage         287188 non-null  object 
 5   fullText             287188 non-null  object 
 6   replyTo              287188 non-null  int64  
 7   lang                 287188 non-null  object 
 8   quoteCount           287188 non-null  int64  
 9   retweetCount         287188 non-null  int64  
 10  replyCount           287188 non-null  int64  
 11  likeCount            287188 non-null  int64  
 12  viewCount            103504 non-null  float64
 13  sentimentLabel1      1790 non-null    float64
 14  sentimentLabel2      1201 non-null    float64
 15  sentimentLabel3  

In [5]:
labeled_data = combined_df[combined_df['sentimentLabelFinal'].notnull()]
labeled_data = labeled_data[labeled_data['cleaned_tweet_vader'].notnull()]
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 987 entries, 152428 to 153426
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   987 non-null    float64
 1   createdAt            987 non-null    object 
 2   fullName             987 non-null    object 
 3   userName             987 non-null    object 
 4   profileImage         987 non-null    object 
 5   fullText             987 non-null    object 
 6   replyTo              987 non-null    int64  
 7   lang                 987 non-null    object 
 8   quoteCount           987 non-null    int64  
 9   retweetCount         987 non-null    int64  
 10  replyCount           987 non-null    int64  
 11  likeCount            987 non-null    int64  
 12  viewCount            0 non-null      float64
 13  sentimentLabel1      987 non-null    float64
 14  sentimentLabel2      987 non-null    float64
 15  sentimentLabel3      987 non-nul

In [6]:
labeled_data['sentimentLabelFinal'].value_counts()

1    490
0    354
2    143
Name: sentimentLabelFinal, dtype: int64

In [9]:
# oversample
X = labeled_data[['cleaned_tweet', 'cleaned_tweet_vader', 'sentimentDetail', 'sarcasm']]
y = labeled_data['sentimentLabelFinal']

# Initialize RandomOverSampler
ros = RandomOverSampler()

X_resampled, y_resampled = ros.fit_resample(X, y)

resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['sentimentLabelFinal'] = y_resampled

print(resampled_df['sentimentLabelFinal'].value_counts())


1    490
0    490
2    490
Name: sentimentLabelFinal, dtype: int64


In [10]:
# emotion label mapping to int
emotion_label_map = {"anger": 0, "fear": 1, "disgust": 2, "surprise": 3, "joy": 4, "neutral": 5, "sadness": 6}
resampled_df['emotion_label'] = resampled_df['sentimentDetail'].map(emotion_label_map)

In [12]:
# adding emotion labels and sarcasm labels as input features to train sentiment classification model

train_df, test_df = train_test_split(resampled_df, test_size=0.2, random_state=42)

# Load pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Tokenize inputs for both train and test sets
train_encodings = tokenizer(list(train_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(list(test_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")

# Add emotion and sarcasm labels as input features
train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
train_emotions = torch.tensor(train_df['emotion_label'].tolist(), dtype=torch.long)  # Assuming emotion is encoded as integers
train_sarcasms = torch.tensor(train_df['sarcasm'].tolist(), dtype=torch.long)  # Assuming sarcasm is encoded as integers
test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
test_emotions = torch.tensor(test_df['emotion_label'].tolist(), dtype=torch.long)
test_sarcasms = torch.tensor(test_df['sarcasm'].tolist(), dtype=torch.long)

# Define TensorDatasets for training and test
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels, train_emotions, train_sarcasms)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels, test_emotions, test_sarcasms)

# Define training parameters
batch_size = 32
epochs = 3
learning_rate = 5e-5

# Define model architecture
class CustomRobertaForSequenceClassification(nn.Module):
    def __init__(self, num_labels=3):
        super(CustomRobertaForSequenceClassification, self).__init__()
        self.roberta = RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.classifier = nn.Linear(self.roberta.config.hidden_size + 2, num_labels)  # Additional 2 for emotion and sarcasm

    def forward(self, input_ids, attention_mask, emotion, sarcasm):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = torch.cat((pooled_output, emotion.unsqueeze(1), sarcasm.unsqueeze(1)), dim=1)  # Concatenate emotion and sarcasm
        logits = self.classifier(pooled_output)
        return logits

model = CustomRobertaForSequenceClassification(num_labels=3)

optimizer = AdamW(model.parameters(), lr=learning_rate)

loss_fn = torch.nn.CrossEntropyLoss()

# Prepare DataLoader for training and test
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training loop with progress bar
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch+1}/{epochs}')
    for step, batch in progress_bar:
        input_ids, attention_mask, labels, emotions, sarcasms = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, emotions, sarcasms)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.set_postfix({'training_loss': train_loss / (step + 1)})

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels, emotions, sarcasms = batch
            logits = model(input_ids, attention_mask, emotions, sarcasms)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

  # Print final classification report after all epochs
print()
print("Enhanced model Classification Report:")
print(classification_report(all_labels, all_preds))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
  test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████████████████████████████████████████████| 37/37 [07:33<00:00, 12.25s/it, training_loss=0.767]
Epoch 2/3: 100%|██████████████████████████████████████████████████| 37/37 [07:25<00:00, 12.05s/it, training_loss=0.372]
Epoch 3/3: 100%|██████████████████████████████████████████████████| 37/37 [07:13<00:00, 11.72s/it, training_loss=0.176]



Enhanced model Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       107
           1       0.71      0.65      0.68        97
           2       0.81      0.96      0.88        90

    accuracy                           0.78       294
   macro avg       0.78      0.78      0.78       294
weighted avg       0.78      0.78      0.77       294



In [13]:
# adding sarcasm labels as input features to train sentiment classification model

train_df, test_df = train_test_split(resampled_df, test_size=0.2, random_state=42)

# Load pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Tokenize inputs for both train and test sets
train_encodings = tokenizer(list(train_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(list(test_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")

# Add sarcasm labels as input features
train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
train_sarcasms = torch.tensor(train_df['sarcasm'].tolist(), dtype=torch.long)  # Assuming sarcasm is encoded as integers
test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
test_sarcasms = torch.tensor(test_df['sarcasm'].tolist(), dtype=torch.long)

# Define TensorDatasets for training and test
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels, train_sarcasms)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels, test_sarcasms)

# Define training parameters
batch_size = 32
epochs = 3
learning_rate = 5e-5

# Define model architecture
class CustomRobertaForSequenceClassification(nn.Module):
    def __init__(self, num_labels=3):
        super(CustomRobertaForSequenceClassification, self).__init__()
        self.roberta = RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.classifier = nn.Linear(self.roberta.config.hidden_size + 1, num_labels)  # Additional 1 for sarcasm

    def forward(self, input_ids, attention_mask, sarcasm):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = torch.cat((pooled_output, sarcasm.unsqueeze(1)), dim=1)  # Concatenate sarcasm
        logits = self.classifier(pooled_output)
        return logits

model = CustomRobertaForSequenceClassification(num_labels=3)

optimizer = AdamW(model.parameters(), lr=learning_rate)

loss_fn = torch.nn.CrossEntropyLoss()

# Prepare DataLoader for training and test
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training loop with progress bar
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch+1}/{epochs}')
    for step, batch in progress_bar:
        input_ids, attention_mask, labels, sarcasms = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, sarcasms)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.set_postfix({'training_loss': train_loss / (step + 1)})

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels, sarcasms = batch
            logits = model(input_ids, attention_mask, sarcasms)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

# Print final classification report after all epochs
print()
print("Sarcasm Enhanced Classification Report:")
print(classification_report(all_labels, all_preds))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
  test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████████████████████████████████████████████| 37/37 [07:15<00:00, 11.76s/it, training_loss=0.718]
Epoch 2/3: 100%|██████████████████████████████████████████████████| 37/37 [07:07<00:00, 11.56s/it, training_loss=0.348]
Epoch 3/3: 100%|██████████████████████████████████████████████████| 37/37 [07:03<00:00, 11.44s/it, training_loss=0.178]



Sarcasm Enhanced Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76       107
           1       0.70      0.64      0.67        97
           2       0.87      0.92      0.90        90

    accuracy                           0.78       294
   macro avg       0.78      0.78      0.78       294
weighted avg       0.77      0.78      0.77       294



In [14]:
# adding emotion labels as input features to train sentiment classification model

train_df, test_df = train_test_split(resampled_df, test_size=0.2, random_state=42)

# Load pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Tokenize inputs for both train and test sets
train_encodings = tokenizer(list(train_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(list(test_df['cleaned_tweet']), padding=True, truncation=True, return_tensors="pt")

# Add emotion labels as input features
train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
train_emotions = torch.tensor(train_df['emotion_label'].tolist(), dtype=torch.long)  # Assuming emotion is encoded as integers
test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
test_emotions = torch.tensor(test_df['emotion_label'].tolist(), dtype=torch.long)

# Define TensorDatasets for training and test
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels, train_emotions)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels, test_emotions)

# Define training parameters
batch_size = 32
epochs = 3
learning_rate = 5e-5

# Define model architecture
class CustomRobertaForSequenceClassification(nn.Module):
    def __init__(self, num_labels=3):
        super(CustomRobertaForSequenceClassification, self).__init__()
        self.roberta = RobertaModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.classifier = nn.Linear(self.roberta.config.hidden_size + 1, num_labels)  # Additional 1 for emotion

    def forward(self, input_ids, attention_mask, emotion):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = torch.cat((pooled_output, emotion.unsqueeze(1)), dim=1)  # Concatenate emotion
        logits = self.classifier(pooled_output)
        return logits

model = CustomRobertaForSequenceClassification(num_labels=3)

optimizer = AdamW(model.parameters(), lr=learning_rate)

loss_fn = torch.nn.CrossEntropyLoss()

# Prepare DataLoader for training and test
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training loop with progress bar
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch+1}/{epochs}')
    for step, batch in progress_bar:
        input_ids, attention_mask, labels, emotions = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, emotions)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.set_postfix({'training_loss': train_loss / (step + 1)})

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels, emotions = batch
            logits = model(input_ids, attention_mask, emotions)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

# Print final classification report after all epochs
print()
print("Emotion Enhanced Classification Report:")
print(classification_report(all_labels, all_preds))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  train_labels = torch.tensor(train_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
  test_labels = torch.tensor(test_df['sentimentLabelFinal'].tolist(), dtype=torch.long)
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|████████████████████████████████████████████████████| 37/37 [06:59<00:00, 11.34s/it, training_loss=0.7]
Epoch 2/3: 100%|██████████████████████████████████████████████████| 37/37 [07:02<00:00, 11.42s/it, training_loss=0.388]
Epoch 3/3: 100%|██████████████████████████████████████████████████| 37/37 [07:00<00:00, 11.37s/it, training_loss=0.203]



Emotion Enhanced Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       107
           1       0.72      0.53      0.61        97
           2       0.79      0.94      0.86        90

    accuracy                           0.76       294
   macro avg       0.75      0.76      0.75       294
weighted avg       0.75      0.76      0.75       294

