In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, http

In [3]:
import pandas as pd
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split

# Data Pre-Processing

## Poem Data Split

In [4]:
# poem_data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/PERC_mendelly.xlsx')
poem_data = pd.read_excel('./PERC_mendelly.xlsx')
labels_to_drop = ["peace", "courage", "hate"]
indices = poem_data[poem_data["Emotion"].isin(labels_to_drop)].index
poem_data["Emotion"] = poem_data["Emotion"].replace("sad", "sadness")
poem_data.drop(indices, inplace=True)
# poem_data.to_csv("poem_cleaned.csv")
poem_data = poem_data.rename(columns={"Poem": 'text', "Emotion": 'emotion'})
poem_data

Unnamed: 0,text,emotion
0,A Tree\nA tree beside the sandy river-beach \n...,sadness
1,"Sri Krishna\n\nO immense Light and thou, O spi...",love
3,Revelation\n\n\nSomeone leaping from the rocks...,sadness
4,The Silver Call\n\n\nThere is a godhead of unr...,joy
5,Surrender\n\nO THOU of whom I am the instrumen...,love
...,...,...
711,Daughter Taken By Mothers Lies\n\nHave you any...,sadness
712,Involuntary Acceptance\n\nEven though\nWe’re f...,sadness
713,Victim Of Poverty\n\nPoverty stricken youth ju...,sadness
714,Rain\n\nI sit and watch\nas the rain falls \nf...,sadness


In [5]:
# Split the dataset into training and testing sets
poem_train_data, poem_test_data = train_test_split(poem_data, test_size=0.9, random_state=42)

# Split the training set into training and validation sets
poem_test_data, poem_val_data = train_test_split(poem_test_data, test_size=0.5, random_state=42)

In [6]:
poem_test_data.to_csv('poem_test.txt', index=False)

## Twitter Message Data Split

In [7]:
# text_data_train = pd.read_table('/content/drive/MyDrive/Colab Notebooks/train.txt', sep=';', header=None)
text_data_train = pd.read_table('./train.txt', sep=';', header=None)
text_data_train = text_data_train.rename(columns={0: 'text', 1: 'emotion'})
text_data_train.to_csv("train_cleaned.txt")
text_data_train

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [8]:
# text_data_test = pd.read_table('/content/drive/MyDrive/Colab Notebooks/test.txt', sep=';', header=None)
text_data_test = pd.read_table('./test.txt', sep=';', header=None) 
text_data_test = text_data_test.rename(columns={0: 'text', 1: 'emotion'})
text_data_test.to_csv('text_test.txt', index=False)
print(set(text_data_train["emotion"]))

{'anger', 'fear', 'joy', 'sadness', 'love', 'surprise'}


In [9]:
# text_data_val = pd.read_table('/content/drive/MyDrive/Colab Notebooks/val.txt', sep=';', header=None)
text_data_val = pd.read_table('./val.txt', sep=';', header=None)
text_data_val = text_data_val.rename(columns={0: 'text', 1: 'emotion'})
text_data_val.to_csv('text_val_cleaned.txt', index=False)
print(set(text_data_val["emotion"]))

{'surprise', 'fear', 'joy', 'sadness', 'love', 'anger'}


## Reddit Data Split

In [10]:
from datasets import load_dataset

dataset = load_dataset("go_emotions", "raw")
columns_to_remove = ["id", "author", "subreddit", "link_id", "parent_id", "created_utc", "rater_id", "example_very_unclear",
                    'admiration', 'amusement', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'gratitude', 'grief', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'neutral']

dataset_dict = dataset.remove_columns(columns_to_remove)

# Convert dataset to pandas DataFrame
df = pd.DataFrame(dataset_dict["train"])

# Count the number of 1s in each row
counts = df.iloc[:, 1:].sum(axis=1)

# Filter the rows where the count is less than or equal to 1
reddit_data = df[counts == 1]

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.11k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/raw to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
print(reddit_data)

                                                     text  anger  fear  joy  \
0                                         That game hurt.      0     0    0   
3                                      Man I love reddit.      0     0    0   
37      I just came home, what the fuck is this lineup...      0     0    0   
43      By far the coolest thing I've seen on this thr...      0     0    1   
49                     Sending love and strength vibes <3      0     0    1   
...                                                   ...    ...   ...  ...   
211187  I just called the Capitol Police. They are not...      1     0    0   
211212    What a great photo and you two look so happy. 😍      0     0    1   
211219  Well, I'm glad you're out of all that now. How...      0     0    1   
211220                             Everyone likes [NAME].      0     0    0   
211223  The FDA has plenty to criticize. But like here...      1     0    0   

        love  sadness  surprise  
0          0     

In [12]:
# Reshape the dataframe
melted_df = pd.melt(reddit_data, id_vars=['text'], var_name='emotion')

# Filter the rows where the value is 1
filtered_df = melted_df[melted_df['value'] == 1]

# Drop the 'value' column
reddit_df = filtered_df.drop('value', axis=1)

reddit_df = reddit_df.reset_index(drop=True)

reddit_df

Unnamed: 0,text,emotion
0,"""Oh, how DARE you discuss the disgustingly unh...",anger
1,Press fucken charges the moment they use your ...,anger
2,Shut up 15 year-old,anger
3,I thought it was very good. The guy was one si...,anger
4,Lol. Build your own or make some tweaks to the...,anger
...,...,...
36817,> you're probably a little bit of a closeted n...,surprise
36818,> you're probably a little bit of a closeted n...,surprise
36819,OH YEAH!!,surprise
36820,Yeh? It just changed on me last night after I ...,surprise


In [13]:
# Split the dataset into training and testing sets
reddit_train_data, reddit_test_data = train_test_split(reddit_df, test_size=0.9, random_state=42)

# Split the training set into training and validation sets
reddit_test_data, reddit_val_data = train_test_split(reddit_test_data, test_size=0.5, random_state=42)

In [14]:
reddit_test_data.to_csv('reddit_test.txt', index=False)

## Combine Train and Val

In [15]:
def combine_df(d1, d2, d3, filename):
  # Concatenate the dataframes vertically
  combined_df = pd.concat([d1, d2, d3], ignore_index=True)

  # Save the result to a new file
  combined_df.to_csv(filename, index=False)

In [16]:
combine_df(text_data_train, poem_train_data, reddit_train_data, "final_train.txt")

In [17]:
combine_df(text_data_val, poem_val_data, reddit_val_data, "final_val.txt")

# Model - RoBERTa 

In [18]:
!pip install -U transformers
!pip install -U datasets

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from datasets import load_metric
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Prepare Dataset for Training or Testing

In [19]:
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

def get_ds(filename):
  # Load the cleaned poem dataset
  train_data = pd.read_csv(filename, delimiter=',')
  train_texts = train_data["text"].tolist()
  train_labels = train_data["emotion"].tolist()

  # Tokenize the cleaned poem dataset
  model_name = "roberta-base"
  tokenizer = RobertaTokenizer.from_pretrained(model_name)
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")

  # Encode the labels for the cleaned poem dataset
  encoder = LabelBinarizer()
  train_labels = encoder.fit_transform(train_labels)
  train_tensor = torch.tensor(train_labels).argmax(-1)

  # Create the dataset
  train_ds = EmotionDataset(train_encodings, train_tensor)

  label_to_emotion = {i: label for i, label in enumerate(encoder.classes_)}

  return train_ds, label_to_emotion

## Test Model

In [20]:
def test_model(filepath):
  # Load the saved fine-tuned model and tokenizer
  model_test = RobertaForSequenceClassification.from_pretrained(filepath)
  tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

  # Define the compute_metrics function for calculating accuracy
  def compute_metrics(pred):
      labels = pred.label_ids
      preds = pred.predictions.argmax(-1)
      acc = accuracy_score(labels, preds)
      return {"accuracy": acc}

  # Create a Trainer object with the loaded model
  test_model = Trainer(
      model=model_test,
      compute_metrics=compute_metrics,
  )

  # Evaluate the model on the test dataset
  test_ds, label_to_emotion = get_ds("./text_test.txt")
  test_results = test_model.evaluate(test_ds)
  print("Test accuracy:", test_results["eval_accuracy"])
  get_label_wise(test_model, test_ds, label_to_emotion)

  print(" -----------------------------POEM DATASET--------------------------------------- ")
  poem_test_ds, label_to_emotion = get_ds("./poem_test.txt")
  poem_test_results = test_model.evaluate(poem_test_ds)
  print("Test accuracy:", poem_test_results["eval_accuracy"])
  get_label_wise(test_model, poem_test_ds, label_to_emotion)

  print(" -----------------------------REDDIT DATASET--------------------------------------- ")
  reddit_test_ds, label_to_emotion = get_ds("./reddit_test.txt")
  reddit_test_results = test_model.evaluate(reddit_test_ds)
  print("Test accuracy:", reddit_test_results["eval_accuracy"])
  get_label_wise(test_model, reddit_test_ds, label_to_emotion)

In [28]:
def get_label_wise(trainer, dataset, label_to_emotion):
  # # Perform predictions on the test dataset
  predictions = trainer.predict(dataset)
  
  # Get the predicted class indices
  predicted_class_indices = torch.argmax(torch.tensor(predictions.predictions), dim=-1)
  unique_labels = set(label_to_emotion.values())
  unique_labels = sorted(list(unique_labels))
  # emotion_to_label = {emotion: label for label, emotion in label_to_emotion.items()}
  print(unique_labels)

  # Initialize dictionaries to store the true positive, false negative, and false positive counts for each label
  true_positive_count = {label: 0 for label in unique_labels}
  false_negative_count = {label: 0 for label in unique_labels}
  false_positive_count = {label: 0 for label in unique_labels}

  # # Calculate the true positive, false negative, and false positive counts for each label
  for true_label_idx, predicted_label_idx in zip(dataset.labels, predicted_class_indices):
    true_label = label_to_emotion[int(true_label_idx)]
    predicted_label = label_to_emotion[int(predicted_label_idx)]
    
    if true_label == predicted_label:
        true_positive_count[true_label] += 1
    else:
        false_negative_count[true_label] += 1
        false_positive_count[predicted_label] += 1

  # Calculate the true positive rate (sensitivity) and positive predictive value (precision) for each label
  label_wise_ppv = {
      label: (true_positive_count[label] / (true_positive_count[label] + false_negative_count[label])) * 100
      for label in unique_labels
  }

  label_wise_tpr = {
      label: (true_positive_count[label] / (true_positive_count[label] + false_positive_count[label])) * 100
      for label in unique_labels if (true_positive_count[label] + false_positive_count[label]) > 0
  }

  for label in unique_labels:
    if label not in label_wise_ppv:
      label_wise_ppv[label] = 0

  # Print the true positive rate (sensitivity) and positive predictive value (precision)
  print("Label-wise true positive rate:")
  for label, tpr in label_wise_tpr.items():
      print(f"{label}: {tpr:.2f}%")

  print("\nLabel-wise accuracy:")
  for label, ppv in label_wise_ppv.items():
      print(f"{label}: {ppv:.2f}%")
  
  # Calculate and print the confusion matrix
  true_labels = [label_to_emotion[int(label_idx)] for label_idx in dataset.labels]
  predicted_labels = [label_to_emotion[int(label_idx)] for label_idx in predicted_class_indices]
  cm = confusion_matrix(true_labels, predicted_labels, labels=list(unique_labels))
  print("\nConfusion matrix:")
  print(cm)

## Train Model

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

### Only Twitter Message Dataset


In [None]:
train_ds = get_ds("./train_cleaned.txt")
val_ds = get_ds("./text_val_cleaned.txt")
model1 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

trainer1 = Trainer(
    model=model1,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer1.train()

model1.save_pretrained('/content/drive/MyDrive/Models/roberta_model1')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Step,Training Loss,Validation Loss,Accuracy
1000,0.5017,0.223983,0.935
2000,0.1831,0.14971,0.938
3000,0.1211,0.17535,0.939


### Train Combined Dataset

In [None]:
combined_train_ds = get_ds("./final_train.txt")
combined_val_ds = get_ds("./final_val.txt")

model2 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=combined_train_ds,
    eval_dataset=combined_val_ds,
    compute_metrics=compute_metrics,
)

trainer2.train()

model2.save_pretrained('/content/drive/MyDrive/Models/roberta_model2')

## Results

### When Trained only on Twitter Message Data

In [29]:
test_model('/content/drive/MyDrive/Models/roberta_model1')

Test accuracy: 0.93
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 92.14%
fear: 88.41%
joy: 97.70%
love: 74.53%
sadness: 96.42%
surprise: 97.22%

Label-wise accuracy:
anger: 93.82%
fear: 91.96%
joy: 91.80%
love: 99.37%
sadness: 97.25%
surprise: 53.03%

Confusion matrix:
[[258   5   3   0   9   0]
 [  9 206   0   0   9   0]
 [  3   0 638  52   1   1]
 [  0   0   1 158   0   0]
 [ 10   4   1   1 565   0]
 [  0  18  10   1   2  35]]
 -----------------------------POEM DATASET--------------------------------------- 


Test accuracy: 0.27800829875518673
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 15.38%
fear: 4.76%
joy: 29.31%
love: 39.34%
sadness: 36.73%
surprise: 40.00%

Label-wise accuracy:
anger: 16.67%
fear: 20.00%
joy: 32.08%
love: 36.92%
sadness: 22.50%
surprise: 22.22%

Confusion matrix:
[[ 4  2  2  3 13  0]
 [ 2  2  1  2  3  0]
 [ 3 12 17 13  5  3]
 [ 2 13 18 24  8  0]
 [13 11 20 18 18  0]
 [ 2  2  0  1  2  2]]
 -----------------------------REDDIT DATASET--------------------------------------- 


Test accuracy: 0.4644538322269161
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 41.15%
fear: 36.53%
joy: 43.91%
love: 66.31%
sadness: 54.54%
surprise: 60.56%

Label-wise accuracy:
anger: 66.45%
fear: 50.90%
joy: 62.09%
love: 32.05%
sadness: 41.43%
surprise: 18.54%

Confusion matrix:
[[2327  224  418   78  420   35]
 [ 228  651  125   19  251    5]
 [ 544  161 2046  288   89  167]
 [1022  130  977 1067   93   40]
 [ 710  329  486   97 1172   35]
 [ 824  287  608   60  124  433]]


### When trained on 10% of Poem and Reddit Dataset

In [30]:
test_model('/content/drive/MyDrive/Models/roberta_model')

Test accuracy: 0.93
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 94.42%
fear: 86.29%
joy: 94.98%
love: 83.23%
sadness: 95.39%
surprise: 94.87%

Label-wise accuracy:
anger: 92.36%
fear: 95.54%
joy: 95.25%
love: 84.28%
sadness: 96.21%
surprise: 56.06%

Confusion matrix:
[[254   7   1   0  13   0]
 [  4 214   0   0   5   1]
 [  2   0 662  25   5   1]
 [  1   0  23 134   1   0]
 [  8   8   4   2 559   0]
 [  0  19   7   0   3  37]]
 -----------------------------POEM DATASET--------------------------------------- 


Test accuracy: 0.42323651452282157
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 14.29%
fear: 5.88%
joy: 48.57%
love: 47.12%
sadness: 46.03%
surprise: 50.00%

Label-wise accuracy:
anger: 8.33%
fear: 10.00%
joy: 32.08%
love: 75.38%
sadness: 36.25%
surprise: 44.44%

Confusion matrix:
[[ 2  1  0  6 14  1]
 [ 1  1  0  4  4  0]
 [ 1  5 17 17 11  2]
 [ 1  4  6 49  5  0]
 [ 8  6 10 26 29  1]
 [ 1  0  2  2  0  4]]
 -----------------------------REDDIT DATASET--------------------------------------- 


Test accuracy: 0.7307785153892576
['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
Label-wise true positive rate:
anger: 66.96%
fear: 63.30%
joy: 75.20%
love: 79.34%
sadness: 78.11%
surprise: 71.99%

Label-wise accuracy:
anger: 78.36%
fear: 68.65%
joy: 68.38%
love: 88.62%
sadness: 61.93%
surprise: 65.58%

Confusion matrix:
[[2744  157  138   68  198  197]
 [ 209  878   47   17   73   55]
 [ 195   55 2253  493   90  209]
 [  81   18  200 2950   43   37]
 [ 547  186  133  113 1752   98]
 [ 322   93  225   77   87 1532]]
