# GoEmotions train (baseline) experiment

In [1]:
from datasets import Dataset, load_dataset, DatasetDict, ClassLabel
import numpy as np
import torch
import pandas as pd

#Seeding for deterministic results i.e. showing same output 
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED) 
   torch.backends.cudnn.deterministic = True  
   torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
comfunctions_basic = 'comfunct_basic.txt'
df = pd.read_csv(comfunctions_basic, delimiter=";")
df = df[~df['function'].isin(['NOT_INTERESTED', 'BORED','NOT_IMPORTANT', 'REFUSE', 'WARN', 
                              'COMPLAIN', 'THREATEN', 'UNWILLING', 'DISTRUST' ])].rename(columns={"function": "functions"})


In [3]:
unique_labels = df["functions"].unique().tolist()
function_labels = ClassLabel(names=unique_labels)

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit([function_labels.names])
labels_classes = mlb.classes_
labels_classes

array(['ANGRY', 'DISAGREE', 'DISAPPOINTED', 'DISLIKE', 'NOT_APPROVE',
       'NOT_CORRECT', 'OTHER', 'PESSIMISTIC', 'WORRIED'], dtype=object)

In [4]:
function_mapping = {'OTHER': ['admiration', 'approval', 'gratitude', 'amusement', 'curiosity', 'embarrassment', 'love', 'optimism', 'joy', 
                              'realization', 'confusion', 'caring', 'excitement', 'surprise', 'desire', 'pride', 'relief', 'neutral'], 
           'NOT_INTERESTED': [''], 
           'DISLIKE':['disgust', 'annoyance'], 
           'NOT_CORRECT': ['disapproval'], 
           'PESSIMISTIC':['grief', 'remorse', 'sadness'], 
           'WORRIED':['fear', 'nervousness'], 
           'ANGRY': ['anger'], 
           'DISAPPOINTED': ['disappointment'], 
           'BORED': [''], 
           'NOT_APPROVE':['disapproval'], 
           'NOT_IMPORTANT': [''], 
           'DISAGREE': ['disapproval'], 
           'WARN': [''], 
           'COMPLAIN': [''], 
           'THREATEN': [''], 
           'UNWILLING': [''], 
           'DISTRUST' : [''],
           'REFUSE': [''] }

mapping = {
    0:"admiration",
    1:"amusement",
    2:"anger",
    3:"annoyance",
    4:"approval",
    5:"caring",
    6:"confusion",
    7:"curiosity",
    8:"desire",
    9:"disappointment",
    10:"disapproval",
    11:"disgust",
    12:"embarrassment",
    13:"excitement",
    14:"fear",
    15:"gratitude",
    16:"grief",
    17:"joy",
    18:"love",
    19:"nervousness",
    20:"optimism",
    21:"pride",
    22:"realization",
    23:"relief",
    24:"remorse",
    25:"sadness",
    26:"surprise",
    27:"neutral",
}

go_emotions = load_dataset("go_emotions").remove_columns("id")
def adjust_labels(batch):
    batch['emotions'] = [[] for _ in range(len(batch['text']))]
    for i, item in enumerate(batch["labels"]):
        batch["emotions"][i] = [mapping[label] for label in batch["labels"][i]]
    return batch

go_emotions = go_emotions.map(adjust_labels, batched=True)
def map_labels(batch):
    batch['functions'] = [[] for _ in range(len(batch['text']))]
    for i, item in enumerate(batch["emotions"]):
        batch["functions"][i] = [key for key,value in function_mapping.items() if any(emotion in batch["emotions"][i] for emotion in value)]
    return batch
go_emotions = go_emotions.map(map_labels, batched=True).remove_columns("labels")

# Convert labels to one hot enconding vectors
def one_hot_encoding(batch):
    label_ids = mlb.transform(batch['functions'])
    return {"label_ids":label_ids}
go_emotions = go_emotions.map(one_hot_encoding, batched=True)

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (/home/njfernandez/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)
100%|██████████| 3/3 [00:00<00:00, 773.90it/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-1c4d70c4ecc5a4bd.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-8f90b5d8ac946ce8.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-f0dc721270e34384.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/go_emotions/simplified

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

y_train = np.array(go_emotions['train']['label_ids'])
y_test = np.array(go_emotions['test']['label_ids'])

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(go_emotions['train']['text'])
X_test_counts = count_vect.transform(go_emotions['test']['text'])

In [7]:
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(nb, n_jobs=-1)
multi_nb.fit(X_train_counts, y_train)
y_pred_test = multi_nb.predict(X_test_counts)

In [8]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.33      0.01      0.02       198
    DISAGREE       0.00      0.00      0.00       267
DISAPPOINTED       0.00      0.00      0.00       151
     DISLIKE       0.62      0.06      0.10       429
 NOT_APPROVE       0.00      0.00      0.00       267
 NOT_CORRECT       0.00      0.00      0.00       267
       OTHER       0.84      0.98      0.90      4386
 PESSIMISTIC       0.62      0.04      0.07       212
     WORRIED       0.00      0.00      0.00        98

   micro avg       0.83      0.69      0.75      6275
   macro avg       0.27      0.12      0.12      6275
weighted avg       0.66      0.69      0.64      6275
 samples avg       0.80      0.78      0.78      6275



In [9]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
with open(r'classification_report_baseline_go_emotions_naive_bayes.csv', 'w') as csv_file:
    df_results.to_csv(path_or_buf=csv_file)

In [10]:
df_test = go_emotions['test'].to_pandas()
cols = ["text", "emotions", "functions"]
df_test = df_test[:][cols]
df_test['predicted_labels'] =  list(map(list, mlb.inverse_transform(y_pred_test)))
df_test

Unnamed: 0,text,emotions,functions,predicted_labels
0,I’m really sorry about your situation :( Altho...,[sadness],[PESSIMISTIC],[OTHER]
1,It's wonderful because it's awful. At not with.,[admiration],[OTHER],[OTHER]
2,"Kings fan here, good luck to you guys! Will be...",[excitement],[OTHER],[OTHER]
3,"I didn't know that, thank you for teaching me ...",[gratitude],[OTHER],[OTHER]
4,They got bored from haunting earth for thousan...,[neutral],[OTHER],[OTHER]
...,...,...,...,...
5422,Thanks. I was diagnosed with BP 1 after the ho...,[gratitude],[OTHER],[OTHER]
5423,Well that makes sense.,[approval],[OTHER],[OTHER]
5424,Daddy issues [NAME],[neutral],[OTHER],[OTHER]
5425,So glad I discovered that subreddit a couple m...,[admiration],[OTHER],[OTHER]


In [11]:
df_test.to_csv('preds_baseline_go_emotions_naive_bayes.tsv',header =True, sep = '\t',index=False)

# Function basic experiment

In [12]:
df["functions"] = df["functions"].apply(lambda x: [x])

In [13]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(df.reset_index(drop=True)),
     "test": go_emotions['test']
}
    )
dataset['train'] = dataset['train'].map(one_hot_encoding, batched=True)
dataset

100%|██████████| 1/1 [00:00<00:00, 218.23ba/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'functions', 'label_ids'],
        num_rows: 629
    })
    test: Dataset({
        features: ['text', 'emotions', 'functions', 'label_ids'],
        num_rows: 5427
    })
})

In [14]:
y_train = np.array(dataset['train']['label_ids'])
y_test = np.array(dataset['test']['label_ids'])

In [15]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset['train']['text'])
X_test_counts = count_vect.transform(dataset['test']['text'])

In [16]:
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(nb, n_jobs=-1)
multi_nb.fit(X_train_counts, y_train)
y_pred_test = multi_nb.predict(X_test_counts)

In [17]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.05      0.08      0.06       198
    DISAGREE       0.05      0.06      0.05       267
DISAPPOINTED       0.05      0.05      0.05       151
     DISLIKE       0.11      0.06      0.08       429
 NOT_APPROVE       0.05      0.04      0.04       267
 NOT_CORRECT       0.07      0.11      0.09       267
       OTHER       0.83      0.45      0.58      4386
 PESSIMISTIC       0.02      0.02      0.02       212
     WORRIED       0.05      0.20      0.08        98

   micro avg       0.44      0.33      0.38      6275
   macro avg       0.14      0.12      0.12      6275
weighted avg       0.60      0.33      0.42      6275
 samples avg       0.33      0.37      0.34      6275



In [18]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
with open(r'classification_report_functions_basic_go_emotions_naive_bayes.csv', 'w') as csv_file:
    df_results.to_csv(path_or_buf=csv_file)

In [19]:
df_test = go_emotions['test'].to_pandas()
cols = ["text", "emotions", "functions"]
df_test = df_test[:][cols]
df_test['predicted_labels'] =  list(map(list, mlb.inverse_transform(y_pred_test)))
df_test

Unnamed: 0,text,emotions,functions,predicted_labels
0,I’m really sorry about your situation :( Altho...,[sadness],[PESSIMISTIC],[WORRIED]
1,It's wonderful because it's awful. At not with.,[admiration],[OTHER],[]
2,"Kings fan here, good luck to you guys! Will be...",[excitement],[OTHER],"[OTHER, PESSIMISTIC]"
3,"I didn't know that, thank you for teaching me ...",[gratitude],[OTHER],[OTHER]
4,They got bored from haunting earth for thousan...,[neutral],[OTHER],[]
...,...,...,...,...
5422,Thanks. I was diagnosed with BP 1 after the ho...,[gratitude],[OTHER],[DISAGREE]
5423,Well that makes sense.,[approval],[OTHER],[]
5424,Daddy issues [NAME],[neutral],[OTHER],[]
5425,So glad I discovered that subreddit a couple m...,[admiration],[OTHER],[OTHER]


In [20]:
df_test.to_csv('preds_functions_basic_go_emotions_naive_bayes.tsv',header =True, sep = '\t',index=False)

# Function extended experiment

In [21]:
comfunctions_extended = 'comfunct_extended.txt'
df = pd.read_csv(comfunctions_basic, delimiter=";")
df = df[~df['function'].isin(['NOT_INTERESTED', 'BORED','NOT_IMPORTANT', 'REFUSE', 'WARN', 
                              'COMPLAIN', 'THREATEN', 'UNWILLING', 'DISTRUST' ])].rename(columns={"function": "functions"})


In [22]:
df["functions"] = df["functions"].apply(lambda x: [x])

In [23]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(df.reset_index(drop=True)),
     "test": go_emotions['test']
}
    )
dataset['train'] = dataset['train'].map(one_hot_encoding, batched=True)
dataset

100%|██████████| 1/1 [00:00<00:00, 246.48ba/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'functions', 'label_ids'],
        num_rows: 629
    })
    test: Dataset({
        features: ['text', 'emotions', 'functions', 'label_ids'],
        num_rows: 5427
    })
})

In [24]:
y_train = np.array(dataset['train']['label_ids'])
y_test = np.array(dataset['test']['label_ids'])

In [25]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset['train']['text'])
X_test_counts = count_vect.transform(dataset['test']['text'])

In [26]:
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(nb, n_jobs=-1)
multi_nb.fit(X_train_counts, y_train)
y_pred_test = multi_nb.predict(X_test_counts)

In [27]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.05      0.08      0.06       198
    DISAGREE       0.05      0.06      0.05       267
DISAPPOINTED       0.05      0.05      0.05       151
     DISLIKE       0.11      0.06      0.08       429
 NOT_APPROVE       0.05      0.04      0.04       267
 NOT_CORRECT       0.07      0.11      0.09       267
       OTHER       0.83      0.45      0.58      4386
 PESSIMISTIC       0.02      0.02      0.02       212
     WORRIED       0.05      0.20      0.08        98

   micro avg       0.44      0.33      0.38      6275
   macro avg       0.14      0.12      0.12      6275
weighted avg       0.60      0.33      0.42      6275
 samples avg       0.33      0.37      0.34      6275



In [28]:
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
with open(r'classification_report_functions_extended_go_emotions_naive_bayes.csv', 'w') as csv_file:
    df_results.to_csv(path_or_buf=csv_file)

In [29]:
df_test = go_emotions['test'].to_pandas()
cols = ["text", "emotions", "functions"]
df_test = df_test[:][cols]
df_test['predicted_labels'] =  list(map(list, mlb.inverse_transform(y_pred_test)))
df_test

Unnamed: 0,text,emotions,functions,predicted_labels
0,I’m really sorry about your situation :( Altho...,[sadness],[PESSIMISTIC],[WORRIED]
1,It's wonderful because it's awful. At not with.,[admiration],[OTHER],[]
2,"Kings fan here, good luck to you guys! Will be...",[excitement],[OTHER],"[OTHER, PESSIMISTIC]"
3,"I didn't know that, thank you for teaching me ...",[gratitude],[OTHER],[OTHER]
4,They got bored from haunting earth for thousan...,[neutral],[OTHER],[]
...,...,...,...,...
5422,Thanks. I was diagnosed with BP 1 after the ho...,[gratitude],[OTHER],[DISAGREE]
5423,Well that makes sense.,[approval],[OTHER],[]
5424,Daddy issues [NAME],[neutral],[OTHER],[]
5425,So glad I discovered that subreddit a couple m...,[admiration],[OTHER],[OTHER]


In [30]:
df_test.to_csv('preds_functions_extended_go_emotions_naive_bayes.tsv',header =True, sep = '\t',index=False)