# AIT train (baseline) experiment

In [1]:
from datasets import Dataset, load_dataset, DatasetDict, ClassLabel, concatenate_datasets
import numpy as np
import torch
import pandas as pd

# Seed every random number generator up front so that repeated runs of the
# notebook show the same outputs.
RANDOM_SEED = 64
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)

if torch.cuda.is_available():
    # Seed the current GPU and then every visible GPU.
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
    # Ask cuDNN for deterministic kernels and switch off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the basic communicative-functions corpus (semicolon separated) and keep
# only the rows labelled with one of the five functions listed below.
comfunctions_basic = 'comfunct_basic.txt'
kept_functions = ['ANGRY', 'WORRIED', 'PESSIMISTIC', 'DISLIKE', 'OTHER']
basic_raw = pd.read_csv(comfunctions_basic, delimiter=";")
is_kept = basic_raw['function'].isin(kept_functions)
# The label column is renamed to "functions", the column name the AIT data
# uses for its label lists further down.
df = basic_raw[is_kept].rename(columns={"function": "functions"})
df

Unnamed: 0,text,functions
85,I'm not really very keen on it,DISLIKE
86,I'm not really very keen,DISLIKE
87,I'm not really keen on it,DISLIKE
88,I'm not really keen,DISLIKE
89,I'm not very keen on it,DISLIKE
...,...,...
897,you could do it,OTHER
898,you look nice,OTHER
899,you look smart,OTHER
900,you're right,OTHER


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Label inventory: the distinct functions present in the filtered corpus.
unique_labels = df["functions"].unique().tolist()
function_labels = ClassLabel(names=unique_labels)

# Fit the binarizer on a single "sample" that carries every label, so it
# learns the complete label set in one go.
mlb = MultiLabelBinarizer().fit([function_labels.names])
labels_classes = mlb.classes_
labels_classes

array(['ANGRY', 'DISLIKE', 'OTHER', 'PESSIMISTIC', 'WORRIED'],
      dtype=object)

In [4]:
# Communicative function -> AIT emotion columns that signal it.
# A [''] entry marks a function with no emotion counterpart in the AIT data.
function_mapping = {
    'OTHER': ['anticipation', 'joy', 'love', 'optimism', 'surprise', 'trust'],
    'NOT_INTERESTED': [''],
    'DISLIKE': ['disgust'],
    'NOT_CORRECT': [''],
    'PESSIMISTIC': ['sadness', 'pessimism'],
    'WORRIED': ['fear'],
    'ANGRY': ['anger'],
    'DISAPPOINTED': [''],
    'BORED': [''],
    'NOT_APPROVE': [''],
    'NOT_IMPORTANT': [''],
    'DISAGREE': [''],
    'WARN': [''],
    'COMPLAIN': [''],
    'THREATEN': [''],
    'UNWILLING': [''],
    'DISTRUST': [''],
    'REFUSE': [''],
}


def _load_ait_language(lang_code):
    """Load the train/test/valid tab-separated files of one AIT language."""
    return load_dataset(
        "csv",
        sep='\t',
        data_files={
            'train': f'2018-E-c-{lang_code}-train.txt',
            'test': f'2018-E-c-{lang_code}-test-gold.txt',
            'valid': f'2018-E-c-{lang_code}-dev.txt',
        },
    )


ait_es_dataset = _load_ait_language('Es')
ait_en_dataset = _load_ait_language('En')
ait_ar_dataset = _load_ait_language('Ar')

# Merge the three languages split by split (Spanish, English, Arabic order).
_per_language = [ait_es_dataset, ait_en_dataset, ait_ar_dataset]
train_dataset = concatenate_datasets([lang_ds['train'] for lang_ds in _per_language])
valid_dataset = concatenate_datasets([lang_ds['valid'] for lang_ds in _per_language])
test_dataset = concatenate_datasets([lang_ds['test'] for lang_ds in _per_language])

# Shuffle each merged split with a fixed seed so the languages are interleaved
# in a reproducible order.
ait_dataset = DatasetDict({
    'train': train_dataset.shuffle(seed=42),
    'valid': valid_dataset.shuffle(seed=42),
    'test': test_dataset.shuffle(seed=42),
})

Using custom data configuration default-051610f9ffe8f8d2
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-051610f9ffe8f8d2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 785.65it/s]
Using custom data configuration default-94907eb33ba58000
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-94907eb33ba58000/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 791.13it/s]
Using custom data configuration default-869e6abb15784ff2
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-869e6abb15784ff2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 746.05it/s]
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-051610f9ffe8f8d2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f19

In [5]:
# Re-express every AIT record as {'text': tweet, <function>: 0/1, ...}.
# Only functions backed by at least one emotion column are kept; entries whose
# label list is the [''] placeholder are dropped once, outside the record loop.
mapped_functions = {
    function: labels
    for function, labels in function_mapping.items()
    if '' not in labels
}

converted_splits = {}
for split in ait_dataset:
    new_split = []
    for record in ait_dataset[split]:
        new_record = {'text': record['Tweet']}
        for function, labels in mapped_functions.items():
            # A function is set when any of its source emotion columns is set.
            score = sum(record[label] for label in labels)
            new_record[function] = int(score > 0)
        new_split.append(new_record)
    converted_splits[split] = Dataset.from_pandas(pd.DataFrame(data=new_split))

# Rebind the name only after every split has been converted, so the dictionary
# being iterated above is never modified mid-loop.
ait_dataset = DatasetDict(converted_splits)

In [6]:
cols = ait_dataset['test'].column_names


def _active_functions(example):
    """Return, under "functions", the names of the columns equal to 1."""
    return {"functions": [name for name in cols if example[name] == 1]}


# Adds a "functions" list column to every split, then shows one test example.
ait_dataset = ait_dataset.map(_active_functions)
ait_dataset['test'][20]

100%|██████████| 12675/12675 [00:00<00:00, 13350.51ex/s]
100%|██████████| 2150/2150 [00:00<00:00, 10290.27ex/s]
100%|██████████| 7631/7631 [00:00<00:00, 13364.87ex/s]


{'text': 'انا رايحة اعمل عملية تانى فى عنيا\\nادعولى\\nوقلبى وربى غضبانين عليك يا اللى تفرط فى ارضك',
 'OTHER': 0,
 'DISLIKE': 0,
 'PESSIMISTIC': 1,
 'WORRIED': 0,
 'ANGRY': 1,
 'functions': ['PESSIMISTIC', 'ANGRY']}

In [7]:
def one_hot_encoding(batch):
    """Encode a batch's label lists as binary indicator vectors.

    `batch['functions']` is passed to the fitted `mlb` binarizer and the
    resulting matrix is returned under the "label_ids" key.
    """
    return {"label_ids": mlb.transform(batch['functions'])}


# batched=True hands the function whole batches instead of single examples.
ait_dataset = ait_dataset.map(one_hot_encoding, batched=True)

100%|██████████| 13/13 [00:00<00:00, 225.35ba/s]
100%|██████████| 3/3 [00:00<00:00, 274.05ba/s]
100%|██████████| 8/8 [00:00<00:00, 244.78ba/s]


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Label matrices for the AIT train and test splits, one row per tweet.
y_train, y_test = (
    np.array(ait_dataset[split]['label_ids']) for split in ('train', 'test')
)

In [9]:
# Bag-of-words features: the vocabulary is learned from the training tweets
# only and then reused to vectorise the test tweets.
train_texts = ait_dataset['train']['text']
test_texts = ait_dataset['test']['text']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)
X_test_counts = count_vect.transform(test_texts)

In [10]:
# Multinomial Naive Bayes wrapped so that one classifier is fitted per label
# column; fit() returns the fitted wrapper, so prediction can be chained on it.
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(estimator=nb, n_jobs=-1)
y_pred_test = multi_nb.fit(X_train_counts, y_train).predict(X_test_counts)

In [11]:
# Per-label precision/recall/F1 on the AIT test split; zero_division=0 reports
# undefined scores as 0.
report_options = dict(target_names=labels_classes, zero_division=0)
clf_report = classification_report(y_test, y_pred_test, **report_options)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.74      0.48      0.58      2629
     DISLIKE       0.54      0.50      0.52      1838
       OTHER       0.75      0.73      0.74      3936
 PESSIMISTIC       0.64      0.42      0.51      2691
     WORRIED       0.69      0.22      0.33      1078

   micro avg       0.69      0.52      0.60     12172
   macro avg       0.67      0.47      0.53     12172
weighted avg       0.69      0.52      0.58     12172
 samples avg       0.62      0.57      0.57     12172



In [12]:
# Same report as printed above, but as a dict so it can be stored as a table
# (one row per label / average, one column per metric).
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a text handle opened without newline='':
# pandas then manages the file itself and line endings stay correct on every
# platform.
df_results.to_csv('classification_report_baseline_ait_naive_bayes.csv')

In [15]:
# Side-by-side view of gold and predicted labels for every AIT test tweet.
df_test = ait_dataset['test'].to_pandas()
cols = ["text", "functions"]
# Select the columns in one .loc step and take an explicit copy, so the new
# column below is added to an independent frame rather than to a chained slice.
df_test = df_test.loc[:, cols].copy()
# inverse_transform yields one tuple of label names per row; store them as lists.
df_test['predicted_labels'] = [list(labels) for labels in mlb.inverse_transform(y_pred_test)]
df_test

Unnamed: 0,text,functions,predicted_labels
0,@kenjigi Jajaja ay soy iop vv :3,[OTHER],[OTHER]
1,Me esta doliendo la cabeza 🙁,"[PESSIMISTIC, ANGRY]",[]
2,٢٩ | رمضان 🌙 اللهم أجعل رمضان هذا العام خاتمه ...,[OTHER],"[ANGRY, PESSIMISTIC]"
3,Remember what God says about you is most impor...,[OTHER],[OTHER]
4,Usually love @KatiePrice but watching #mycrazy...,"[DISLIKE, PESSIMISTIC, ANGRY]","[DISLIKE, OTHER, PESSIMISTIC]"
...,...,...,...
7626,@AquaticQueenM -He laughs back- 'I may act all...,"[OTHER, PESSIMISTIC]","[ANGRY, DISLIKE]"
7627,My arms and legs abs tummy and back and face a...,"[PESSIMISTIC, ANGRY]","[DISLIKE, OTHER]"
7628,#muggymike #revenge oh dear,[ANGRY],"[ANGRY, DISLIKE, OTHER]"
7629,@LllDev ماله داعي تشهير شي بيني وبينك😤,"[DISLIKE, PESSIMISTIC, ANGRY]",[WORRIED]


In [16]:
# Persist the baseline predictions as a tab-separated file (header row, no index).
df_test.to_csv(
    'preds_baseline_ait_naive_bayes.tsv',
    sep='\t',
    header=True,
    index=False,
)

# Function basic experiment

In [17]:
# Wrap each label in a one-element list, the shape mlb.transform expects inside
# one_hot_encoding. Values that are already lists are left untouched, so
# re-running this cell does not nest them a second time.
df["functions"] = df["functions"].apply(lambda x: x if isinstance(x, list) else [x])

In [18]:
# Train on the basic functions corpus (encoded with the shared binarizer) and
# evaluate on the already-encoded AIT test split.
functions_train = Dataset.from_pandas(df.reset_index(drop=True)).map(one_hot_encoding, batched=True)
dataset = DatasetDict({"train": functions_train, "test": ait_dataset['test']})
dataset

100%|██████████| 1/1 [00:00<00:00, 291.86ba/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'functions', 'label_ids'],
        num_rows: 410
    })
    test: Dataset({
        features: ['text', 'OTHER', 'DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY', 'functions', 'label_ids'],
        num_rows: 7631
    })
})

In [19]:
# Label matrices: functions-corpus labels for training, AIT labels for testing.
y_train, y_test = (
    np.array(dataset[split]['label_ids']) for split in ('train', 'test')
)

In [20]:
# Fresh vocabulary learned from the functions corpus; the AIT test tweets are
# vectorised with that same vocabulary.
train_texts = dataset['train']['text']
test_texts = dataset['test']['text']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)
X_test_counts = count_vect.transform(test_texts)

In [21]:
# One multinomial Naive Bayes classifier per label column, trained on the
# functions corpus and applied to the AIT test features.
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(estimator=nb, n_jobs=-1)
y_pred_test = multi_nb.fit(X_train_counts, y_train).predict(X_test_counts)

In [22]:
# Text report for the model trained on the basic functions corpus.
report_options = dict(target_names=labels_classes, zero_division=0)
clf_report = classification_report(y_test, y_pred_test, **report_options)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.44      0.07      0.11      2629
     DISLIKE       0.35      0.03      0.06      1838
       OTHER       0.63      0.25      0.36      3936
 PESSIMISTIC       0.31      0.05      0.08      2691
     WORRIED       0.24      0.14      0.17      1078

   micro avg       0.47      0.12      0.19     12172
   macro avg       0.39      0.11      0.16     12172
weighted avg       0.44      0.12      0.18     12172
 samples avg       0.17      0.14      0.14     12172



In [23]:
# Dict form of the report so it can be stored as a table (one row per label /
# average, one column per metric).
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a text handle opened without newline='':
# pandas then manages the file itself and line endings stay correct on every
# platform.
df_results.to_csv('classification_report_functions_basic_ait_naive_bayes.csv')

In [25]:
# Gold vs. predicted labels on the AIT test tweets for the basic-functions model.
df_test = ait_dataset['test'].to_pandas()
cols = ["text", "functions"]
# Select the columns in one .loc step and take an explicit copy, so the new
# column below is added to an independent frame rather than to a chained slice.
df_test = df_test.loc[:, cols].copy()
# inverse_transform yields one tuple of label names per row; store them as lists.
df_test['predicted_labels'] = [list(labels) for labels in mlb.inverse_transform(y_pred_test)]
df_test

Unnamed: 0,text,functions,predicted_labels
0,@kenjigi Jajaja ay soy iop vv :3,[OTHER],[]
1,Me esta doliendo la cabeza 🙁,"[PESSIMISTIC, ANGRY]",[]
2,٢٩ | رمضان 🌙 اللهم أجعل رمضان هذا العام خاتمه ...,[OTHER],[]
3,Remember what God says about you is most impor...,[OTHER],[OTHER]
4,Usually love @KatiePrice but watching #mycrazy...,"[DISLIKE, PESSIMISTIC, ANGRY]","[ANGRY, OTHER]"
...,...,...,...
7626,@AquaticQueenM -He laughs back- 'I may act all...,"[OTHER, PESSIMISTIC]",[]
7627,My arms and legs abs tummy and back and face a...,"[PESSIMISTIC, ANGRY]",[OTHER]
7628,#muggymike #revenge oh dear,[ANGRY],[]
7629,@LllDev ماله داعي تشهير شي بيني وبينك😤,"[DISLIKE, PESSIMISTIC, ANGRY]",[]


In [26]:
# Persist the basic-functions predictions as a tab-separated file (header row, no index).
df_test.to_csv(
    'preds_functions_basic_ait_naive_bayes.tsv',
    sep='\t',
    header=True,
    index=False,
)

# Function extended experiment

In [27]:
# Load the extended communicative-functions corpus (semicolon separated) and
# keep only the rows labelled with one of the five functions listed below.
comfunctions_extended = 'comfunct_extended.txt'
kept_functions = ['ANGRY', 'WORRIED', 'PESSIMISTIC', 'DISLIKE', 'OTHER']
extended_raw = pd.read_csv(comfunctions_extended, delimiter=";")
is_kept = extended_raw['function'].isin(kept_functions)
# The label column is renamed to "functions", the column name the AIT data
# uses for its label lists.
df = extended_raw[is_kept].rename(columns={"function": "functions"})
df

Unnamed: 0,text,functions
0,"Hey #friends, just wanted to clarify that the ...",OTHER
1,Good morning everyone! ☀️ Just wanted to say a...,OTHER
2,"Hey #team, don't forget our meeting at 2pm tod...",OTHER
3,"Sorry for the late reply, I was swamped at wor...",OTHER
4,"Hey, can someone introduce me to @jane_doe? I'...",OTHER
...,...,...
2106,i hate this,DISLIKE
2107,this is the worst,DISLIKE
2108,"Ew, spinach in my salad again. 😒 #NotAVegHead",DISLIKE
2109,🤢 Don't like this new food trend 🍔 #StickToThe...,DISLIKE


In [28]:
# Wrap each label in a one-element list, the shape mlb.transform expects inside
# one_hot_encoding. Values that are already lists are left untouched, so
# re-running this cell does not nest them a second time.
df["functions"] = df["functions"].apply(lambda x: x if isinstance(x, list) else [x])

In [29]:
# Train on the extended functions corpus (encoded with the shared binarizer)
# and evaluate on the already-encoded AIT test split.
functions_train = Dataset.from_pandas(df.reset_index(drop=True)).map(one_hot_encoding, batched=True)
dataset = DatasetDict({"train": functions_train, "test": ait_dataset['test']})
dataset

100%|██████████| 1/1 [00:00<00:00, 203.69ba/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'functions', 'label_ids'],
        num_rows: 733
    })
    test: Dataset({
        features: ['text', 'OTHER', 'DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY', 'functions', 'label_ids'],
        num_rows: 7631
    })
})

In [30]:
# Label matrices: extended-corpus labels for training, AIT labels for testing.
y_train, y_test = (
    np.array(dataset[split]['label_ids']) for split in ('train', 'test')
)

In [31]:
# Fresh vocabulary learned from the extended corpus; the AIT test tweets are
# vectorised with that same vocabulary.
train_texts = dataset['train']['text']
test_texts = dataset['test']['text']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)
X_test_counts = count_vect.transform(test_texts)

In [32]:
# One multinomial Naive Bayes classifier per label column, trained on the
# extended corpus and applied to the AIT test features.
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(estimator=nb, n_jobs=-1)
y_pred_test = multi_nb.fit(X_train_counts, y_train).predict(X_test_counts)

In [33]:
# Text report for the model trained on the extended functions corpus.
report_options = dict(target_names=labels_classes, zero_division=0)
clf_report = classification_report(y_test, y_pred_test, **report_options)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.66      0.05      0.09      2629
     DISLIKE       0.43      0.03      0.06      1838
       OTHER       0.69      0.28      0.39      3936
 PESSIMISTIC       0.40      0.02      0.03      2691
     WORRIED       0.39      0.06      0.10      1078

   micro avg       0.63      0.11      0.19     12172
   macro avg       0.51      0.09      0.14     12172
weighted avg       0.55      0.11      0.17     12172
 samples avg       0.17      0.14      0.15     12172



In [34]:
# Dict form of the report so it can be stored as a table (one row per label /
# average, one column per metric).
clf_report = classification_report(y_test, y_pred_test, target_names=labels_classes, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a text handle opened without newline='':
# pandas then manages the file itself and line endings stay correct on every
# platform.
df_results.to_csv('classification_report_functions_extended_ait_naive_bayes.csv')

In [None]:
# Gold vs. predicted labels for the extended-functions model. The predictions
# in y_pred_test were made on the AIT test split, so the gold rows must come
# from ait_dataset (no other test set is loaded in this notebook).
df_test = ait_dataset['test'].to_pandas()
cols = ["text", "functions"]
# Select the columns in one .loc step and take an explicit copy, so the new
# column below is added to an independent frame rather than to a chained slice.
df_test = df_test.loc[:, cols].copy()
# inverse_transform yields one tuple of label names per row; store them as lists.
df_test['predicted_labels'] = [list(labels) for labels in mlb.inverse_transform(y_pred_test)]
df_test

In [None]:
# Persist the extended-functions predictions as a tab-separated file (header
# row, no index). The file is named after the AIT test data it was evaluated
# on, matching the other prediction and report exports in this notebook.
df_test.to_csv('preds_functions_extended_ait_naive_bayes.tsv', header=True, sep='\t', index=False)