# EmoEvent en train (baseline) experiment

In [3]:
from datasets import Dataset, load_dataset, DatasetDict, ClassLabel, concatenate_datasets
import numpy as np
import torch
import pandas as pd

# Pin every random number generator used here so repeated runs give the same output.
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    # Seed the current GPU first, then every visible GPU.
    for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_cuda(RANDOM_SEED)
    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Load the tab-separated train/test/dev files of the Spanish and English EmoEvent editions.
emoevent_es_dataset = load_dataset(
    "csv",
    data_files={'train': r'es_train.tsv', 'test': r'es_test.tsv', 'valid': r'es_dev.tsv'},
    sep='\t',
)
emoevent_en_dataset = load_dataset(
    "csv",
    data_files={'train': r'en_train.tsv', 'test': r'en_test.tsv', 'valid': r'en_dev.tsv'},
    sep='\t',
)

# Stack the Spanish rows on top of the English rows, split by split.
train_dataset, valid_dataset, test_dataset = (
    concatenate_datasets([emoevent_es_dataset[split], emoevent_en_dataset[split]])
    for split in ('train', 'valid', 'test')
)

# Shuffle each merged split with a fixed seed so the two languages are interleaved reproducibly.
emoevent = DatasetDict({
    'train': train_dataset.shuffle(seed=42),
    'valid': valid_dataset.shuffle(seed=42),
    'test': test_dataset.shuffle(seed=42),
})

Using custom data configuration default-960b1f9335da6974
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 877.35it/s]
Using custom data configuration default-f0f3a67dac6080b1
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-f0f3a67dac6080b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 920.68it/s]
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6a6e13a3ae6ef6af.arrow
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b12a5b2844f1fc9b.arrow
Lo

In [5]:
# Drop the columns this experiment never reads, in a single call.
emoevent = emoevent.remove_columns(["id", "event", "offensive"])

# Communicative-function labels and the EmoEvent emotion tags that map onto each of them.
# A function with no EmoEvent counterpart carries a single '' placeholder.
function_mapping = {
    'OTHER': ['others', 'surprise', 'joy'],
    'NOT_INTERESTED': [''],
    'DISLIKE': ['disgust'],
    'NOT_CORRECT': [''],
    'PESSIMISTIC': ['sadness'],
    'WORRIED': ['fear'],
    'ANGRY': ['anger'],
    'DISAPPOINTED': [''],
    'BORED': [''],
    'NOT_APPROVE': [''],
    'NOT_IMPORTANT': [''],
    'DISAGREE': [''],
    'WARN': [''],
    'COMPLAIN': [''],
    'THREATEN': [''],
    'UNWILLING': [''],
    'DISTRUST': [''],
    'REFUSE': [''],
}


def map_labels(batch):
    """Add a 'function' column derived from the batch's 'emotion' column.

    Args:
        batch: Mapping of column name to list of values; must contain 'emotion'.

    Returns:
        The same ``batch`` object with ``batch['function']`` set to one label per
        row. An emotion that is empty, missing, or not listed in
        ``function_mapping`` yields ``None``. Previously an unlisted emotion
        produced an empty list and an empty string produced 'REFUSE'.
    """
    # Invert the mapping once per batch so each row is a single dictionary lookup.
    # The '' entries are placeholders, not real tags, so they are skipped.
    emotion_to_function = {
        emotion: function
        for function, emotions in function_mapping.items()
        for emotion in emotions
        if emotion
    }
    batch['function'] = [emotion_to_function.get(emotion) for emotion in batch['emotion']]
    return batch
# Attach the function label to every split, processing rows in batches.
emoevent = emoevent.map(function=map_labels, batched=True)

Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-583aa5492a81268f.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4054e870b3bdf90d.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-cc4651795e9bd861.arrow


In [6]:
# Materialise each split as a pandas DataFrame for the scikit-learn pipeline.
df_train, df_valid, df_test = (
    emoevent[split].to_pandas() for split in ('train', 'valid', 'test')
)

df_train

Unnamed: 0,tweet,emotion,function
0,Pues me sumo a los elogios del CAPITULACO de #...,joy,OTHER
1,Sigo teniendo esperanzas de que sea Gilly. ¿Qu...,others,OTHER
2,Following philanthropic response to the #Notre...,others,OTHER
3,"#NotiPost #30Abr #Venezuela vive un ""momento c...",others,OTHER
4,I can't help but to feel a little disappointed...,others,OTHER
...,...,...,...
10830,"📚 ""Siempre imaginé que el paraíso sería algún ...",others,OTHER
10831,Puta mierda el 8x03 de Juego de tronos. Como c...,anger,ANGRY
10832,""" Qué tristeza, ..., se pierde una creación de...",sadness,PESSIMISTIC
10833,"""You need to vote for us, for your children an...",others,OTHER


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer created with its default settings; nothing is fitted yet at this point.
cv = CountVectorizer()

In [8]:
# Learn the vocabulary from the training tweets only, then reuse it for the other splits.
X_train = cv.fit_transform(df_train['tweet'])
X_valid, X_test = (cv.transform(frame['tweet']) for frame in (df_valid, df_test))

# Targets are the mapped function labels of each split.
y_train, y_valid, y_test = (frame["function"] for frame in (df_train, df_valid, df_test))

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Baseline: multinomial Naive Bayes trained on the EmoEvent training counts.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred_test = classifier.predict(X_test)

# zero_division=0 reports 0 for classes that have no predicted or true samples.
clf_report = classification_report(y_true=y_test, y_pred=y_pred_test, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.14      0.04      0.06       244
     DISLIKE       0.36      0.18      0.24       184
       OTHER       0.77      0.90      0.83      2316
 PESSIMISTIC       0.14      0.10      0.12       278
     WORRIED       0.00      0.00      0.00        51

    accuracy                           0.70      3073
   macro avg       0.28      0.24      0.25      3073
weighted avg       0.62      0.70      0.66      3073



In [10]:
# Recompute the report as a nested dict so it can be stored as a table (one row per class/average).
clf_report = classification_report(y_test, y_pred_test, zero_division=0, output_dict=True)
df_results = pd.DataFrame(clf_report).transpose()
# Give pandas the path rather than a handle opened with open(..., 'w'): pandas then opens the
# file itself with newline='' so the CSV has no blank rows between records on Windows.
df_results.to_csv(r'classification_report_baseline_emoevent_naive_bayes.csv')

In [11]:
# Store the baseline predictions next to the gold labels and export them as TSV.
df_test = df_test.assign(predicted_label=y_pred_test)
df_test.to_csv(
    'preds_baseline_emoevent_naive_bayes.tsv',
    sep='\t',
    header=True,
    index=False,
)

# Function basic experiment

In [12]:
# Basic communicative-function corpus: ';'-separated file, restricted to the
# five labels that also occur in the EmoEvent test data.
comfunctions_basic = 'comfunct_basic.txt'
df = pd.read_csv(comfunctions_basic, sep=";").loc[
    lambda frame: frame['function'].isin(['ANGRY', 'DISLIKE', 'OTHER', 'PESSIMISTIC', 'WORRIED'])
]

df

Unnamed: 0,text,function
85,I'm not really very keen on it,DISLIKE
86,I'm not really very keen,DISLIKE
87,I'm not really keen on it,DISLIKE
88,I'm not really keen,DISLIKE
89,I'm not very keen on it,DISLIKE
...,...,...
897,you could do it,OTHER
898,you look nice,OTHER
899,you look smart,OTHER
900,you're right,OTHER


In [10]:
# Fresh vocabulary learnt from the basic function corpus; the EmoEvent test tweets are projected onto it.
cv = CountVectorizer()
X_train, y_train = cv.fit_transform(df['text']), df["function"]
X_test, y_test = cv.transform(df_test['tweet']), df_test["function"]

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train on the basic function corpus and evaluate on the EmoEvent test tweets.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred_test = classifier.predict(X_test)

# zero_division=0 reports 0 for classes that have no predicted or true samples.
clf_report = classification_report(y_true=y_test, y_pred=y_pred_test, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.03      0.05      0.04        78
     DISLIKE       0.10      0.07      0.08       151
       OTHER       0.76      0.48      0.59      1105
 PESSIMISTIC       0.04      0.07      0.05        83
     WORRIED       0.02      0.20      0.03        30

    accuracy                           0.38      1447
   macro avg       0.19      0.17      0.16      1447
weighted avg       0.60      0.38      0.46      1447



In [12]:
# Recompute the report as a nested dict so it can be stored as a table (one row per class/average).
clf_report = classification_report(y_test, y_pred_test, zero_division=0, output_dict=True)
df_result = pd.DataFrame(clf_report).transpose()
# Give pandas the path rather than a handle opened with open(..., 'w'): pandas then opens the
# file itself with newline='' so the CSV has no blank rows between records on Windows.
df_result.to_csv(r'classification_report_functions_basic_emoevent_en_naive_bayes.csv')

In [13]:
# Replace the prediction column with this experiment's output and export the frame as TSV.
df_test = df_test.assign(predicted_label=y_pred_test)
df_test.to_csv(
    'preds_functions_basic_emoevent_en_naive_bayes.tsv',
    sep='\t',
    header=True,
    index=False,
)

# Function extended experiment

In [14]:
# Extended communicative-function corpus: ';'-separated file with every row whose
# label has no EmoEvent counterpart filtered out.
comfunctions_extended = 'comfunct_extended.txt'
df = pd.read_csv(comfunctions_extended, sep=";").loc[
    lambda frame: ~frame['function'].isin([
        'NOT_INTERESTED', 'BORED', 'NOT_CORRECT', 'DISAPPOINTED', 'COMPLAIN', 'NOT_APPROVE',
        'NOT_IMPORTANT', 'DISAGREE', 'REFUSE', 'WARN', 'THREATEN', 'UNWILLING', 'DISTRUST',
    ])
]

df

Unnamed: 0,text,function
0,"Hey #friends, just wanted to clarify that the ...",OTHER
1,Good morning everyone! ☀️ Just wanted to say a...,OTHER
2,"Hey #team, don't forget our meeting at 2pm tod...",OTHER
3,"Sorry for the late reply, I was swamped at wor...",OTHER
4,"Hey, can someone introduce me to @jane_doe? I'...",OTHER
...,...,...
2106,i hate this,DISLIKE
2107,this is the worst,DISLIKE
2108,"Ew, spinach in my salad again. 😒 #NotAVegHead",DISLIKE
2109,🤢 Don't like this new food trend 🍔 #StickToThe...,DISLIKE


In [15]:
# Fresh vocabulary learnt from the extended function corpus; the EmoEvent test tweets are projected onto it.
cv = CountVectorizer()
X_train, y_train = cv.fit_transform(df['text']), df["function"]
X_test, y_test = cv.transform(df_test['tweet']), df_test["function"]

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train on the extended function corpus and evaluate on the EmoEvent test tweets.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred_test = classifier.predict(X_test)

# zero_division=0 reports 0 for classes that have no predicted or true samples.
clf_report = classification_report(y_true=y_test, y_pred=y_pred_test, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.07      0.22      0.11        78
     DISLIKE       0.15      0.17      0.16       151
       OTHER       0.81      0.57      0.67      1105
 PESSIMISTIC       0.05      0.05      0.05        83
     WORRIED       0.04      0.23      0.07        30

    accuracy                           0.47      1447
   macro avg       0.22      0.25      0.21      1447
weighted avg       0.64      0.47      0.54      1447



In [17]:
# Recompute the report as a nested dict so it can be stored as a table (one row per class/average).
clf_report = classification_report(y_test, y_pred_test, zero_division=0, output_dict=True)
df_result = pd.DataFrame(clf_report).transpose()
# Give pandas the path rather than a handle opened with open(..., 'w'): pandas then opens the
# file itself with newline='' so the CSV has no blank rows between records on Windows.
df_result.to_csv(r'classification_report_functions_extended_emoevent_en_naive_bayes.csv')

In [18]:
# Replace the prediction column with this experiment's output and export the frame as TSV.
df_test = df_test.assign(predicted_label=y_pred_test)
df_test.to_csv(
    'preds_functions_extended_emoevent_en_naive_bayes.tsv',
    sep='\t',
    header=True,
    index=False,
)