In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
# One seed shared by every random number generator this notebook configures.
RANDOM_SEED = 64
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# The CUDA generators and cuDNN flags are only touched when a GPU is visible.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier passed to both from_pretrained calls below, so the
# tokenizer and the encoder are loaded from the same checkpoint.
model_ckpt = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# Module-level tokenizer and encoder; embed_text() reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable object whose first element holds the token
            embeddings (presumably shaped (batch, tokens, hidden) - the sum
            below runs over dimension 1).
        attention_mask: tensor that is broadcast to the embeddings' size and
            used as per-token weights (0 drops a token, 1 keeps it).

    Returns:
        Tensor with dimension 1 reduced away: the mask-weighted sum divided by
        the mask total, with the divisor clamped to at least 1e-9 so a fully
        masked row does not divide by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total

def embed_text(examples):
    """Compute one mean-pooled embedding per entry of ``examples["text"]``.

    Uses the module-level ``tokenizer`` and ``model``. The texts are tokenised
    with padding and truncation, the model is run without gradient tracking,
    and the outputs are pooled with ``mean_pooling``.

    Returns:
        ``{"embedding": <numpy array>}`` holding the pooled vectors.
    """
    encoded = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**encoded)
    sentence_vectors = mean_pooling(outputs, encoded["attention_mask"])
    return {"embedding": sentence_vectors.cpu().numpy()}

from sentence_transformers import util
def cosine_sim(batch):
    """Label every embedding in ``batch`` with its most similar ``metaemb`` entry.

    For each item of ``batch['embedding']`` the cosine similarity to every
    vector in the module-level ``metaemb`` mapping is computed with
    ``util.pytorch_cos_sim``; the key with the highest similarity and that
    similarity are stored.

    Returns:
        The same ``batch`` mapping, extended with the ``'predicted_label'`` and
        ``'score'`` columns.
    """
    best_labels, best_scores = [], []
    for vector in batch['embedding']:
        similarities = {
            label: util.pytorch_cos_sim(vector, metaemb[label]).item()
            for label in metaemb
        }
        # On ties the first key in metaemb's iteration order wins.
        winner = max(similarities, key=similarities.get)
        best_labels.append(winner)
        best_scores.append(similarities[winner])
    batch['predicted_label'] = best_labels
    batch['score'] = best_scores
    return batch

In [3]:
# Load the three TSV splits shipped under the "es_" and "en_" prefixes.
emoevent_es_dataset = load_dataset(
    "csv",
    sep='\t',
    data_files={'train': r'es_train.tsv', 'test': r'es_test.tsv', 'valid': r'es_dev.tsv'},
)
emoevent_en_dataset = load_dataset(
    "csv",
    sep='\t',
    data_files={'train': r'en_train.tsv', 'test': r'en_test.tsv', 'valid': r'en_dev.tsv'},
)

# Merge both sources split by split ("es_" rows first, then "en_" rows).
train_dataset, valid_dataset, test_dataset = (
    concatenate_datasets([emoevent_es_dataset[split], emoevent_en_dataset[split]])
    for split in ('train', 'valid', 'test')
)

# Shuffle each merged split with a fixed seed so the row order is repeatable.
emoevent = DatasetDict({
    'train': train_dataset.shuffle(seed=42),
    'valid': valid_dataset.shuffle(seed=42),
    'test': test_dataset.shuffle(seed=42),
})

Using custom data configuration default-960b1f9335da6974
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 623.16it/s]
Using custom data configuration default-f0f3a67dac6080b1
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-f0f3a67dac6080b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 941.34it/s]
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6a6e13a3ae6ef6af.arrow
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b12a5b2844f1fc9b.arrow
Lo

In [4]:
# Communicative function -> list of EmoEvent emotion labels that map onto it.
# A list holding only '' marks a function with no emotion assigned to it.
function_mapping = {
    'OTHER': ['others', 'surprise', 'joy'],
    'NOT_INTERESTED': [''],
    'DISLIKE': ['disgust'],
    'NOT_CORRECT': [''],
    'PESSIMISTIC': ['sadness'],
    'WORRIED': ['fear'],
    'ANGRY': ['anger'],
    'DISAPPOINTED': [''],
    'BORED': [''],
    'NOT_APPROVE': [''],
    'NOT_IMPORTANT': [''],
    'DISAGREE': [''],
    'WARN': [''],
    'COMPLAIN': [''],
    'THREATEN': [''],
    'UNWILLING': [''],
    'DISTRUST': [''],
    'REFUSE': [''],
}

# Drop the columns this notebook does not use (one call instead of three
# chained ones) and rename the tweet column to the 'text' name that
# embed_text() reads.
emoevent = emoevent.remove_columns(["id", "event", "offensive"]).rename_column('tweet', 'text')


def map_labels(batch):
    """Add a 'function' column derived from each row's 'emotion' value.

    The emotion -> function lookup is built from the module-level
    ``function_mapping``:

    * ``''`` placeholders are skipped, so an empty emotion is never assigned
      to a function by accident;
    * if an emotion were listed under several functions, the first one in
      ``function_mapping`` order is used;
    * an emotion that is not listed anywhere yields ``None``, which keeps the
      column a single (nullable string) type instead of mixing lists and
      strings.

    Args:
        batch: mapping of column name -> list of values; must contain
            ``"emotion"``.

    Returns:
        The same ``batch`` with the extra ``'function'`` column.
    """
    emotion_to_function = {}
    for function_name, emotions in function_mapping.items():
        for emotion in emotions:
            if emotion:  # '' is a "no emotion assigned" placeholder
                emotion_to_function.setdefault(emotion, function_name)
    batch['function'] = [emotion_to_function.get(emotion) for emotion in batch["emotion"]]
    return batch


emoevent = emoevent.map(map_labels, batched=True)

Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e14b2b051f7c3e62.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-bf961779de707358.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-c4e34589835c431b.arrow


# Function basic experiment

In [5]:
comfunctions_basic = 'comfunct_basic.txt'
# Read the ';'-separated file and keep only the rows whose function is one of
# the five labels listed here.
df = (
    pd.read_csv(comfunctions_basic, delimiter=";")
    .loc[lambda frame: frame['function'].isin(['ANGRY', 'DISLIKE', 'OTHER', 'PESSIMISTIC', 'WORRIED'])]
)

df

Unnamed: 0,text,function
85,I'm not really very keen on it,DISLIKE
86,I'm not really very keen,DISLIKE
87,I'm not really keen on it,DISLIKE
88,I'm not really keen,DISLIKE
89,I'm not very keen on it,DISLIKE
...,...,...
897,you could do it,OTHER
898,you look nice,OTHER
899,you look smart,OTHER
900,you're right,OTHER


In [6]:
# The filtered frame `df` becomes the "train" split (index reset so the old
# row labels are dropped); the EmoEvent test split is used as "test".
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(df.reset_index(drop=True)),
        "test": emoevent['test'],
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'function'],
        num_rows: 410
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 3073
    })
})

In [7]:
# Run embed_text over every split in batches of 16 rows; it returns an
# "embedding" entry for each batch.
dataset = dataset.map(
    embed_text,
    batched=True,
    batch_size=16,
)

100%|██████████| 26/26 [00:01<00:00, 16.00ba/s]
100%|██████████| 193/193 [00:41<00:00,  4.62ba/s]


In [8]:
from collections import defaultdict

ds = dataset['train']

# Read each column exactly once. Indexing ds['function'][i] / ds['embedding'][i]
# inside the loop fetches the whole column again for every row, which makes the
# loop quadratic in the number of rows.
train_labels = ds['function']
train_embeddings = ds['embedding']

# Group the embeddings by their function label.
embeddings_by_function = defaultdict(list)
for function, emb in zip(train_labels, train_embeddings):
    embeddings_by_function[function].append(torch.FloatTensor(emb))

# One "meta-embedding" per function: the element-wise mean of its embeddings.
# A plain dict is used so that a later lookup of an unknown label cannot
# silently insert an empty entry.
metaemb = {
    function: torch.stack(embs).mean(dim=0)
    for function, embs in embeddings_by_function.items()
}

print(metaemb.keys())

dict_keys(['DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY', 'OTHER'])


In [9]:
# Apply cosine_sim to the test split in batches of 16 rows; it adds the
# 'predicted_label' and 'score' columns.
dataset['test'] = dataset['test'].map(
    cosine_sim,
    batched=True,
    batch_size=16,
)

100%|██████████| 193/193 [00:01<00:00, 97.83ba/s]


In [10]:
from sklearn.metrics import classification_report

# Gold labels vs. predicted labels of the test split.
y_true, y_pred = dataset['test']["function"], dataset['test']["predicted_label"]
# zero_division=0 is passed so undefined metrics are reported as 0.
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.13      0.33      0.19       244
     DISLIKE       0.30      0.03      0.06       184
       OTHER       0.81      0.69      0.74      2316
 PESSIMISTIC       0.09      0.13      0.10       278
     WORRIED       0.13      0.22      0.16        51

    accuracy                           0.56      3073
   macro avg       0.29      0.28      0.25      3073
weighted avg       0.65      0.56      0.59      3073



In [11]:
# Same report as a nested dict so it can be turned into a table.
clf_report = classification_report(
    y_true,
    y_pred,
    zero_division=0,
    output_dict=True,
)
# NOTE: this rebinds `df` to the report table (one row per label/average).
df = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a handle opened with open(..., 'w'): a
# text-mode handle without newline='' doubles the carriage returns on Windows,
# whereas pandas opens the file itself with the correct newline handling.
df.to_csv(r'classification_report_functions_basic_emoevent_metaembedding.csv')

In [12]:
# Columns shown in (and later exported from) the per-example prediction table.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

Unnamed: 0,text,emotion,function,predicted_label,score
0,"A su corta edad, #GretaThunberg realiza una hu...",sadness,PESSIMISTIC,OTHER,0.235700
1,Here’s to all the badass bitches in Westeros #...,joy,OTHER,OTHER,0.387474
2,¿PARA QUÉ LES SEGUÍS CON LOS DRAGONES? ESO ES ...,surprise,OTHER,OTHER,0.452714
3,The book collection campaign was originated by...,others,OTHER,OTHER,0.151092
4,USER You know who’s happy about the social and...,disgust,DISLIKE,OTHER,0.355766
...,...,...,...,...,...
3068,#PedroPresidente #ElecccionesGenerales28A #pso...,others,OTHER,OTHER,0.396823
3069,"Ajá, ¿Dónde están los que decían que Guaidó US...",others,OTHER,OTHER,0.359550
3070,THANOS (with the İnfinity Gauntlet) He missed...,others,OTHER,PESSIMISTIC,0.217733
3071,Greetings to all my #bookworms friends on #Wor...,joy,OTHER,OTHER,0.337349


In [13]:
# Write the prediction table as a tab-separated file with a header row and
# without the index column.
df_test.to_csv(
    'preds_functions_basic_emoevent_metaembedding.tsv',
    header=True,
    sep='\t',
    index=False,
)

# Function extended experiment

In [14]:
comfunctions_extended = 'comfunct_extended.txt'
# Read the ';'-separated file and keep only the rows whose function is one of
# the five labels listed here.
df = (
    pd.read_csv(comfunctions_extended, delimiter=";")
    .loc[lambda frame: frame['function'].isin(['ANGRY', 'DISLIKE', 'OTHER', 'PESSIMISTIC', 'WORRIED'])]
)

df

Unnamed: 0,text,function
0,"Hey #friends, just wanted to clarify that the ...",OTHER
1,Good morning everyone! ☀️ Just wanted to say a...,OTHER
2,"Hey #team, don't forget our meeting at 2pm tod...",OTHER
3,"Sorry for the late reply, I was swamped at wor...",OTHER
4,"Hey, can someone introduce me to @jane_doe? I'...",OTHER
...,...,...
2106,i hate this,DISLIKE
2107,this is the worst,DISLIKE
2108,"Ew, spinach in my salad again. 😒 #NotAVegHead",DISLIKE
2109,🤢 Don't like this new food trend 🍔 #StickToThe...,DISLIKE


In [15]:
# The filtered frame `df` becomes the "train" split (index reset so the old
# row labels are dropped); the EmoEvent test split is used as "test".
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(df.reset_index(drop=True)),
        "test": emoevent['test'],
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'function'],
        num_rows: 733
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 3073
    })
})

In [16]:
# Run embed_text over every split in batches of 16 rows; it returns an
# "embedding" entry for each batch.
dataset = dataset.map(
    embed_text,
    batched=True,
    batch_size=16,
)

100%|██████████| 46/46 [00:04<00:00, 10.84ba/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9cd176735da0c6a9.arrow


In [17]:
from collections import defaultdict

ds = dataset['train']

# Read each column exactly once. Indexing ds['function'][i] / ds['embedding'][i]
# inside the loop fetches the whole column again for every row, which makes the
# loop quadratic in the number of rows.
train_labels = ds['function']
train_embeddings = ds['embedding']

# Group the embeddings by their function label.
embeddings_by_function = defaultdict(list)
for function, emb in zip(train_labels, train_embeddings):
    embeddings_by_function[function].append(torch.FloatTensor(emb))

# One "meta-embedding" per function: the element-wise mean of its embeddings.
# A plain dict is used so that a later lookup of an unknown label cannot
# silently insert an empty entry.
metaemb = {
    function: torch.stack(embs).mean(dim=0)
    for function, embs in embeddings_by_function.items()
}

print(metaemb.keys())

dict_keys(['OTHER', 'ANGRY', 'WORRIED', 'PESSIMISTIC', 'DISLIKE'])


In [18]:
# Apply cosine_sim to the test split in batches of 16 rows; it adds the
# 'predicted_label' and 'score' columns.
dataset['test'] = dataset['test'].map(
    cosine_sim,
    batched=True,
    batch_size=16,
)

100%|██████████| 193/193 [00:01<00:00, 97.36ba/s]


In [19]:
from sklearn.metrics import classification_report

# Gold labels vs. predicted labels of the test split.
y_true, y_pred = dataset['test']["function"], dataset['test']["predicted_label"]
# zero_division=0 is passed so undefined metrics are reported as 0.
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.14      0.37      0.21       244
     DISLIKE       0.15      0.20      0.17       184
       OTHER       0.82      0.56      0.67      2316
 PESSIMISTIC       0.10      0.10      0.10       278
     WORRIED       0.06      0.39      0.11        51

    accuracy                           0.48      3073
   macro avg       0.26      0.33      0.25      3073
weighted avg       0.65      0.48      0.54      3073



In [20]:
# Same report as a nested dict so it can be turned into a table.
clf_report = classification_report(
    y_true,
    y_pred,
    zero_division=0,
    output_dict=True,
)
# NOTE: this rebinds `df` to the report table (one row per label/average).
df = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a handle opened with open(..., 'w'): a
# text-mode handle without newline='' doubles the carriage returns on Windows,
# whereas pandas opens the file itself with the correct newline handling.
df.to_csv(r'classification_report_functions_extended_emoevent_metaembedding.csv')

In [21]:
# Columns shown in (and later exported from) the per-example prediction table.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

Unnamed: 0,text,emotion,function,predicted_label,score
0,"A su corta edad, #GretaThunberg realiza una hu...",sadness,PESSIMISTIC,WORRIED,0.328670
1,Here’s to all the badass bitches in Westeros #...,joy,OTHER,ANGRY,0.479460
2,¿PARA QUÉ LES SEGUÍS CON LOS DRAGONES? ESO ES ...,surprise,OTHER,ANGRY,0.581448
3,The book collection campaign was originated by...,others,OTHER,OTHER,0.145160
4,USER You know who’s happy about the social and...,disgust,DISLIKE,OTHER,0.439268
...,...,...,...,...,...
3068,#PedroPresidente #ElecccionesGenerales28A #pso...,others,OTHER,OTHER,0.489922
3069,"Ajá, ¿Dónde están los que decían que Guaidó US...",others,OTHER,ANGRY,0.421708
3070,THANOS (with the İnfinity Gauntlet) He missed...,others,OTHER,PESSIMISTIC,0.374856
3071,Greetings to all my #bookworms friends on #Wor...,joy,OTHER,OTHER,0.342155


In [22]:
# Write the prediction table as a tab-separated file with a header row and
# without the index column.
df_test.to_csv(
    'preds_functions_extended_emoevent_metaembedding.tsv',
    header=True,
    sep='\t',
    index=False,
)

# EmoEvent train (es + en, baseline) experiment

In [23]:
# Baseline: the EmoEvent training split itself supplies the "train" rows;
# the EmoEvent test split is used as "test".
dataset = DatasetDict(
    {
        "train": emoevent['train'],
        "test": emoevent['test'],
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 10835
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 3073
    })
})

In [24]:
# Run embed_text over every split in batches of 16 rows; it returns an
# "embedding" entry for each batch.
dataset = dataset.map(
    embed_text,
    batched=True,
    batch_size=16,
)

100%|██████████| 678/678 [02:07<00:00,  5.32ba/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-960b1f9335da6974/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9cd176735da0c6a9.arrow


In [None]:
from collections import defaultdict

ds = dataset['train']

# Read each column exactly once. Indexing ds['function'][i] / ds['embedding'][i]
# inside the loop fetches the whole column again for every row, which makes the
# loop quadratic in the number of rows.
train_labels = ds['function']
train_embeddings = ds['embedding']

# Group the embeddings by their function label.
embeddings_by_function = defaultdict(list)
for function, emb in zip(train_labels, train_embeddings):
    embeddings_by_function[function].append(torch.FloatTensor(emb))

# One "meta-embedding" per function: the element-wise mean of its embeddings.
# A plain dict is used so that a later lookup of an unknown label cannot
# silently insert an empty entry.
metaemb = {
    function: torch.stack(embs).mean(dim=0)
    for function, embs in embeddings_by_function.items()
}

print(metaemb.keys())

In [None]:
# Apply cosine_sim to the test split in batches of 16 rows; it adds the
# 'predicted_label' and 'score' columns.
dataset['test'] = dataset['test'].map(
    cosine_sim,
    batched=True,
    batch_size=16,
)

In [None]:
from sklearn.metrics import classification_report

# Gold labels vs. predicted labels of the test split.
y_true, y_pred = dataset['test']["function"], dataset['test']["predicted_label"]
# zero_division=0 is passed so undefined metrics are reported as 0.
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

In [None]:
# Same report as a nested dict so it can be turned into a table.
clf_report = classification_report(
    y_true,
    y_pred,
    zero_division=0,
    output_dict=True,
)
# NOTE: this rebinds `df` to the report table (one row per label/average).
df = pd.DataFrame(clf_report).transpose()
# Hand pandas the path instead of a handle opened with open(..., 'w'): a
# text-mode handle without newline='' doubles the carriage returns on Windows,
# whereas pandas opens the file itself with the correct newline handling.
df.to_csv(r'classification_report_baseline_emoevent_metaembedding.csv')

In [None]:
# Columns shown in (and later exported from) the per-example prediction table.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

In [None]:
# Write the prediction table as a tab-separated file with a header row and
# without the index column.
df_test.to_csv(
    'preds_baseline_emoevent_metaembedding.tsv',
    header=True,
    sep='\t',
    index=False,
)