In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
# Fix every RNG seed up front so repeated runs produce the same numbers.
RANDOM_SEED = 64
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

if torch.cuda.is_available():
    # Seed the current GPU and every other visible GPU.
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint used for sentence embeddings. `tokenizer` and `model` are read as
# module-level globals by embed_text.
model_ckpt = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        Tensor with the token axis (dim 1) reduced by a mask-weighted mean.
    """
    per_token = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight tokens.
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_total = (per_token * weights).sum(dim=1)
    # Clamp the token count so a fully-masked row cannot divide by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_count

def embed_text(examples):
    """Embed a batch of texts with the module-level `tokenizer` and `model`.

    Args:
        examples: Batch mapping with a "text" entry holding the strings to embed.

    Returns:
        Dict with a single "embedding" entry: a NumPy array with one
        mean-pooled vector per input text.
    """
    inputs = tokenizer(examples["text"], padding=True, truncation=True,
                       return_tensors="pt")
    # Put the inputs on the model's device; a no-op on CPU, but required when
    # the model has been moved to a GPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        model_output = model(**inputs)
    pooled_embeds = mean_pooling(model_output, inputs["attention_mask"])
    # Back to CPU/NumPy so the result can be stored as a dataset column.
    return {"embedding": pooled_embeds.cpu().numpy()}

from sentence_transformers import util
def cosine_sim(batch):
    """Label each embedding in `batch` with its most similar meta-embedding.

    Reads the module-level `metaemb` mapping (label -> reference vector) and
    scores every entry of batch['embedding'] against each reference using
    cosine similarity.

    Returns:
        The same `batch`, with 'predicted_label' (best-scoring label) and
        'score' (its cosine similarity) added.
    """
    labels, best_scores = [], []
    for emb in batch['embedding']:
        similarities = {}
        for func in metaemb:
            similarities[func] = util.pytorch_cos_sim(emb, metaemb[func]).item()
        # On ties the first label in metaemb's iteration order wins.
        winner = max(similarities, key=similarities.get)
        labels.append(winner)
        best_scores.append(similarities[winner])
    batch['predicted_label'] = labels
    batch['score'] = best_scores
    return batch

In [3]:
# Communicative function -> emotion labels counted as that function.
# Functions without an emotion counterpart carry a single empty-string placeholder.
function_mapping = {
    'OTHER': ['joy', 'love', 'surprise'],
    'NOT_INTERESTED': [''],
    'DISLIKE': [''],
    'NOT_CORRECT': [''],
    'PESSIMISTIC': ['sadness'],
    'WORRIED': ['fear'],
    'ANGRY': ['anger'],
    'DISAPPOINTED': [''],
    'BORED': [''],
    'NOT_APPROVE': [''],
    'NOT_IMPORTANT': [''],
    'DISAGREE': [''],
    'WARN': [''],
    'COMPLAIN': [''],
    'THREATEN': [''],
    'UNWILLING': [''],
    'DISTRUST': [''],
    'REFUSE': [''],
}

# Load the `emotion` dataset and replace its integer 'label' column with a
# readable 'emotion' name.
emotion = load_dataset('emotion')
mapping = dict(enumerate(
    ["sadness", "joy", "love", "anger", "fear", "surprise"]
))
emotion = emotion.map(
    lambda row: {'emotion': mapping[row['label']]},
    remove_columns=['label'],
)
def map_labels(batch):
    """Add a 'function' entry giving the communicative function of each row.

    Each row's batch['emotion'] value is looked up in the module-level
    `function_mapping`. Rows whose emotion is listed under no function keep
    an empty list.

    Returns:
        The same `batch` with the 'function' entry added.
    """
    # Invert function_mapping (emotion -> function). If an emotion is listed
    # under several functions, the last one in dict order wins.
    emotion_to_function = {}
    for function_name, emotion_names in function_mapping.items():
        for emotion_name in emotion_names:
            emotion_to_function[emotion_name] = function_name

    batch['function'] = [
        emotion_to_function.get(batch["emotion"][row], [])
        for row in range(len(batch['text']))
    ]
    return batch
# Add the 'function' column (derived from 'emotion' by map_labels) to the emotion dataset.
emotion = emotion.map(map_labels, batched=True) 

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
100%|██████████| 3/3 [00:00<00:00, 542.23it/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-da33b75eb48814c5.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-c9fa37d993431ea5.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-9e625d0ab4c5791f.arrow
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c670972

# Function basic experiment

In [4]:
# Load the "basic" function utterances (semicolon-separated) and drop every
# function listed below, so only the remaining functions are kept for training.
comfunctions_basic = 'comfunct_basic.txt'
df = pd.read_csv(comfunctions_basic, sep=";")
df = df.loc[~df['function'].isin([
    'NOT_INTERESTED', 'BORED', 'DISLIKE', 'NOT_CORRECT', 'DISAPPOINTED',
    'COMPLAIN', 'NOT_APPROVE', 'NOT_IMPORTANT', 'DISAGREE', 'REFUSE',
    'WARN', 'THREATEN', 'UNWILLING', 'DISTRUST',
])]

df

Unnamed: 0,text,function
209,I'm fairly pessimistic about it,PESSIMISTIC
210,I'm fairly pessimistic,PESSIMISTIC
211,I'm pessimistic about it,PESSIMISTIC
212,I'm pessimistic,PESSIMISTIC
213,I'm rather sceptical about it,PESSIMISTIC
...,...,...
897,you could do it,OTHER
898,you look nice,OTHER
899,you look smart,OTHER
900,you're right,OTHER


In [5]:
# Train on the filtered function utterances, evaluate on the emotion test split.
dataset = DatasetDict({
    "train": Dataset.from_pandas(df.reset_index(drop=True)),
    "test": emotion['test'],
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'function'],
        num_rows: 360
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 2000
    })
})

In [6]:
# Add an "embedding" column to both splits by running embed_text over batches of 16 texts.
dataset = dataset.map(embed_text, batched=True, batch_size=16)

100%|██████████| 23/23 [00:01<00:00, 19.95ba/s]
100%|██████████| 125/125 [00:16<00:00,  7.69ba/s]


In [7]:
from collections import defaultdict

# Build one "meta-embedding" per function: the mean of all training
# embeddings that carry that function label.
ds = dataset['train']

# Fetch each column once. Indexing ds['col'][i] inside the loop would fetch
# the whole column again on every iteration.
train_functions = ds['function']
train_embeddings = ds['embedding']

grouped = defaultdict(list)
for function, emb in zip(train_functions, train_embeddings):
    grouped[function].append(torch.FloatTensor(emb))

# Store the result in a plain dict so a lookup with an unknown label raises
# KeyError instead of silently yielding an empty list.
metaemb = {function: torch.stack(embs).mean(dim=0)
           for function, embs in grouped.items()}

print(metaemb.keys())

dict_keys(['PESSIMISTIC', 'WORRIED', 'ANGRY', 'OTHER'])


In [8]:
# Score every test embedding against the meta-embeddings; cosine_sim adds 'predicted_label' and 'score'.
dataset['test'] = dataset['test'].map(cosine_sim, batched=True, batch_size=16)

100%|██████████| 125/125 [00:01<00:00, 98.13ba/s]


In [9]:
from sklearn.metrics import classification_report
# Gold function labels vs. nearest-meta-embedding predictions on the test split.
y_true = dataset['test']["function"]
y_pred = dataset['test']["predicted_label"]
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.32      0.55      0.40       275
       OTHER       0.81      0.69      0.75       920
 PESSIMISTIC       0.43      0.14      0.22       581
     WORRIED       0.30      0.71      0.42       224

    accuracy                           0.52      2000
   macro avg       0.46      0.52      0.45      2000
weighted avg       0.57      0.52      0.51      2000



In [10]:
# Recompute the report as a dict so it can be saved as a table
# (one row per class/average, one column per metric).
clf_report = classification_report(
    y_true, y_pred, zero_division=0, output_dict=True
)
# NOTE: this reuses the name `df`, replacing the training frame loaded earlier.
df = pd.DataFrame(clf_report).transpose()
# Pass the path instead of a handle from a bare open(..., 'w'): pandas then
# opens the file itself with the newline and encoding settings it expects,
# which avoids blank rows on Windows and locale-dependent encoding.
df.to_csv(r'classification_report_functions_basic_emotion_metaembedding.csv')

In [11]:
# Per-example view of the test split: text, gold labels, prediction and score.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

Unnamed: 0,text,emotion,function,predicted_label,score
0,im feeling rather rotten so im not very ambiti...,sadness,PESSIMISTIC,PESSIMISTIC,0.380152
1,im updating my blog because i feel shitty,sadness,PESSIMISTIC,ANGRY,0.428747
2,i never make her separate from me because i do...,sadness,PESSIMISTIC,WORRIED,0.215277
3,i left with my bouquet of red and yellow tulip...,joy,OTHER,OTHER,0.333295
4,i was feeling a little vain when i did this one,sadness,PESSIMISTIC,OTHER,0.457706
...,...,...,...,...,...
1995,i just keep feeling like someone is being unki...,anger,ANGRY,ANGRY,0.357083
1996,im feeling a little cranky negative after this...,anger,ANGRY,WORRIED,0.494988
1997,i feel that i am useful to my people and that ...,joy,OTHER,OTHER,0.358507
1998,im feeling more comfortable with derby i feel ...,joy,OTHER,OTHER,0.369901


In [12]:
# Save per-example predictions for the basic experiment as TSV. The test split
# is the `emotion` dataset, so the file is named after it, matching the
# classification-report file of this experiment.
df_test.to_csv('preds_functions_basic_emotion_metaembedding.tsv', header=True, sep='\t', index=False)

# Function extended experiment

In [15]:
# Load the "extended" function utterances (semicolon-separated) and drop every
# function listed below, so only the remaining functions are kept for training.
comfunctions_extended = 'comfunct_extended.txt'
df = pd.read_csv(comfunctions_extended, sep=";")
df = df.loc[~df['function'].isin([
    'NOT_INTERESTED', 'BORED', 'DISLIKE', 'NOT_CORRECT', 'DISAPPOINTED',
    'COMPLAIN', 'NOT_APPROVE', 'NOT_IMPORTANT', 'DISAGREE', 'REFUSE',
    'WARN', 'THREATEN', 'UNWILLING', 'DISTRUST',
])]

df

Unnamed: 0,text,function
0,"Hey #friends, just wanted to clarify that the ...",OTHER
1,Good morning everyone! ☀️ Just wanted to say a...,OTHER
2,"Hey #team, don't forget our meeting at 2pm tod...",OTHER
3,"Sorry for the late reply, I was swamped at wor...",OTHER
4,"Hey, can someone introduce me to @jane_doe? I'...",OTHER
...,...,...
1805,just waiting for the other shoe to drop,PESSIMISTIC
1806,I don't think this is going to go well #notloo...,PESSIMISTIC
1807,I have a feeling things are going to turn out ...,PESSIMISTIC
1808,I don't have high hopes for this #dreadingit,PESSIMISTIC


In [16]:
# Train on the filtered extended utterances, evaluate on the emotion test split.
dataset = DatasetDict({
    "train": Dataset.from_pandas(df.reset_index(drop=True)),
    "test": emotion['test'],
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'function'],
        num_rows: 584
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 2000
    })
})

In [17]:
# Add an "embedding" column to both splits by running embed_text over batches of 16 texts.
dataset = dataset.map(embed_text, batched=True, batch_size=16)

100%|██████████| 37/37 [00:03<00:00, 10.71ba/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-331e76243cfd2cfd.arrow


In [18]:
from collections import defaultdict

# Build one "meta-embedding" per function: the mean of all training
# embeddings that carry that function label.
ds = dataset['train']

# Fetch each column once. Indexing ds['col'][i] inside the loop would fetch
# the whole column again on every iteration.
train_functions = ds['function']
train_embeddings = ds['embedding']

grouped = defaultdict(list)
for function, emb in zip(train_functions, train_embeddings):
    grouped[function].append(torch.FloatTensor(emb))

# Store the result in a plain dict so a lookup with an unknown label raises
# KeyError instead of silently yielding an empty list.
metaemb = {function: torch.stack(embs).mean(dim=0)
           for function, embs in grouped.items()}

print(metaemb.keys())

dict_keys(['OTHER', 'ANGRY', 'WORRIED', 'PESSIMISTIC'])


In [19]:
# Score every test embedding against the meta-embeddings; cosine_sim adds 'predicted_label' and 'score'.
dataset['test'] = dataset['test'].map(cosine_sim, batched=True, batch_size=16)

100%|██████████| 125/125 [00:01<00:00, 111.40ba/s]


In [20]:
from sklearn.metrics import classification_report
# Gold function labels vs. nearest-meta-embedding predictions on the test split.
y_true = dataset['test']["function"]
y_pred = dataset['test']["predicted_label"]
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.43      0.40      0.42       275
       OTHER       0.82      0.59      0.69       920
 PESSIMISTIC       0.49      0.60      0.54       581
     WORRIED       0.37      0.60      0.46       224

    accuracy                           0.57      2000
   macro avg       0.53      0.55      0.53      2000
weighted avg       0.62      0.57      0.58      2000



In [21]:
# Recompute the report as a dict so it can be saved as a table
# (one row per class/average, one column per metric).
clf_report = classification_report(
    y_true, y_pred, zero_division=0, output_dict=True
)
# NOTE: this reuses the name `df`, replacing the training frame loaded earlier.
df = pd.DataFrame(clf_report).transpose()
# Pass the path instead of a handle from a bare open(..., 'w'): pandas then
# opens the file itself with the newline and encoding settings it expects,
# which avoids blank rows on Windows and locale-dependent encoding.
df.to_csv(r'classification_report_functions_extended_emotion_metaembedding.csv')

In [22]:
# Per-example view of the test split: text, gold labels, prediction and score.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

Unnamed: 0,text,emotion,function,predicted_label,score
0,im feeling rather rotten so im not very ambiti...,sadness,PESSIMISTIC,PESSIMISTIC,0.590251
1,im updating my blog because i feel shitty,sadness,PESSIMISTIC,PESSIMISTIC,0.500468
2,i never make her separate from me because i do...,sadness,PESSIMISTIC,ANGRY,0.245638
3,i left with my bouquet of red and yellow tulip...,joy,OTHER,OTHER,0.391345
4,i was feeling a little vain when i did this one,sadness,PESSIMISTIC,PESSIMISTIC,0.410864
...,...,...,...,...,...
1995,i just keep feeling like someone is being unki...,anger,ANGRY,ANGRY,0.421779
1996,im feeling a little cranky negative after this...,anger,ANGRY,WORRIED,0.519338
1997,i feel that i am useful to my people and that ...,joy,OTHER,OTHER,0.390328
1998,im feeling more comfortable with derby i feel ...,joy,OTHER,PESSIMISTIC,0.382317


In [23]:
# Save per-example predictions for the extended experiment as a tab-separated file without the index.
df_test.to_csv('preds_functions_extended_emotion_metaembedding.tsv',header =True, sep = '\t',index=False)

# Emotion train (baseline) experiment

In [24]:
# Baseline: build the meta-embeddings from the emotion train split itself.
dataset = DatasetDict({
    "train": emotion['train'],
    "test": emotion['test'],
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'emotion', 'function'],
        num_rows: 2000
    })
})

In [25]:
# Add an "embedding" column to both splits by running embed_text over batches of 16 texts.
dataset = dataset.map(embed_text, batched=True, batch_size=16)

100%|██████████| 1000/1000 [02:02<00:00,  8.16ba/s]
Loading cached processed dataset at /home/njfernandez/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-331e76243cfd2cfd.arrow


In [None]:
from collections import defaultdict

# Build one "meta-embedding" per function: the mean of all training
# embeddings that carry that function label.
ds = dataset['train']

# Fetch each column once. Indexing ds['col'][i] inside the loop would fetch
# the whole column again on every iteration, which is quadratic work on this
# 16000-row train split.
train_functions = ds['function']
train_embeddings = ds['embedding']

grouped = defaultdict(list)
for function, emb in zip(train_functions, train_embeddings):
    grouped[function].append(torch.FloatTensor(emb))

# Store the result in a plain dict so a lookup with an unknown label raises
# KeyError instead of silently yielding an empty list.
metaemb = {function: torch.stack(embs).mean(dim=0)
           for function, embs in grouped.items()}

print(metaemb.keys())

In [None]:
# Score every test embedding against the meta-embeddings; cosine_sim adds 'predicted_label' and 'score'.
dataset['test'] = dataset['test'].map(cosine_sim, batched=True, batch_size=16)

In [34]:
from sklearn.metrics import classification_report
# Gold function labels vs. nearest-meta-embedding predictions on the test split.
y_true = dataset['test']["function"]
y_pred = dataset['test']["predicted_label"]
clf_report = classification_report(y_true, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

       ANGRY       0.45      0.57      0.50       275
       OTHER       0.84      0.73      0.78       920
 PESSIMISTIC       0.66      0.59      0.62       581
     WORRIED       0.47      0.70      0.56       224

    accuracy                           0.67      2000
   macro avg       0.61      0.65      0.62      2000
weighted avg       0.69      0.67      0.67      2000



In [None]:
# Recompute the report as a dict so it can be saved as a table
# (one row per class/average, one column per metric).
clf_report = classification_report(
    y_true, y_pred, zero_division=0, output_dict=True
)
# NOTE: this reuses the name `df` for the report table.
df = pd.DataFrame(clf_report).transpose()
# Pass the path instead of a handle from a bare open(..., 'w'): pandas then
# opens the file itself with the newline and encoding settings it expects,
# which avoids blank rows on Windows and locale-dependent encoding.
df.to_csv(r'classification_report_baseline_emotion_metaembedding.csv')

In [None]:
# Per-example view of the test split: text, gold labels, prediction and score.
cols = ["text", "emotion", "function", "predicted_label", "score"]
df_test = dataset['test'].to_pandas().loc[:, cols]
df_test

In [None]:
# Save per-example predictions for the baseline experiment as a tab-separated file without the index.
df_test.to_csv('preds_baseline_emotion_metaembedding.tsv',header =True, sep = '\t',index=False)