In [69]:
from transformers import pipeline
import pandas as pd
import os
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
device = 0 if torch.cuda.is_available() else -1


In [2]:
# Browse for models from https://huggingface.co/models
# Make sure that the models are trained on the GoEmotions dataset. 
# In particular it must have the 27 + neutral emotion labels

# The model chosen here is https://huggingface.co/SamLowe/roberta-base-go_emotions
model_checkpoint = 'SamLowe/roberta-base-go_emotions'

In [25]:
classifier = pipeline("sentiment-analysis", model=model_checkpoint, device=device, return_all_scores=True)



In [34]:
classifier("A little fall of rain can hardly hurt me now.")

[[{'label': 'admiration', 'score': 0.0021952707320451736},
  {'label': 'amusement', 'score': 0.0008238746668212116},
  {'label': 'anger', 'score': 0.002433754736557603},
  {'label': 'annoyance', 'score': 0.02740986831486225},
  {'label': 'approval', 'score': 0.09228266775608063},
  {'label': 'caring', 'score': 0.029545331373810768},
  {'label': 'confusion', 'score': 0.0007856108713895082},
  {'label': 'curiosity', 'score': 0.00037318695103749633},
  {'label': 'desire', 'score': 0.002707253908738494},
  {'label': 'disappointment', 'score': 0.024413729086518288},
  {'label': 'disapproval', 'score': 0.026415714994072914},
  {'label': 'disgust', 'score': 0.001952703925780952},
  {'label': 'embarrassment', 'score': 0.000986917526461184},
  {'label': 'excitement', 'score': 0.0004959406796842813},
  {'label': 'fear', 'score': 0.00139901926741004},
  {'label': 'gratitude', 'score': 0.0007475910824723542},
  {'label': 'grief', 'score': 0.0012289411388337612},
  {'label': 'joy', 'score': 0.00530

In [6]:
df_test = pd.read_csv('../reformat_data/test_wide.csv')
df_test.head()

Unnamed: 0,Text,Classes,ID,Labels,admiration,amusement,anger,annoyance,approval,caring,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,I’m really sorry about your situation :( Altho...,25,eecwqtt,['sadness'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,It's wonderful because it's awful. At not with.,0,ed5f85d,['admiration'],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3,['excitement'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1,['gratitude'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,They got bored from haunting earth for thousan...,27,eem5uti,['neutral'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [35]:
%%time
pred = classifier(df_test['Text'].tolist())

CPU times: total: 48.3 s
Wall time: 1min 32s


In [59]:
scores = [[pred[i][j]['score'] for j in range(28)] for i in range(len(pred))]

In [65]:
emos = [pred[0][j]['label'] for j in range(28)]
pred_df = pd.DataFrame(scores, columns = emos)

In [66]:
pred_df.head()

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0.018729,0.007128,0.008085,0.006822,0.033646,0.055766,0.010225,0.010658,0.011346,0.036921,...,0.360813,0.00514,0.015967,0.000952,0.013122,0.003356,0.678301,0.540618,0.002137,0.006323
1,0.660624,0.005426,0.011097,0.040811,0.03991,0.001627,0.004121,0.002642,0.002946,0.025365,...,0.022821,0.001048,0.003491,0.003282,0.007465,0.001523,0.001095,0.007357,0.003618,0.008789
2,0.136872,0.010797,0.001436,0.003379,0.09656,0.198882,0.008116,0.074455,0.024194,0.001129,...,0.005561,0.003278,0.549406,0.002892,0.003619,0.005229,0.001425,0.001705,0.006319,0.021589
3,0.010203,0.007955,0.001315,0.004182,0.012846,0.001286,0.00756,0.00284,0.000896,0.001781,...,0.001401,0.000515,0.00455,0.001014,0.04421,0.005105,0.002286,0.002527,0.012318,0.008356
4,0.002441,0.001946,0.0013,0.014648,0.026051,0.003529,0.001928,0.000889,0.001087,0.044983,...,0.000608,0.002773,0.003553,0.000604,0.042837,0.002923,0.001356,0.050852,0.000813,0.868578


In [75]:
def multi_label_metrics(predictions, y_true, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    binary_pred = (predictions >= threshold).astype(int)
    f1_micro_average = f1_score(y_true=y_true, y_pred=binary_pred, average='micro')
    roc_auc = roc_auc_score(y_true, binary_pred, average = 'micro')
    accuracy = accuracy_score(y_true, binary_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics
print('The micro average scores are')
print(multi_label_metrics(pred_df, df_test[emos]))

The micro average scores are
{'f1': 0.5855114407162884, 'roc_auc': 0.7506083600993346, 'accuracy': 0.47429519071310117}


In [76]:
def individual_label_metrics(predictions, y_true, label, threshold=0.5):
    binary_pred = (predictions[label] >= threshold).astype(int)
    y_true_one_label = y_true[label]
    f1 = f1_score(y_true=y_true_one_label, y_pred=binary_pred)
    roc_auc = roc_auc_score(y_true_one_label, binary_pred)
    accuracy = accuracy_score(y_true_one_label, binary_pred)
    # return as dictionary
    metrics = {'f1': f1,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics


individual_metrics = [individual_label_metrics(pred_df, df_test, emo, threshold=0.5) for emo in emos]

In [79]:
print('Compute the metric for individual emotion categories')
pd.DataFrame(individual_metrics, index = emos)

Compute the metric for individual emotion categories


Unnamed: 0,f1,roc_auc,accuracy
admiration,0.698869,0.8242,0.946011
amusement,0.828829,0.929699,0.982495
anger,0.479233,0.685569,0.969965
annoyance,0.238318,0.574107,0.93993
approval,0.40381,0.644299,0.942325
caring,0.372294,0.654252,0.973282
confusion,0.463158,0.709429,0.971808
curiosity,0.427966,0.669359,0.950249
desire,0.49635,0.702948,0.987286
disappointment,0.301508,0.597632,0.974387


The model does really well in identifying **gratitude, amusement, love**.

It cannot identify (with zero recall) **grief, relief, pride**.

The above three categories (plus nervousness) have very few samples to train on. Grief has <100 samples. 