In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Is gpu available?', device=='cuda')
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#from transformers import pipeline
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

Is gpu available? True


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
emo2idx = {}
idx2emo = {}
i = 0
with open('./data/emotions.txt','r') as f:
    for line in f:
        emo2idx[line.strip()]=i
        idx2emo[i] = line.strip()
        i+=1
emo_list = list(emo2idx.keys())
emo_list.remove('neutral')
idx2emo_no_neutral = idx2emo.copy()
emo2idx_no_neutral = emo2idx.copy()
del idx2emo_no_neutral[27]
del emo2idx_no_neutral['neutral']
#emo_list
from datasets import load_dataset
dataset = load_dataset("csv",data_files=
                       {"train": "data/train.csv",
                        "validation": "data/val.csv",
                         "test": "data/test.csv"})

Found cached dataset csv (C:/Users/lkkcp/.cache/huggingface/datasets/csv/default-f4006db2e00e5a81/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 117.36it/s]


In [8]:
model_checkpoint = "roberta-base"
MAXLEN = 128
batch_size = 8
metric_name = "f1"
epochs = 24
weight_decay = 0.01
learning_rate = 3e-6   #Default is 2e-5

    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def encode(samples):
    return tokenizer(samples['Text'], padding="max_length", truncation=True, max_length=MAXLEN)
def label_formatting(samples):
    labels_matrix = np.zeros((len(samples['Text']), len(emo_list)))
    for i, emo in enumerate(emo_list):
        labels_matrix[:,i] = samples[emo]
    return {'labels':labels_matrix.tolist()}

temp = dataset.map(encode,batched=True)
encoded_dataset = temp.map(label_formatting,batched=True,remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, 
                                                       problem_type="multi_label_classification", 
                                                       num_labels=len(emo_list),
                                                       id2label=idx2emo_no_neutral,
                                                       label2id=emo2idx_no_neutral).to(device)


args = TrainingArguments(
    f"{model_checkpoint}-lr-{learning_rate}-goemotions",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
print("learning_rate is",learning_rate)
trainer.evaluate()


Loading cached processed dataset at C:\Users\lkkcp\.cache\huggingface\datasets\csv\default-f4006db2e00e5a81\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-d529e26a8d9f73dc.arrow
Loading cached processed dataset at C:\Users\lkkcp\.cache\huggingface\datasets\csv\default-f4006db2e00e5a81\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c38101d62c8e147b.arrow
Loading cached processed dataset at C:\Users\lkkcp\.cache\huggingface\datasets\csv\default-f4006db2e00e5a81\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24f67bfa2d75590c.arrow
Loading cached processed dataset at C:\Users\lkkcp\.cache\huggingface\datasets\csv\default-f4006db2e00e5a81\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0ec25b68e32c1e95.arrow
Loading cached processed dataset at C:\Users\lkkcp\.cache\huggingface\datasets\csv\default-f4006db2e00e5a81\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1308,0.122247,0.393527,0.627709,0.245902
2,0.1069,0.102252,0.513055,0.687216,0.363115
3,0.0965,0.094017,0.569759,0.722415,0.43224
4,0.0908,0.090617,0.604757,0.751123,0.476776
5,0.0859,0.088692,0.615364,0.760281,0.492896
6,0.0811,0.087812,0.616927,0.762605,0.498361
7,0.0756,0.087765,0.622625,0.770256,0.503005
8,0.0751,0.086959,0.625871,0.773406,0.509016
9,0.0734,0.087044,0.619029,0.768904,0.501639
10,0.0685,0.088853,0.623399,0.779408,0.505191


learning_rate is 3e-06


{'eval_loss': 0.08985493332147598,
 'eval_f1': 0.6279041180086048,
 'eval_roc_auc': 0.7820451901506785,
 'eval_accuracy': 0.5103825136612021,
 'eval_runtime': 33.4713,
 'eval_samples_per_second': 109.348,
 'eval_steps_per_second': 13.683,
 'epoch': 24.0}

In [7]:
####  Inference  
text = "Oh yah? Fuck you!"
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [idx2emo[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['anger']
