# Training our deep learning model

Code taken from   
https://huggingface.co/transformers/custom_datasets.html  
https://towardsdatascience.com/training-a-multi-label-emotion-classifier-with-tez-and-pytorch-af04c899a63a
  - It is unclear how much of that code still remains, but since we started from it, we credit it here.

In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, DistilBertForSequenceClassification, AdamW, AutoTokenizer, PretrainedConfig, file_utils, TextClassificationPipeline, Pipeline
from torch.utils.data import DataLoader
import torch
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from source.emotion import all_emotions
from source.classification_utils import MultiLabelTextClassification, analyze_result

# Initialization and dataset preparation

In [None]:
# Checkpoint name used for both the tokenizer and the initial model weights.
model_describ = "distilbert-base-cased"
# Root folder for training output; the final model goes into its "best" subfolder.
output_dir = './results/models'
best_model_path = f"{output_dir}/best"

In [None]:
# Load the "simplified" configuration of the GoEmotions dataset.
data = load_dataset("go_emotions", "simplified")

train_raw, val_raw = data['train'], data['validation']

# Convert each split to a pandas DataFrame for the preprocessing below.
train_df, val_df, test_df = (
    split.to_pandas() for split in (train_raw, val_raw, data['test'])
)

In [None]:
def one_hot_labels(df, n_labels):
    """Build a multi-hot label matrix from the ``labels`` column of *df*.

    Args:
        df: DataFrame whose ``labels`` column holds, for every row, a
            sequence of integer class ids.
        n_labels: Total number of classes, i.e. the width of the result.

    Returns:
        Integer array of shape ``(len(df), n_labels)`` that contains a 1 at
        ``[i, c]`` for every class id ``c`` listed in row ``i`` and 0
        everywhere else.

    Raises:
        IndexError: If a class id is out of bounds for ``n_labels``.
    """
    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int``
    # selects the same dtype.
    one_hot = np.zeros((len(df), n_labels), dtype=int)
    # Iterating the Series yields its values directly, which replaces
    # ``Series.iteritems()`` (removed in pandas 2.0) and ignores the index.
    for i, label_ids in enumerate(df["labels"]):
        # An explicit integer dtype keeps an empty label list a valid index.
        one_hot[i, np.asarray(label_ids, dtype=int)] = 1
    return one_hot

# Width of the one-hot label matrices (number of classes).
n_labels = 28

# Same encoding for all three splits.
train_oh_labels, val_oh_labels, test_oh_labels = (
    one_hot_labels(frame, n_labels) for frame in (train_df, val_df, test_df)
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_describ)

# Each split is tokenized in its own call with padding and truncation
# enabled; an explicit max_length could still be passed here.
train_encodings, val_encodings, test_encodings = (
    tokenizer(frame['text'].values.tolist(), padding=True, truncation=True)
    for frame in (train_df, val_df, test_df)
)

In [None]:
class EmotionsDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their label rows.

    Indexing returns a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor taken from the same position in ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label container determines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and one-hot labels in a torch dataset.
train_dataset = EmotionsDataset(encodings=train_encodings, labels=train_oh_labels)
val_dataset = EmotionsDataset(encodings=val_encodings, labels=val_oh_labels)
test_dataset = EmotionsDataset(encodings=test_encodings, labels=test_oh_labels)

In [None]:
#https://huggingface.co/transformers/main_classes/trainer.html
class MultilabelTrainer(Trainer):
    """Trainer variant that scores logits with a multi-label loss.

    The loss is ``torch.nn.BCEWithLogitsLoss`` computed between the model's
    logits and the float-cast ``labels`` entry of the batch.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the binary cross-entropy-with-logits loss for one batch.

        Args:
            model: Model to call with the remaining batch entries.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions that pass this argument; unused here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        # Labels are popped so the model does not receive them in the forward call.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs float targets of the same shape as the logits.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Map each class index to its emotion name, in the order of all_emotions.
labels = all_emotions
id2label = dict(enumerate(labels))

# Training

In [None]:
training_args = TrainingArguments(
    # Output locations
    output_dir=output_dir,
    logging_dir='./logs',
    # Epochs and per-device batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimisation
    warmup_steps=500,              # warmup steps for the learning rate scheduler
    weight_decay=0.01,
    # Logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy="steps",
    save_total_limit=10,
    load_best_model_at_end=True,   # selection criterion defaults to the loss
)

# Pretrained DistilBERT with a classification head sized for n_labels classes.
model = DistilBertForSequenceClassification.from_pretrained(
    model_describ,
    num_labels=n_labels,
    id2label=id2label,
)

# Train on the training split and evaluate on the validation split.
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training, then save the trainer's model to best_model_path.
# load_best_model_at_end=True is set in training_args above.
trainer.train()
trainer.save_model(best_model_path)

# Analyze the model

In [None]:
# Reload the model that was saved to best_model_path after training.
model2 = DistilBertForSequenceClassification.from_pretrained(best_model_path)

In [None]:
# A second trainer without datasets; it is only used to call predict().
prediction_trainer = MultilabelTrainer(
    model=model2,                         # the reloaded Hugging Face Transformers model
    args=training_args,                  # training arguments, defined above
)

In [None]:
# Run the reloaded model on the validation split; `output` exposes
# `.predictions` and `.label_ids`, which are read below.
output = prediction_trainer.predict(val_dataset)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Raw model outputs for the validation split (the sigmoid is applied in the next step).
y_pred = output.predictions

In [None]:
# Turn logits into per-label probabilities with a numerically stable sigmoid:
# sigmoid(x) = exp(-log(1 + exp(-x))). The naive exp(x) / (1 + exp(x)) form
# overflows to inf / inf = nan for large positive logits.
# Reading from output.predictions (instead of y_pred) keeps this cell
# idempotent, so re-running it never applies the sigmoid twice.
y_pred = np.exp(-np.logaddexp(0, -output.predictions))
y_true = output.label_ids

In [None]:
# Mean squared error between predicted probabilities and the true labels.
np.square(y_pred - y_true).mean()

In [None]:
# One (initially empty) list of per-line predictions for every main character.
predicted_emotions = {character: [] for character in main_characters}

# NOTE: an earlier zero-shot variant (zero_shot_pipeline with multi_label=True
# and a 0.8 threshold) was replaced by the inference pipeline used below.
for row in tqdm(dataset.itertuples(), total=len(dataset)):
    prediction = inference_pipeline(row.line)
    result = analyze_result(prediction, .2)

    # Keep only (label, score) pairs from the first entry of the analyzed result.
    result = [(entry['label'], entry['score']) for entry in result[0]]
    predicted_emotions[row.person].append(result)