# Training our deep learning model

Code taken from   
https://huggingface.co/transformers/custom_datasets.html  
https://towardsdatascience.com/training-a-multi-label-emotion-classifier-with-tez-and-pytorch-af04c899a63a
  - Little of the original code may be left, but we started from these sources, so we credit them here.

In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, DistilBertForSequenceClassification, AdamW, AutoTokenizer, PretrainedConfig, file_utils, TextClassificationPipeline, Pipeline, pipeline
from torch.utils.data import DataLoader
import torch
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

from source.emotion import all_emotions
from source.classification_utils import MultiLabelTextClassification, analyze_result

# Initialization and dataset preparation

In [None]:
# Pretrained checkpoint name and the locations where trained models are written.
model_describ = "distilbert-base-cased"
output_dir = './results/models'
best_model_path = f"{output_dir}/best"

In [None]:
# Load the "simplified" configuration of the go_emotions dataset.
data = load_dataset("go_emotions", "simplified")

# Keep the raw train/validation splits next to their DataFrame versions.
train_raw, val_raw = data['train'], data['validation']

train_df, val_df = train_raw.to_pandas(), val_raw.to_pandas()
test_df = data['test'].to_pandas()

In [None]:
def one_hot_labels(df, n_labels):
    """Build a multi-hot label matrix from the ``labels`` column of ``df``.

    Args:
        df: DataFrame whose ``labels`` column holds, per row, a sequence of
            integer class ids.
        n_labels: Total number of classes (number of output columns).

    Returns:
        Integer array of shape ``(len(df), n_labels)`` with a 1 in column
        ``j`` of row ``i`` when class ``j`` is listed for row ``i``.
    """
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the same dtype.
    one_hot = np.zeros((len(df), n_labels), dtype=int)
    # Iterate the column directly: ``Series.iteritems`` was removed in pandas 2.0.
    for i, label_ids in enumerate(df["labels"]):
        # An explicit integer dtype keeps an empty label list a valid index.
        one_hot[i, np.asarray(label_ids, dtype=np.intp)] = 1
    return one_hot

# Width of the one-hot label matrices (also passed to the model as num_labels).
n_labels = 28

# Encode the labels of all three splits in one go.
train_oh_labels, val_oh_labels, test_oh_labels = (
    one_hot_labels(split_df, n_labels) for split_df in (train_df, val_df, test_df)
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_describ)


def _encode_texts(frame):
    """Tokenize the ``text`` column of ``frame`` with padding and truncation."""
    # A max length could still be passed here.
    return tokenizer(frame['text'].values.tolist(), padding=True, truncation=True)


train_encodings = _encode_texts(train_df)
val_encodings = _encode_texts(val_df)
test_encodings = _encode_texts(test_df)

In [None]:
class EmotionsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-sample label rows."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label container as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, labels under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and one-hot labels in a torch dataset.
train_dataset, val_dataset, test_dataset = (
    EmotionsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_oh_labels),
        (val_encodings, val_oh_labels),
        (test_encodings, test_oh_labels),
    )
)

In [None]:
# https://huggingface.co/transformers/main_classes/trainer.html
class MultilabelTrainer(Trainer):
    """Trainer that scores the model's logits with ``BCEWithLogitsLoss``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the binary cross-entropy loss over all labels.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch dict; its ``"labels"`` entry is popped and used as
                the target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass it to ``compute_loss``; unused here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        num_labels = self.model.config.num_labels
        # BCEWithLogitsLoss needs float targets shaped like the logits.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Map each class index to its emotion name, in the order given by all_emotions.
labels = all_emotions
id2label = dict(enumerate(labels))

# Training

In [None]:
# Hyper-parameters for the fine-tuning run, collected in one place.
_training_kwargs = {
    "output_dir": output_dir,              # where checkpoints are written
    "num_train_epochs": 1,                 # total number of training epochs
    "per_device_train_batch_size": 16,     # per-device batch size for training
    "per_device_eval_batch_size": 64,      # per-device batch size for evaluation
    "warmup_steps": 500,                   # warmup steps of the LR scheduler
    "weight_decay": 0.01,                  # strength of weight decay
    "logging_dir": './logs',               # directory for storing logs
    "logging_steps": 100,
    "save_total_limit": 10,
    "evaluation_strategy": "steps",
    "load_best_model_at_end": True,        # defaults to loss as criterion
}
training_args = TrainingArguments(**_training_kwargs)

# Pretrained DistilBERT with a classification head sized for n_labels classes.
model = DistilBertForSequenceClassification.from_pretrained(
    model_describ,
    num_labels=n_labels,
    id2label=id2label,
)

trainer = MultilabelTrainer(
    model=model,                    # model to be trained
    args=training_args,             # training arguments, defined above
    train_dataset=train_dataset,    # training dataset
    eval_dataset=val_dataset,       # evaluation dataset
)

In [None]:
# Fine-tune the model, then write the trainer's current model to best_model_path.
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint by evaluation loss — confirm against the Trainer documentation.
trainer.train()
trainer.save_model(best_model_path)

# Analyze the model
Choose which model to analyze: either the supervised model trained above or a zero-shot classifier. Run only one of the two subsections below, then continue at "Continue here".

### Supervised classification model

In [None]:
# Reload the model that was saved to best_model_path after training.
model2 = DistilBertForSequenceClassification.from_pretrained(best_model_path)

# This trainer is only used for prediction, so no datasets are attached.
prediction_trainer = MultilabelTrainer(model=model2, args=training_args)

In [None]:
# Run the reloaded model on the test set and keep copies of labels and logits.
output = prediction_trainer.predict(test_dataset)
y_true = np.copy(output.label_ids)
y_pred = np.copy(output.predictions)
# Sigmoid computed as exp(-log(1 + exp(-x))): unlike exp(x) / (1 + exp(x)),
# this form cannot overflow to inf / inf = NaN for large logits.
y_pred = np.exp(-np.logaddexp(0, -y_pred))
# Scores above this value count as a predicted label.
prediction_threshold = 0.2

### Zero-shot model

In [None]:
zero_shot_classifier = "typeform/distilbert-base-uncased-mnli"
# Use the first GPU when one is available; -1 selects the CPU, so this cell
# also runs on machines without CUDA instead of failing on device=0.
zero_shot_pipeline = pipeline(
    "zero-shot-classification",
    device=0 if torch.cuda.is_available() else -1,
    model=zero_shot_classifier,
    tokenizer=zero_shot_classifier,
)

In [None]:
# One row per test sample, one score column per emotion.
y_pred = np.zeros((len(test_dataset), len(all_emotions)))
label2id = {name: idx for idx, name in id2label.items()}
for row_idx, sample in enumerate(tqdm(test_df.itertuples(), total=len(test_df))):
    result = zero_shot_pipeline(sample.text, all_emotions, multi_label=True)
    # The pipeline returns labels paired with scores; write each score into
    # the column that belongs to its label.
    columns = [label2id[name] for name in result['labels']]
    y_pred[row_idx, columns] = result['scores']

y_true = test_oh_labels.copy()
prediction_threshold = 0.8

# Optional filter, disabled: drops the rows whose column 27 is set
# (presumably the "neutral" class — verify against all_emotions).
#no_neutral = y_true[:, 27] != 1
#y_true = y_true[no_neutral]
#y_pred = y_pred[no_neutral]

### Continue here

In [None]:
# Binarize the scores: 1 where a score exceeds the threshold, else 0.
y_pred_thresholded = np.where(y_pred > prediction_threshold, 1, 0)

# How off are we on average?
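Uses the mean squared error (MSE), computed separately for the entries whose true label is 1 and for those whose true label is 0.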

In [None]:
# MSE of the raw scores over the entries whose true label is 1.
_is_positive = y_true == 1
np.mean(np.square(y_true[_is_positive] - y_pred[_is_positive]))

In [None]:
# MSE of the raw scores over the entries whose true label is 0.
_is_negative = y_true == 0
np.mean(np.square(y_true[_is_negative] - y_pred[_is_negative]))

### What if we threshold it?

In [None]:
# MSE of the thresholded predictions over the entries whose true label is 1.
_is_positive = y_true == 1
np.mean(np.square(y_true[_is_positive] - y_pred_thresholded[_is_positive]))

In [None]:
# MSE of the thresholded predictions over the entries whose true label is 0.
_is_negative = y_true == 0
np.mean(np.square(y_true[_is_negative] - y_pred_thresholded[_is_negative]))

# Number of different emotions per sample in the true and predicted labels

In [None]:
import os

# Folder (with trailing slash) that the plotting helpers prepend to file names.
output_folder = "graphics/analysis/supervised/"
# plt.savefig does not create missing directories, so make sure it exists.
os.makedirs(output_folder, exist_ok=True)

In [None]:
def plot_largest_charge_distribution(true_charge_counts, pred_charge_counts, charges, output_folder):
    """Draw true and predicted counts as side-by-side bars and save the figure.

    Args:
        true_charge_counts: Bar heights for the true labels, one per entry of
            ``charges``.
        pred_charge_counts: Bar heights for the predictions, one per entry of
            ``charges``.
        charges: Numeric x positions of the bar pairs.
        output_folder: Path prefix; the figure is saved as
            ``output_folder + 'emotion-distribution'``.
    """
    bar_width = 0.3
    positions = np.array(charges)

    figure = plt.figure()
    ax = figure.add_subplot(111)
    # Shift each series by half a bar width so the pairs sit next to each other.
    ax.bar(positions - bar_width / 2, true_charge_counts, width=bar_width)
    ax.bar(positions + bar_width / 2, pred_charge_counts, width=bar_width)
    ax.set_title('Distribution of Number of Different Emotions')
    ax.legend(["true counts", "predicted counts"])
    ax.set_xlabel("Number of Emotions")
    ax.set_ylabel("Count")
    plt.savefig(output_folder + 'emotion-distribution')

def plot_largest_charge_confusion_matrix(y_true, y_pred, charges, output_folder):
    """Plot a row-normalized confusion matrix as a heatmap and save it.

    Args:
        y_true: True class indices; assumed to be integer positions into
            ``charges``.
        y_pred: Predicted class indices, same convention as ``y_true``.
        charges: Class names used as tick labels on both axes.
        output_folder: Path prefix; the figure is saved as
            ``output_folder + 'confusion-matrix'``.
    """
    plt.figure(figsize=(20,20))
    # Pin the class list explicitly: otherwise classes missing from both
    # y_true and y_pred are dropped and the tick labels no longer line up.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(charges)), normalize='true')
    sns.heatmap(cm, cmap='Blues',xticklabels=charges, yticklabels=charges, annot=True, fmt=".2f")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Percent predicted, confusion matrix")
    plt.savefig(output_folder + 'confusion-matrix')

def plot_precision_recall_fscore(y_true, y_pred, charges, output_folder):
    """Plot per-class precision, recall and F-score as a heatmap and save it.

    Args:
        y_true: True class indices; assumed to be integer positions into
            ``charges``.
        y_pred: Predicted class indices, same convention as ``y_true``.
        charges: Class names used as column tick labels.
        output_folder: Path prefix; the figure is saved as
            ``output_folder + 'prec-rec-fscore'``.
    """
    plt.figure(figsize=(20,5))
    # Pin the class list so there is exactly one column per entry of charges,
    # even for classes that appear in neither y_true nor y_pred.
    scores = precision_recall_fscore_support(y_true, y_pred, labels=np.arange(len(charges)))
    # The last returned element is the support; only the three scores are plotted.
    precision_recall_fscore = np.array(scores[:-1])
    ax = sns.heatmap(precision_recall_fscore,
                     cmap='Blues',
                     xticklabels=charges,
                     yticklabels=['Precision', 'Recall', 'Fscore'],
                     annot=True)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    plt.title('Precision, Recall and Fscore for charges')
    plt.savefig(output_folder + 'prec-rec-fscore')

In [None]:
def get_label_info(input_data):
    """Count how often each row sum occurs in ``input_data``.

    Returns:
        A pair ``(values, counts)``: the distinct row sums in ascending order
        and the number of rows having each sum.
    """
    row_sums = np.sum(input_data, axis=1)
    distinct_sums, occurrences = np.unique(row_sums, return_counts=True)
    return distinct_sums, occurrences

In [None]:
different_labels_true, different_label_counts_true = get_label_info(y_true)
different_labels_pred, different_label_counts_pred = get_label_info(y_pred_thresholded)
# Sorted union of every label count seen in either array (a plain set has no
# guaranteed order).
all_x = np.union1d(different_labels_true, different_labels_pred)

# Place each count at the position of its label count within all_x. Indexing
# by the value itself is only correct when all_x is exactly 0..len(all_x)-1
# and raises IndexError otherwise (e.g. when no row has zero labels).
different_label_counts_true_expanded = np.zeros(len(all_x))
different_label_counts_true_expanded[np.searchsorted(all_x, different_labels_true)] = different_label_counts_true
different_label_counts_pred_expanded = np.zeros(len(all_x))
different_label_counts_pred_expanded[np.searchsorted(all_x, different_labels_pred)] = different_label_counts_pred

plot_largest_charge_distribution(different_label_counts_true_expanded,
                                 different_label_counts_pred_expanded,
                                 all_x,
                                 output_folder)

In [None]:
# Boolean mask of the rows whose true labels sum to exactly one.
_true_label_totals = np.sum(y_true, axis=1)
single_emotion_rows = _true_label_totals == 1

In [None]:
# Restrict to the single-emotion rows and use the highest-scoring class as
# the prediction for each of them.
y_true_single = y_true[single_emotion_rows].argmax(axis=1)
y_pred_single = y_pred[single_emotion_rows].argmax(axis=1)

plot_largest_charge_confusion_matrix(y_true_single, y_pred_single, all_emotions, output_folder)
plot_precision_recall_fscore(y_true_single, y_pred_single, all_emotions, output_folder)

# How do we match when we consider topk?

In [None]:
def plot_topk_matching(input_prediction, true_values, topk_max):
    """Plot how often the true labels fall within the k highest scores.

    For every ``k`` in ``range(1, topk_max)`` two fractions of rows are
    computed: rows where at least one true label is among the ``k``
    highest-scoring classes ("match any"), and rows where every true label is
    ("match all"). A row without any true label counts as a miss for
    "match any" and as a hit for "match all".

    Args:
        input_prediction: 2-D array of scores, one row per sample.
        true_values: 2-D array of the same shape; non-zero entries mark the
            true labels.
        topk_max: Exclusive upper bound of the tested ``k`` values.
    """
    is_true = np.asarray(true_values) != 0
    # The ranking does not depend on k, so sort once instead of once per k.
    ranked = np.argsort(input_prediction, axis=1)

    any_match = []
    all_match = []
    tested_topk = range(1, topk_max)
    for topk in tested_topk:
        # Boolean mask marking, per row, the k highest-scoring classes.
        in_topk = np.zeros(ranked.shape, dtype=bool)
        np.put_along_axis(in_topk, ranked[:, -topk:], True, axis=1)
        any_match.append((is_true & in_topk).any(axis=1).mean())
        all_match.append((in_topk | ~is_true).all(axis=1).mean())

    plt.plot(tested_topk, any_match, label="match any")
    plt.plot(tested_topk, all_match, label="match all")
    plt.legend()
    plt.ylabel("Percent correct")
    plt.xlabel("topk considered")
    plt.show()

In [None]:
# Top-k agreement using every raw score.
plot_topk_matching(y_pred, y_true, 29)

# Same analysis after zeroing all scores below the prediction threshold.
y_pred_masked = np.where(y_pred < prediction_threshold, 0, y_pred)
plot_topk_matching(y_pred_masked, y_true, 29)