## Note

To run this notebook, the file `final_dataset.csv` must be present in the working directory.

In [None]:
!pip install datasets
!pip install transformers

In [None]:
# Log in to the Hugging Face Hub from the shell. The last cell of this notebook
# calls trainer.push_to_hub(), which presumably relies on these credentials.
!huggingface-cli login

In [None]:
from datasets import load_dataset
import pandas as pd

#### Read in data set

In [None]:
from datasets import Dataset, Value, ClassLabel, Features, DatasetDict
from sklearn.preprocessing import LabelEncoder

# Load the annotated sentences and replace the CSV header with fixed column names.
column_names = ['sentence', 'original_label', 'thread_id', 'comment_id', 'label']
df = pd.read_csv('final_dataset.csv')
df.columns = column_names

# Keep only the first row for every (thread_id, sentence) pair.
dedup_keys = ['thread_id', 'sentence']
df = df.drop_duplicates(subset=dedup_keys, keep='first')

#### Split data into training, validation and test sets

In [None]:
# Thread ids held out for the test split.
test_ids = [
    't3_6rwcio',
    't3_5jfqhp',
    't3_71l9yj',
    't3_4mj8v7',
    't3_58t7i3',
    't3_64kkxe',
    't3_6ihcuk',
    't3_5o7nm3',
    't3_4tf91m',
    't3_4q9qng',
]

# Thread ids held out for the validation split.
val_ids = [
    't3_5ep0mh',
    't3_4pbwvb',
    't3_4g3nbn',
    't3_6tsx1p',
    't3_62igvv',
    't3_6694ui',
    't3_6h7a4i',
    't3_4plwqq',
    't3_4otmqi',
    't3_57tl4k',
]

# Distinct label names, captured before the column is overwritten with integer ids.
labels = df['label'].unique()

# Replace the string labels in df['label'] with integer class ids.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# LabelEncoder maps every class to its position in `classes_`, so enumerating
# that array yields the id <-> name mappings directly, with plain Python ints.
id2label = {class_id: class_name for class_id, class_name in enumerate(label_encoder.classes_)}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Split by thread id: threads listed in val_ids / test_ids go to the validation /
# test split, every other thread is used for training.
held_out_ids = test_ids + val_ids
model_columns = ['sentence', 'label']

train_df = df.loc[~df['thread_id'].isin(held_out_ids), model_columns]
val_df = df.loc[df['thread_id'].isin(val_ids), model_columns]
test_df = df.loc[df['thread_id'].isin(test_ids), model_columns]

# The filtered frames keep their original, non-contiguous row index. With the
# default settings from_pandas would store that index as an extra
# `__index_level_0__` column in each split; preserve_index=False drops it.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

evidence_dataset = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds,
})

# Tokenizing

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; it is used here to load the tokenizer and again in
# model_init() to load the model weights.
model_name = 'microsoft/MiniLM-L12-H384-uncased'
# Alternative checkpoint (swap with the line above to use it):
# model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_sentences(sentences):
    """Tokenize the 'sentence' field of a batch with the notebook-level tokenizer.

    Args:
        sentences: Mapping with a 'sentence' entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts, truncated to at most
        512 tokens per sentence.
    """
    texts = sentences['sentence']
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
# Apply tokenize_sentences to the dataset; batched=True hands it batches of
# examples instead of one example at a time.
evidence_dataset = evidence_dataset.map(tokenize_sentences, batched=True)

# Adding weighted loss function

In [None]:
import torch
# Weight of a class = 1 - (its share of all rows in df), so rarer classes get a
# larger weight. sort_index() orders the weights by integer class id.
class_share = df['label'].value_counts().sort_index() / len(df)

# Use the GPU when one is available, otherwise stay on the CPU instead of
# failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights = torch.from_numpy((1 - class_share).values).float().to(device)

tensor([0.8901, 0.3986, 0.9784, 0.7569, 0.9849, 0.9911], device='cuda:0')

In [None]:
# compute_loss reads the targets from inputs['labels'], so the column is renamed.
# The guard keeps the cell re-runnable: once renamed, a second rename_column
# call would fail because 'label' no longer exists.
if 'label' in evidence_dataset['train'].column_names:
    evidence_dataset = evidence_dataset.rename_column('label', 'labels')

In [None]:
from torch import nn
from transformers import Trainer

class TrainerImbalancedData(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights come from the notebook-level ``class_weights`` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch passed to the model; must contain 'labels'.
            return_outputs: If True, also return the model outputs.
            num_items_in_batch: Accepted because recent transformers releases
                pass this keyword to compute_loss; it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        # Keep the weights on the same device as the logits so the loss does
        # not fail with a device mismatch.
        loss_function = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_function(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import AutoModelForSequenceClassification
def model_init():
    """Load a sequence-classification model for the configured checkpoint.

    Returns:
        The result of ``AutoModelForSequenceClassification.from_pretrained`` for
        ``model_name``, configured with one output per distinct label and the
        id <-> label mappings.
    """
    head_config = {
        'num_labels': len(labels),
        'id2label': id2label,
        'label2id': label2id,
    }
    return AutoModelForSequenceClassification.from_pretrained(model_name, **head_config)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score
# Frame that collects predictions: it starts with the true validation labels in
# column 'true'; compute_metrics appends one column of predicted ids per call.
predictions_df = pd.DataFrame({'true': val_df.label})
def compute_metrics(predictions):
    """Score one evaluation run and record its predicted class ids.

    Args:
        predictions: Evaluation result exposing ``label_ids`` (true class ids)
            and ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys 'macro f1', 'weighted f1', 'accuracy' and
        'balanced accuracy'.
    """
    true_ids = predictions.label_ids
    predicted_ids = predictions.predictions.argmax(-1)

    metrics = {
        'macro f1': f1_score(true_ids, predicted_ids, average='macro'),
        'weighted f1': f1_score(true_ids, predicted_ids, average='weighted'),
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'balanced accuracy': balanced_accuracy_score(true_ids, predicted_ids),
    }

    # predictions_df has one row per validation example. Only append a column
    # when the sizes match, so scoring a differently sized set (e.g. the test
    # split) returns its metrics instead of raising a length-mismatch error.
    if len(predicted_ids) == len(predictions_df):
        predictions_df[len(predictions_df.columns)] = predicted_ids
    return metrics

In [None]:
from transformers import TrainingArguments

import inspect

batch_size = 32
epochs = 20
output_dir = 'MiniLM-evidence-types'
# output_dir = 'BERT-evidence-types'

# Newer transformers releases deprecate `evaluation_strategy` in favour of
# `eval_strategy`. Use whichever keyword the installed version accepts so the
# cell runs on both older and newer versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = ('eval_strategy' if 'eval_strategy' in _training_arg_names
                       else 'evaluation_strategy')

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate, log and save once per epoch.
    logging_strategy='epoch',
    save_strategy='epoch',
    # Reload the best checkpoint at the end, ranked by the 'macro f1' value
    # that compute_metrics returns.
    load_best_model_at_end=True,
    metric_for_best_model='macro f1',
    # NOTE(review): fp16 presumably requires a GPU runtime - confirm.
    fp16=True,
    seed=42,
    **{eval_strategy_kwarg: 'epoch'},
)

In [None]:
import inspect

# Newer transformers releases deprecate the Trainer `tokenizer` argument in
# favour of `processing_class`. Use whichever keyword the installed version
# accepts so the cell runs on both older and newer versions.
_trainer_arg_names = inspect.signature(TrainerImbalancedData.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

trainer = TrainerImbalancedData(
    model_init=model_init,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=evidence_dataset['train'],
    eval_dataset=evidence_dataset['validation'],
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the settings from training_args (evaluation, logging and
# checkpointing are configured there to happen once per epoch).
trainer.train()

In [None]:
# predictions_df.replace(id2label).to_csv('predictions_MiniLM_bs32_lr2e-5.csv', index=False)

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): presumably relies on the `huggingface-cli login` cell near the top - confirm.
trainer.push_to_hub()