# Transformer-based SML: Multilingual (hyper)parameter search and evaluation of best model

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import os

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.utils.class_weight import compute_class_weight

In [None]:
import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Download the NLTK 'stopwords' and 'punkt' resources used by the helpers below.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the corpus reader to the Dutch
# stop-word list, so the reader is no longer reachable under that name.
stopwords = stopwords.words('dutch') 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
# Dutch Snowball stemmer; not referenced anywhere else in the visible notebook.
stemmer = SnowballStemmer('dutch')

In [None]:
def remove_numbers(x):
    """Return ``x`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', x)

def transform_lowercase(x):
    """Return a lower-cased copy of ``x``."""
    lowered = x.lower()
    return lowered

def remove_punctuation(x):
    """Return ``x`` without punctuation.

    Every character that is neither a word character nor whitespace is
    dropped; underscores are dropped as well, although the regex class
    ``\\w`` would otherwise keep them.
    """
    punctuation = re.compile(r'[^\w\s]|_')
    return punctuation.sub('', x)

def remove_stopwords(x):
    """Return ``x`` with every token found in the module-level ``stopwords`` removed.

    The text is split on single spaces and the surviving tokens are joined
    back together with single spaces.
    """
    kept = []
    for token in x.split(" "):
        if token in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

def preprocess(x):
    """Normalise a raw text: strip punctuation, then digits, then lower-case it.

    Stop words are left in place; ``remove_stopwords`` is not applied here.
    """
    return transform_lowercase(remove_numbers(remove_punctuation(x)))

In [None]:
# test-train split
# Load the pre-computed train/test split from disk and turn each array into a
# plain Python list.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use this with files from a trusted source.
X_train = np.load("data/train_test/X_train.npy", allow_pickle=True).tolist()
X_test = np.load("data/train_test/X_test.npy", allow_pickle=True).tolist()
y_train = np.load("data/train_test/y_train.npy", allow_pickle=True).tolist()
y_test = np.load("data/train_test/y_test.npy", allow_pickle=True).tolist()

In [None]:
# class balance in test and train data
# np.bincount returns the number of samples per integer label (index = label).
print('test data:', np.bincount(y_test))
print('train data:', np.bincount(y_train))

In [None]:
# Run every document through preprocess(): punctuation and numbers are removed
# and the text is lower-cased.
X_train = list(map(preprocess, X_train))
X_test = list(map(preprocess, X_test))

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs there are and which one is device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Random seeds
# Seed NumPy and PyTorch (CPU and all CUDA devices) with the same value.
# seed_val is reused later as random_state for the train/validation split.
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# MODEL OF CHOICE
# Name of the pretrained checkpoint passed to from_pretrained() for both the
# tokenizer and the classification model.
modelpath = 'bert-base-multilingual-cased'

In [None]:
# NOTE(review): device_name is not referenced anywhere else in the visible notebook.
device_name = 'cuda' # = torch.device('cuda')
max_length = 512 # Maximum number of tokens per document; longer inputs are truncated by the tokenizer calls below.


In [None]:
#### PREPARE DATA #### 

In [None]:
# Torch dataset that pairs tokenizer encodings with their labels.
class MyDataset(torch.utils.data.Dataset):
    """Index-able dataset built from an encodings mapping and a label sequence.

    ``encodings`` maps a field name to a per-sample sequence and ``labels``
    holds one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# train and val data set
# Hold out 20% of the training data as a validation set; random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified (no `stratify=`), so the class ratio
# in the two parts may differ -- confirm this is intended.
X_trains, X_val, y_trains, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed_val)

In [None]:
# Size and per-label sample counts of each of the three data sets.
print('Train set:', len(X_trains), np.bincount(y_trains))
print('Validation set:', len(X_val), np.bincount(y_val))
print('Test set:', len(X_test), np.bincount(y_test))

In [None]:
# Show the installed transformers version (displayed as the cell's output).
import transformers
transformers.__version__

In [None]:
# encoding
# Load the tokenizer that belongs to the chosen pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(modelpath)

In [None]:
# Tokenize each split separately: inputs longer than max_length are truncated
# and padding is enabled so the sequences within one split share a length.
train_encodings = tokenizer(X_trains, truncation=True, padding=True, max_length = max_length) 
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length = max_length) 
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length = max_length)

In [None]:
# Wrap encodings and labels of each split in a MyDataset for use with Trainer.
train_dataset = MyDataset(train_encodings, y_trains)
val_dataset = MyDataset(val_encodings, y_val)
test_dataset = MyDataset(test_encodings, y_test)

In [None]:
# Per-class loss weights computed as 1 - (relative class frequency), so the
# rarer class receives the larger weight.  Note that this is the complement of
# the frequency, not a true inverse (1 / frequency).
class_weights1 = (1 - np.bincount(y_trains)/len(y_trains))
# Put the weights on the device selected earlier instead of hard-coding "cuda",
# so this cell also runs on a CPU-only machine.
class_weights1 = torch.from_numpy(class_weights1).float().to(device)
class_weights1

In [None]:
# Alternative weighting: scikit-learn's 'balanced' class weights computed on the
# training labels.
class_weights2 = compute_class_weight(class_weight='balanced', classes=np.unique(y_trains), y=y_trains)
# Put the weights on the device selected earlier instead of hard-coding "cuda",
# so this cell also runs on a CPU-only machine.
class_weights2 = torch.from_numpy(class_weights2).float().to(device)
class_weights2

In [None]:
## TRAINING

In [None]:
# Custom trainer whose loss is weighted with class_weights1.
class WeightedLossTrainer1(Trainer):
    """Trainer that replaces the default loss by a class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between logits and labels, weighted by ``class_weights1``.

        Args:
            model: the model being trained or evaluated.
            inputs: the batch; it is passed to the model unchanged and must
                contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted only so that newer ``Trainer``
                versions, which pass this keyword, can call the method;
                it is not used.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # Keep the weights on the same device as the logits so the loss can be
        # computed wherever the model was placed.
        loss_func = nn.CrossEntropyLoss(weight=class_weights1.to(logits.device))
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
class WeightedLossTrainer2(Trainer):
    """Trainer that replaces the default loss by a cross-entropy weighted with ``class_weights2``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between logits and labels, weighted by ``class_weights2``.

        Args:
            model: the model being trained or evaluated.
            inputs: the batch; it is passed to the model unchanged and must
                contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted only so that newer ``Trainer``
                versions, which pass this keyword, can call the method;
                it is not used.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # Keep the weights on the same device as the logits so the loss can be
        # computed wherever the model was placed.
        loss_func = nn.CrossEntropyLoss(weight=class_weights2.to(logits.device))
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(pred):
    """Turn a prediction object into the metric dict reported during evaluation.

    ``pred.label_ids`` holds the true labels and ``pred.predictions`` the
    scores; the predicted label is the argmax over the last axis.

    Returns a dict with macro F1, weighted F1, accuracy and the F1, precision
    and recall of label 1.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'acc': accuracy_score(y_true, y_pred),
        'f1_1': f1_score(y_true, y_pred, pos_label=1),
        # zero_division=0: score 0 without a warning when label 1 is never predicted
        'prec_1': precision_score(y_true, y_pred, pos_label=1, zero_division=0),
        'rec_1': recall_score(y_true, y_pred, pos_label=1),
    }

In [None]:
def make_df(output):
    """Flatten a list of result dicts into one DataFrame.

    Each entry of ``output`` must hold a ``'params'`` dict next to its other
    keys.  The parameters become the leading columns and the remaining keys
    follow.
    """
    param_rows = []
    for entry in output:
        param_rows.append(entry['params'])
    param_columns = pd.DataFrame(param_rows)
    other_columns = pd.DataFrame(output).drop('params', axis=1)
    return pd.concat([param_columns, other_columns], axis=1)

In [None]:
# Hyper-parameter configurations to test.

# Varied outside of the loop: class weights, stop words, bert model.
grid = dict(
    learning_rate=[5e-5],      # other candidates: 3e-5, 2e-5
    batch_size=[16],
    num_epochs=[3, 4],
    warm_up=[0, 1],            # other candidate: 1000
    metric_name=['f1_1'],      # alternative: 'macro_f1'
)

In [None]:
# Grid search with WeightedLossTrainer1 (loss weighted by class_weights1).
# Run i of this loop writes to 220323/multi/test{i}.
output = []
n = 0

for params in ParameterGrid(grid):
    n += 1
    # One fresh directory tree per configuration.  makedirs also creates the
    # missing parent folders; it still raises FileExistsError when the run
    # folder already exists, so earlier results are never overwritten silently.
    run_dir = f"220323/multi/test{str(n)}"
    os.makedirs(run_dir)
    output_dir = f"220323/multi/test{str(n)}/results"
    logging_dir = f"220323/multi/test{str(n)}/logs"
    os.mkdir(output_dir)
    os.mkdir(logging_dir)

    # Dictionary that collects the configuration and its validation scores.
    results = {'params': params}

    # (Re)instantiate the model so every configuration starts from the same
    # pretrained weights; use the device selected earlier rather than a
    # hard-coded 'cuda'.
    model = BertForSequenceClassification.from_pretrained(modelpath, num_labels=2).to(device)

    # Training arguments for this configuration.
    training_args = TrainingArguments(
        num_train_epochs=params['num_epochs'],              # total number of training epochs
        per_device_train_batch_size=params['batch_size'],   # batch size per device during training
        per_device_eval_batch_size=params['batch_size'],    # batch size for evaluation
        learning_rate=params['learning_rate'],              # initial learning rate of the optimizer
        load_best_model_at_end=True,                        # reload the best checkpoint after training
        save_total_limit=2,                                 # keep at most two checkpoints on disk
        metric_for_best_model=params['metric_name'],        # metric that decides which checkpoint is best
        warmup_steps=params['warm_up'],                     # warmup steps of the learning-rate scheduler
        weight_decay=0.01,                                  # strength of weight decay
        output_dir=output_dir,                              # checkpoints are written here
        logging_dir=logging_dir,                            # logs are written here
        logging_steps=100,                                  # log every 100 steps
        evaluation_strategy='steps'                         # evaluate during fine-tuning to follow progress
    )

    trainer = WeightedLossTrainer1(
        model=model,                         # the model instantiated above
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        compute_metrics=compute_metrics)

    trainer.train()

    # Evaluate on the validation set and store the scores next to the parameters.
    temp = trainer.evaluate()
    results.update(temp)
    print('.....RESULTS....')
    print(params)
    print(results)
    print('\n\n')
    output.append(results)

# Save the collected results as a pickle; the with-block closes the file handle.
with open('220323/multi_weights1.pickle', 'wb') as fh:
    pickle.dump(output, fh)

In [None]:
# Tabulate the weights-1 runs, best validation F1 of label 1 first.
df = make_df(output)
df.sort_values('eval_f1_1', ascending=False)

In [None]:
# Grid search with WeightedLossTrainer2 (loss weighted by class_weights2).
# The counter starts at 49, so the runs of this loop are written to
# 220323/multi/test50, test51, ...
output2 = []
n = 49

for params in ParameterGrid(grid):
    n += 1
    # One fresh directory tree per configuration.  makedirs also creates the
    # missing parent folders; it still raises FileExistsError when the run
    # folder already exists, so earlier results are never overwritten silently.
    run_dir = f"220323/multi/test{str(n)}"
    os.makedirs(run_dir)
    output_dir = f"220323/multi/test{str(n)}/results"
    logging_dir = f"220323/multi/test{str(n)}/logs"
    os.mkdir(output_dir)
    os.mkdir(logging_dir)

    # Dictionary that collects the configuration and its validation scores.
    results = {'params': params}

    # (Re)instantiate the model so every configuration starts from the same
    # pretrained weights; use the device selected earlier rather than a
    # hard-coded 'cuda'.
    model = BertForSequenceClassification.from_pretrained(modelpath, num_labels=2).to(device)

    # Training arguments for this configuration.
    training_args = TrainingArguments(
        num_train_epochs=params['num_epochs'],              # total number of training epochs
        per_device_train_batch_size=params['batch_size'],   # batch size per device during training
        per_device_eval_batch_size=params['batch_size'],    # batch size for evaluation
        learning_rate=params['learning_rate'],              # initial learning rate of the optimizer
        load_best_model_at_end=True,                        # reload the best checkpoint after training
        save_total_limit=2,                                 # keep at most two checkpoints on disk
        metric_for_best_model=params['metric_name'],        # metric that decides which checkpoint is best
        warmup_steps=params['warm_up'],                     # warmup steps of the learning-rate scheduler
        weight_decay=0.01,                                  # strength of weight decay
        output_dir=output_dir,                              # checkpoints are written here
        logging_dir=logging_dir,                            # logs are written here
        logging_steps=100,                                  # log every 100 steps
        evaluation_strategy='steps'                         # evaluate during fine-tuning to follow progress
    )

    trainer = WeightedLossTrainer2(
        model=model,                         # the model instantiated above
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        compute_metrics=compute_metrics)

    trainer.train()

    # Evaluate on the validation set and store the scores next to the parameters.
    temp = trainer.evaluate()
    results.update(temp)
    print('.....RESULTS....')
    print(params)
    print(results)
    print('\n\n')
    output2.append(results)

# Save the collected results as a pickle; the with-block closes the file handle.
with open('220323/multi_weights2.pickle', 'wb') as fh:
    pickle.dump(output2, fh)

In [None]:
# Tabulate the weights-2 runs, best validation F1 of label 1 first.
df = make_df(output2)
df.sort_values('eval_f1_1', ascending=False)

In [None]:
## NO WEIGHTS
# Grid search with the stock Trainer, i.e. the model's own unweighted loss.
# The counter starts at 79, so the runs of this loop are written to
# 220323/multi/test80, test81, ...

output3 = []
n = 79

for params in ParameterGrid(grid):
    n += 1
    # One fresh directory tree per configuration.  makedirs also creates the
    # missing parent folders; it still raises FileExistsError when the run
    # folder already exists, so earlier results are never overwritten silently.
    run_dir = f"220323/multi/test{str(n)}"
    os.makedirs(run_dir)
    output_dir = f"220323/multi/test{str(n)}/results"
    logging_dir = f"220323/multi/test{str(n)}/logs"
    os.mkdir(output_dir)
    os.mkdir(logging_dir)

    # Dictionary that collects the configuration and its validation scores.
    results = {'params': params}

    # (Re)instantiate the model so every configuration starts from the same
    # pretrained weights; use the device selected earlier rather than a
    # hard-coded 'cuda'.
    model = BertForSequenceClassification.from_pretrained(modelpath, num_labels=2).to(device)

    # Training arguments for this configuration.
    training_args = TrainingArguments(
        num_train_epochs=params['num_epochs'],              # total number of training epochs
        per_device_train_batch_size=params['batch_size'],   # batch size per device during training
        per_device_eval_batch_size=params['batch_size'],    # batch size for evaluation
        learning_rate=params['learning_rate'],              # initial learning rate of the optimizer
        load_best_model_at_end=True,                        # reload the best checkpoint after training
        save_total_limit=2,                                 # keep at most two checkpoints on disk
        metric_for_best_model=params['metric_name'],        # metric that decides which checkpoint is best
        warmup_steps=params['warm_up'],                     # warmup steps of the learning-rate scheduler
        weight_decay=0.01,                                  # strength of weight decay
        output_dir=output_dir,                              # checkpoints are written here
        logging_dir=logging_dir,                            # logs are written here
        logging_steps=100,                                  # log every 100 steps
        evaluation_strategy='steps'                         # evaluate during fine-tuning to follow progress
    )

    trainer = Trainer(
        model=model,                         # the model instantiated above
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        compute_metrics=compute_metrics)

    trainer.train()

    # Evaluate on the validation set and store the scores next to the parameters.
    temp = trainer.evaluate()
    results.update(temp)
    print('.....RESULTS....')
    print(params)
    print(results)
    print('\n\n')
    output3.append(results)

# Save the collected results as a pickle; the with-block closes the file handle.
with open('220323/multi_noweights.pickle', 'wb') as fh:
    pickle.dump(output3, fh)

In [None]:
# Tabulate the unweighted runs, best validation F1 of label 1 first.
df = make_df(output3)
df.sort_values('eval_f1_1', ascending=False)

In [None]:
# Best model based on F1 score: test51 (.62 F1, .72 precision, .54 recall) -- batch size 16, lr 5e-5, 3 epochs, 1 warmup step, balanced class weights.

In [None]:
# Load the best model and the arguments based on the validation set.
# best_path points at a checkpoint folder written by the test51 run.
# NOTE(review): torch.load unpickles training_args.bin -- only load checkpoints
# from a trusted source; recent torch versions may need weights_only=False here
# (confirm against the installed version).
best_path = "220323/multi/test51/results/checkpoint-2000"
model = BertForSequenceClassification.from_pretrained(best_path)
arguments = torch.load(f"{best_path}/training_args.bin")

In [None]:
# Wrap the reloaded model in the same trainer class (class_weights2) used for
# the test51 run; no datasets are attached because it is only used for prediction.
trainer = WeightedLossTrainer2(model=model, args=arguments)

In [None]:
# predict the test data set
preds = trainer.predict(test_dataset)
print(preds.predictions.shape)
predicted_labels = preds.predictions.argmax(-1) # Index of the highest score per sample = predicted label
predicted_labels = predicted_labels.flatten().tolist()      # Flatten the predictions into a 1D list

In [None]:
# evaluate the outcome
# Per-class precision, recall and F1 of the predictions against the test labels.
print(classification_report(y_test, predicted_labels))