<h1>Training a BERT Model for Multi-Label Classification in English</h1>
We load the data and fine-tune a BERT model on the task of predicting multiple labels for either the current or the next utterance.


<i>vers. 10/2023</i>

<h3> Data Preprocessing & Model Initialisation </h3>

First we load the appropriate dataset, process it into the proper format and initialise the model we want to fine-tune.

In [15]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

from torch import cuda, device
import torch
import os 

In [16]:
# choose model size and task

# Which utterance the labels refer to: 'next' or "current".
task = 'next'  # "current" #  predict labels for current / next utterance

# 'large' selects bert-large-cased; any other value selects bert-base-cased.
model_size = 'large'  # "base" 

In [None]:
#LOAD THE RIGHT DATA AND GET THE MODEL NAME, OUTPUT PATH
# NOTE(review): the CSV files below are daily_dialog splits, yet the name that
# is later logged to wandb is 'carglass' — confirm which one is intended.
dataset_name = 'carglass'

# Any value other than 'large' falls back to the base checkpoint.
is_large = model_size == 'large'
model_name = 'bert-large-cased' if is_large else 'bert-base-cased'

# Branch on `task`, the variable set in the configuration cell. Any value other
# than "current" selects the next-utterance data.
if task == "current":
    data_files = {
        "train": "/data/daily_dialog_train_ohe.csv",
        # Same absolute /data/ prefix as every other split.
        "validation": "/data/daily_dialog_val_ohe.csv",
        "test": "/data/daily_dialog_test_ohe.csv",
    }
    if is_large:
        output_path = ".../Filter Rerank/Bert_Large_Current"
    else:
        output_path = ".../Filter Rerank/Bert_Base_Current"
else:
    data_files = {
        "train": "/data/daily_dialog_train_next_ohe.csv",
        "validation": "/data/daily_dialog_val_next_ohe.csv",
        "test": "/data/daily_dialog_test_next_ohe.csv",
    }
    output_path = "Bert_Large_Next" if is_large else "Bert_Base_Next"

# One CSV per split; the result is indexed by "train" / "validation" / "test".
dataset = load_dataset("csv", data_files=data_files)

In [None]:
import os

# Create <output_path>/results together with any missing parents.
# exist_ok=True makes the cell re-runnable and also creates results/ when
# output_path itself already exists.
os.makedirs(os.path.join(output_path, 'results'), exist_ok=True)

In [18]:
#ACTIVATE CUDA
# Expose only GPU index 3 to this process through CUDA_VISIBLE_DEVICES.
# NOTE(review): presumably this has to run before CUDA is initialised to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


In [None]:
#PEEK AT THE DATA
# Display the first training row to check the column layout.
example = dataset['train'][0]
example

In [None]:
#GET LABELS
# Every column except the text column 'input' is treated as a label.

labels = [column for column in dataset['train'].features.keys() if column != 'input']
# Forward (index -> name) and reverse (name -> index) lookups in column order.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

In [21]:
#LOAD TOKENIZER
from transformers import AutoTokenizer
import numpy as np

# Tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
def preprocess_data(examples):
    """Tokenize a batch of rows and attach its multi-hot label matrix.

    Args:
        examples: a batch (mapping of column name -> list of values) holding
            the text under "input" and one column per entry of the global
            `labels` list.

    Returns:
        The tokenizer encoding (padded/truncated to 128 tokens) with an extra
        "labels" entry: one list of floats per row, ordered like `labels`.
    """
    texts = examples["input"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    # Keep only the label columns; any other column of the batch is ignored.
    label_columns = {name: values for name, values in examples.items() if name in labels}

    # One row per text, one column per label, filled column by column.
    target = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        target[:, column] = label_columns[name]

    encoded["labels"] = target.tolist()
    return encoded

In [None]:
#PREPROCESS THE DATASETS
# Apply preprocess_data to every split in batches; the original columns are
# removed so that only the fields returned by preprocess_data remain.
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

# Sanity check on the first encoded training example.
example = encoded_dataset['train'][0]
print(example.keys())

# Decode the token ids back to text to eyeball the tokenization.
tokenizer.decode(example['input_ids'])

#example['labels']

In [None]:
# Names of the labels whose value is 1.0 in the example inspected above.
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

In [25]:
#SET THE FORMAT AS PYTORCH TENSORS, TO OBTAIN PYTORCH DATASETS
# Later cells call tensor methods (.type(), .unsqueeze()) on the rows, so the
# encoded splits must yield torch tensors from here on.

encoded_dataset.set_format("torch")

In [None]:
#LOAD THE MODEL
#Setting `problem_type` to be "multi_label_classification" makes sure we use the appropriate loss function, BCEWithLogitsLoss
#The output layer has `len(labels)` output neurons, and we set the id2label and label2id mappings.

from transformers import AutoModelForSequenceClassification

# NOTE(review): hidden_dropout_prob=0.5 looks high for fine-tuning — confirm it is deliberate.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.5,
)

# Use the first visible GPU when there is one, otherwise fall back to the CPU
# so the notebook still runs on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(model.device)

<h3>Training Arguments and Metrics</h3>
We then set the training hyper-parameters, such as the batch size and the number of epochs, and define the metrics that are computed during training and evaluation of the model.

In [27]:
#SET THE ARGUMENTS AND HYPER-PARAMETERS

batch_size = 16
# Key of the dictionary returned by compute_metrics that is used to select the best checkpoint.
metric_name = "f1"

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    # checkpoints are written under output_path
    output_path,
    # evaluate and save once per epoch
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    logging_steps=20,
    weight_decay=0.01,
    # presumably 2 x batch_size = 32 examples per optimizer step on one device — confirm
    gradient_accumulation_steps = 2,
    # after training, reload the checkpoint with the best `metric_name`
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit = 2,
    #push_to_hub=True,
)

In [None]:
# START WANDB SESSION
# wandb is imported here because no earlier cell imports it.
import wandb

# Keep the run handle returned by init() so the run can be renamed below.
wandb_run = wandb.init(
    project="bert",
    config={
        "per_device_train_batch_size": args.per_device_train_batch_size,
        "learning_rate": args.learning_rate,
        "dataset": dataset_name,
    },
)

# Run name encodes the model size and the task, e.g. "run_bert_large_next".
wandb_run.name = f"run_bert_{model_size}_{task}"

In [None]:
#MULTILABEL METRICS
# while training, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values


from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and subset accuracy for multi-label output.

    Args:
        predictions: raw logits; the sigmoid is applied element-wise.
            Assumed to be (num_examples, num_labels) — TODO confirm.
        labels: ground-truth indicator matrix with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Logits -> per-label probabilities, as a numpy array for sklearn.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()

    # Hard 0/1 decisions for the threshold-based metrics.
    y_pred = np.zeros(probs.shape)
    y_pred[probs >= threshold] = 1

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC ranks scores, so it gets the probabilities rather than the
    # thresholded decisions (which would reduce the curve to a single point).
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy,
    }

def compute_metrics(p: EvalPrediction):
    """Adapter handed to the Trainer: unpack an EvalPrediction and score it.

    When `p.predictions` is a tuple, only its first element is used.
    Returns the dictionary produced by multi_label_metrics.
    """
    if isinstance(p.predictions, tuple):
        logits = p.predictions[0]
    else:
        logits = p.predictions
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
#verification
# Fetch the first training row once. Indexing the row first avoids
# materialising the whole 'input_ids' column just to read one element.
sample = encoded_dataset['train'][0]

print(sample['labels'].type())

print(sample['input_ids'])

# Single-example forward pass (batch dimension added with unsqueeze) to check
# that the model accepts the inputs and returns a loss for these labels.
outputs = model(
    input_ids=sample['input_ids'].unsqueeze(0).to(device),
    labels=sample['labels'].unsqueeze(0).to(device),
)
outputs

<h3>Training</h3>
Time to train and evaluate the model!

In [None]:
#Initialise trainer module
# Evaluate (and select the best checkpoint) on the validation split, so the
# test split stays unseen until the final prediction step.

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
#TRAIN!
# Runs the fine-tuning loop configured by `args` above.
trainer.train()

In [None]:
#EVALUATION
# Evaluates on the eval_dataset that was passed to the Trainer and reports the
# metrics from compute_metrics.

trainer.evaluate()

<h3>Test</h3>
We run the newly trained model on a single sentence, inspect its outputs, and show how to turn the raw logits into predicted labels.

In [None]:
#Test on a sentence
# NOTE(review): the sample is French although the notebook fine-tunes an
# English checkpoint — confirm this is the intended smoke test.

text = "allez-y madame je vous écoute, que puis-je faire pour vous?"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

# Inference only: switch dropout off and skip building the autograd graph.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
#PEEK INTO THE OUTPUT
# Shape of the raw scores returned for the single test sentence.
logits = outputs.logits
logits.shape

In [None]:
#PRINT LOGITS
# Raw scores, before the sigmoid is applied.
logits

In [None]:
#TURN LOGITS INTO PREDICTIONS AND LABELS

sigmoid = torch.nn.Sigmoid()
# Per-label probabilities for the single sentence, moved to the CPU.
probs = sigmoid(logits.squeeze().cpu())
# Multi-hot vector: 1 wherever the probability reaches 0.5.
predictions = np.zeros(probs.shape)
above_threshold = np.where(probs >= 0.5)
predictions[above_threshold] = 1
# turn predicted id's into actual label names
predicted_labels = []
for position, flag in enumerate(predictions):
    if flag == 1.0:
        predicted_labels.append(id2label[position])
print(predicted_labels)

<h3>Prediction and Inference</h3>

Now that our model has been fine-tuned, we can use it to predict labels on brand new data it has never seen before.

In [None]:
#PEEK INTO TEST DATA
# First row of the raw (un-tokenized) test split.

test = dataset['test'][0]
test

In [None]:
#LOAD THE MODEL FOR INFERENCE
# Either reload a saved checkpoint from disk (LOAD_MODEL = True) or reuse the
# model that was just fine-tuned by the trainer.

from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

LOAD_MODEL = False

if LOAD_MODEL:
    model_name_or_path = '/path/to/model'
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
else:
    model = trainer.model

# The prediction helpers index id2label with arrays of class ids and iterate
# over it to get label names, so it has to be an array of names ordered by id.
# Building it from the model's own config covers both branches and keeps the
# cell safe to re-run.
config = model.config
id2label = np.asarray(
    [config.id2label[key] for key in sorted(config.id2label.keys(), key=lambda t: int(t))]
)

# Use the first visible GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# eval() switches dropout off; the result is bound only to suppress cell output.
t = model.eval()


In [None]:
def predict_sentence(text, k=None, verbose=False):
    """Score one sentence against every label, best label first.

    Args:
        text: the sentence to classify.
        k: if given, keep only the k highest-scoring labels.
        verbose: if true, print the sentence followed by one
            "label : score" line per returned label.

    Returns:
        (pred_labels, pred_scores): label names and their sigmoid scores,
        both sorted by descending score. Relies on the globals `tokenizer`,
        `model`, `device` and on `id2label` being indexable by an array of ids.
    """
    features = tokenizer(text, return_tensors="pt", truncation=True)
    features = features.to(device)

    with torch.inference_mode():
        # First element of the model output holds the logits; sigmoid turns
        # them into independent per-label probabilities.
        probs = model(**features)[0].sigmoid()

    probs = probs.detach().cpu().numpy()

    # sort results by descending score
    ranking = np.argsort(probs)[:, ::-1][0]
    pred_scores = np.sort(probs)[:, ::-1][0]
    pred_labels = id2label[ranking]

    if k is not None:
        pred_scores, pred_labels = pred_scores[:k], pred_labels[:k]

    if verbose:
        print(f'"{text}"')
        for score, name in zip(pred_scores, pred_labels):
            print(f"{name:30} : {score}")
        print()

    return pred_labels, pred_scores

In [None]:
def predict_set(test_data, k=None, verbose=False):
    """Run predict_sentence over a dataset and collect per-label scores.

    Args:
        test_data: iterable of rows; each row holds the text under 'input'
            and a 0/1 value under every label name.
        k: forwarded to predict_sentence. Labels outside the top k get NaN
            as their score for that row.
        verbose: forwarded to predict_sentence.

    Returns:
        dict of equally long lists: 'input' (texts), 'reference' (gold label
        names joined with '<SEP>') and one list of scores per label name.
        Relies on the global `id2label` iterating over label names.
    """
    label_names = list(id2label)
    results_dic = {'input': [], 'reference': []}
    for label in label_names:
        results_dic[label] = []

    for data in tqdm(test_data):
        text = data['input']
        results_dic['input'].append(text)
        results_dic['reference'].append('<SEP>'.join([x for x in label_names if data[x] == 1]))

        pred_label, pred_score = predict_sentence(text, k, verbose)
        # One dictionary lookup per label instead of a linear list.index scan.
        # It also tolerates labels dropped by top-k truncation, so every
        # column keeps one entry per row.
        score_by_label = dict(zip(pred_label, pred_score))
        for label in label_names:
            results_dic[label].append(score_by_label.get(label, float('nan')))

    return results_dic

In [None]:
# Score every row of the raw test split (all labels kept, no printing).
results = predict_set(dataset['test'])

In [None]:
# Gold label names per test row, joined with '<SEP>'.
print(results['reference'])

In [None]:
#SAVE RESULTS
# Make sure the target directory exists before writing; it is missing when
# output_path was already present before this notebook created its folders.
results_dir = os.path.join(output_path, 'results')
os.makedirs(results_dir, exist_ok=True)
pd.DataFrame(results).to_csv(
    os.path.join(results_dir, 'results_socemo_multilabel_window3.csv'),
    encoding='UTF-8',
    index=False,
)