<H1>Training BART Model on Multi-Label Sequence Generation in English</h1>
We load the data and fine-tune a BART model on the task of <b>generating</b> a sequence of multiple labels associated with the next dialogue utterance.


<i>vers. 10/2023</i>

<h3> Data Preprocessing & Model Initialisation </h3>

First we load the appropriate dataset, process it into the proper format and initialise the model we want to fine-tune.

In [None]:
import torch
import numpy as np
import datasets
import pandas as pd
import tqdm as tqdm

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

import os
from datasets import load_dataset

In [None]:
# Pick the BART checkpoint size to fine-tune: 'large' or 'base'.
model_size = 'large'

In [3]:
# GET DATA

# Map the chosen size onto a checkpoint name and an output directory;
# any value other than 'large' selects the base checkpoint.
if model_size == 'large':
    model_name = "facebook/bart-large"
    output_path = "/BART_Large"
else:
    model_name = "facebook/bart-base"
    output_path = "/BART_Base"

# NOTE(review): the test CSV is read from /data/en/ while train and validation
# come from /data/ directly -- confirm that this difference is intentional.
data = load_dataset(
    "csv",
    data_files={
        "train": "/data/daily_dialog_train_next.csv",
        "validation": "/data/daily_dialog_val_next.csv",
        "test": "/data/en/daily_dialog_test_next.csv",
    },
)

# Keep the three splits apart: the validation split (not the test split) is
# used for evaluation during/after training, and the test split is reserved
# for the final inference step.
train_data_txt = data['train']
validation_data_txt = data['validation']
test_data_txt = data['test']

In [None]:
import os

# Create the output directory together with its 'results' sub-directory.
# exist_ok=True keeps the cell re-runnable and still creates 'results' when
# output_path itself already exists.
os.makedirs(output_path + '/results', exist_ok=True)

In [4]:
#LOAD MODEL AND TOKENIZER
# Fetch the tokenizer and the pretrained seq2seq weights for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
#ACTIVATE CUDA
# Expose only physical GPU 4 to this process.
# NOTE(review): this must run before CUDA is first initialised to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Use the first visible GPU when there is one; otherwise fall back to the CPU
# instead of failing on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(model.device)

cuda:0


In [6]:
#TOKENIZER PARAMETERS

# Maximum token lengths for the encoder input (dialogue text) and the decoder
# target (label sequence).
encoder_max_length, decoder_max_length = 256, 64

In [7]:
#GET LABELS, AND LENGTH OF LABEL SEQUENCE
# Read the reference labels directly from the test split.
test_labels = data['test']['label']

# Each reference is a '+'-separated label string, e.g. 'question+surprise'.
label_vals = [x.split('+') for x in test_labels]
# Token count of every reference label string, as produced by the tokenizer.
li_val = [len(tokenizer(x).input_ids) for x in test_labels]
# Number of labels in every reference.
lens_li_labels = [len(x) for x in label_vals]

In [None]:
#PEEK AT THE DATA
# Display the first training example to check what was loaded from the CSV.
example = data['train'][0]
example

In [9]:
#PREPROCESS BATCH OF DATA

def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenise a batch of (text, label) pairs into model features.

    Args:
        batch: mapping with a "text" list (inputs) and a "label" list (targets).
        tokenizer: callable tokenizer exposing `pad_token_id`.
        max_source_length: length the inputs are padded/truncated to.
        max_target_length: length the targets are padded/truncated to.

    Returns:
        A dict holding every field produced for the inputs plus a "labels"
        field in which target padding tokens are replaced by -100.
    """

    def _encode(texts, limit):
        # Pad and truncate every sequence to exactly `limit` tokens.
        return tokenizer(
            texts, padding="max_length", truncation=True, max_length=limit
        )

    encoded_source = _encode(batch["text"], max_source_length)
    encoded_target = _encode(batch["label"], max_target_length)

    features = dict(encoded_source.items())

    # -100 marks positions that are ignored when the loss is computed.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [tok if tok != pad_id else -100 for tok in row]
        for row in encoded_target["input_ids"]
    ]
    return features

In [10]:
#PREPROCESS THE DATASETS

def _tokenize_batch(batch):
    """Tokenise one batch with the notebook-wide tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# The original text columns are removed so that only the tokenised features remain.
train_data = train_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

In [11]:
#POSTPROCESS THE TEXT

def postprocess_text(preds, labels):
    """Strip whitespace and put each sentence on its own line.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A `(preds, labels)` pair of lists in which every string has been
        stripped and its sentences (per `nltk.sent_tokenize`) joined by
        newlines, the layout rougeLSum expects.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)


<h3>Training Arguments and Metrics</h3>
We then set the training hyper-parameters such as batch-size or number of epochs, and then we define the metrics we will compute during training and evaluation of our model.

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import torch

In [12]:
#SET THE ARGUMENTS AND HYPER-PARAMETERS

# Tokenizer data used by nltk.sent_tokenize in postprocess_text.
nltk.download("punkt", quiet=True)
metric_name = "f1"

# NOTE(review): no evaluation/save strategy and no load_best_model_at_end is
# set here -- confirm that metric_for_best_model has the intended effect.
training_args = Seq2SeqTrainingArguments(
    # Output locations and logging
    output_dir=f"{output_path}/results",
    logging_dir=f"{output_path}/logs",
    logging_steps=50,
    save_total_limit=2,
    # Run modes
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    metric_for_best_model=metric_name,
    # Batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # Optimisation
    num_train_epochs=10,
    learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
)

In [None]:
# START WANDB SESSION TO KEEP TRACK OF TRAINING

import wandb

# Corpus name logged with the run; inferred from the CSV file names -- TODO confirm.
dataset_name = "daily_dialog"

wandb_run = wandb.init(
    project="bart",
    # Run name encodes the model family and the selected size, e.g. "run_bart_large".
    name="run_" + "bart" + "_" + model_size,
    config={
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "learning_rate": training_args.learning_rate,
        "dataset": dataset_name,
    },
)

In [13]:
#COLLATE DATA
# Collator that assembles the tokenised examples into batches for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [14]:
#MULTILABEL METRICS
# while training, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values

def _split_labels(text):
    """Return the set of non-empty, stripped labels in a '+'-separated string."""
    return {part.strip() for part in text.split('+') if part.strip()}


def _to_multi_hot(label_sets, vocabulary):
    """Encode each label set as a 0/1 row over the columns listed in `vocabulary`."""
    column_of = {label: j for j, label in enumerate(vocabulary)}
    matrix = np.zeros((len(label_sets), len(vocabulary)), dtype=int)
    for i, label_set in enumerate(label_sets):
        for label in label_set:
            matrix[i, column_of[label]] = 1
    return matrix


def compute_metrics(eval_preds):
    """Compute multi-label F1, ROC AUC and accuracy from generated sequences.

    Predictions and references are decoded to text, split on '+' into label
    sets and binarised into indicator matrices, so that the scikit-learn
    metrics see a proper multi-label problem rather than opaque strings.

    Args:
        eval_preds: `(predictions, label_ids)` pair of token-id arrays;
            `predictions` may be a tuple whose first item holds the ids.

    Returns:
        dict with keys 'f1' (micro-averaged), 'roc_auc' (micro-averaged,
        NaN when undefined) and 'accuracy' (exact match of the label set).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded; swap it for the pad token in both arrays.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    pred_sets = [_split_labels(text) for text in decoded_preds]
    true_sets = [_split_labels(text) for text in decoded_labels]

    # Labels that only occur in predictions get their own column, so that they
    # count as false positives.
    vocabulary = sorted(set().union(*true_sets, *pred_sets))
    y_true = _to_multi_hot(true_sets, vocabulary)
    y_pred = _to_multi_hot(pred_sets, vocabulary)

    if y_true.size == 0:
        # Nothing to score (no examples or no labels at all).
        return {'f1': 0.0, 'roc_auc': float('nan'), 'accuracy': 0.0}

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
    try:
        # Computed on hard 0/1 predictions, since generation yields no scores.
        roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    except ValueError:
        # Undefined when the references contain only one class.
        roc_auc = float('nan')
    accuracy = accuracy_score(y_true, y_pred)

    # return as dictionary
    return {
        'f1': float(f1_micro_average),
        'roc_auc': float(roc_auc),
        'accuracy': float(accuracy),
    }

<h3>Training</h3>
Time to train and evaluate the model!

In [15]:
#Initialise trainer module

# Bundle the model, tokenizer, hyper-parameters, datasets, collator and metric
# function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


'\ntrainer = Seq2SeqTrainer(\n    model=model,\n    args=training_args,\n    data_collator=data_collator,\n    train_dataset=train_data,\n    eval_dataset=validation_data,\n    tokenizer=tokenizer,\n    compute_metrics=compute_metrics,\n)\n'

In [16]:
#TRAIN!

# Fine-tune the model with the trainer built in the previous cell.
trainer.train()

In [17]:
#EVALUATION ON VALIDATION SET

# Run evaluation on the eval_dataset given to the trainer (validation_data).
trainer.evaluate()

<h3>Prediction and Inference</h3>

Now that our model has been fine-tuned, we can use it to predict labels on brand new data it has never seen before.

In [18]:
#PEEK INTO TEST DATA

# Display the first example of the test split.
test = data['test'][0]
test 

{'text': 'Hey man , you wanna buy some weed ? ', 'label': 'question+surprise'}

In [19]:

#LOAD THE MODEL
# Either reload a fine-tuned seq2seq checkpoint from disk (LOAD_MODEL = True)
# or reuse the model held by the trainer. Labels are generated as text by the
# seq2seq model, so no classification head or label/id mapping is set up here.

from transformers import AutoConfig, AutoModelForSequenceClassification
import numpy as np

LOAD_MODEL = False

if LOAD_MODEL:
    model_name_or_path = '/path/to/model'

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

else:
    model = trainer.model


# Use the first visible GPU when there is one; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Switch to evaluation mode; the result is bound to a name, presumably so the
# notebook does not echo the whole model.
t = model.eval()


In [38]:
#PREDICT/INFERENCE
from tqdm import tqdm

def generate_summary_k(test_samples, li, model, k_generation=False):
    """Generate a label string for every sample, one sample at a time.

    Args:
        test_samples: mapping/dataset whose "text" entry lists the input strings.
        li: per-sample lengths; only read when `k_generation` is true, where
            sample `i` is forced to produce exactly `li[i] + 2` new tokens.
        model: seq2seq model exposing `generate` and `device`.
        k_generation: when true, constrain the generation length per sample;
            otherwise use the model's default generation settings.

    Returns:
        `(generated_output_str, lens)`: the decoded outputs with '+' replaced
        by ', ', and the length in tokens of each generated id sequence
        (special tokens included).
    """
    generated_output_str = []
    lens = []

    for i, sample in enumerate(tqdm(test_samples["text"])):
        inputs = tokenizer(
            sample,
            padding="max_length",
            truncation=True,
            max_length=encoder_max_length,
            return_tensors="pt",
        )
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)

        length_kwargs = {}
        if k_generation:
            # Pin both bounds to the same value to force a fixed output length.
            forced_length = li[i] + 2
            length_kwargs = {
                "max_new_tokens": forced_length,
                "min_new_tokens": forced_length,
            }
        outputs = model.generate(input_ids, attention_mask=attention_mask, **length_kwargs)

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        generated_output_str.append(decoded.replace('+', ', '))
        # `outputs` is (batch, sequence); len(outputs) would be the batch size
        # (always 1 here), so record the sequence dimension instead.
        lens.append(outputs.shape[-1])
    return generated_output_str, lens

In [None]:
# Generate labels for the whole test split, constraining every output to the
# token length derived from its reference label string (li_val).
test_samples = data['test']
labels_after_tuning, lens_labels = generate_summary_k(test_samples, li_val, model, k_generation=True)

In [None]:
# Show the first ten generated label strings.
labels_after_tuning[:10]

In [None]:
# Print the number of reference labels for the first 30 test examples.
print(lens_li_labels[:30])

In [37]:
#SAVE RESULTS
# One row per test example: input text, reference labels, number of reference
# labels, recorded output length and the generated labels.
test_df = pd.DataFrame(
    data={
        'segment': test_samples["text"],
        'reference': test_samples["label"],
        'expected li': lens_li_labels,
        'length': lens_labels,
        'hypothese': labels_after_tuning,
    }
)

test_df.to_csv(
    f"{output_path}/results/results.csv",
    index=False,
    encoding='UTF-8',
)