In [1]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import csv
import torch
import gc
from numba import cuda
from sklearn.metrics import accuracy_score

#### Load data and encode class labels
    - Labels can be encoded for either multi-class or binary training/validation data.

In [2]:
def _cell_to_text(value):
    '''
    Flatten one cell of a feature column into a single value.

    Args:
        value: Cell content; a list/tuple of text pieces or a scalar.

    Returns:
        The pieces joined with single spaces for a list/tuple, otherwise the
        value unchanged.
    '''
    if isinstance(value, (list, tuple)):
        return " ".join(str(piece) for piece in value)
    return value


def load_data(file, relevant_cols, one_class=""):
    '''
    Load data from file, retrieve only the relevant columns into a Pandas DataFrame.
    Concatenate the text of all relevant columns into a "text" column in the
    DataFrame; every column's text is followed by ". ".

    Args:
        file: Path to the data file (read as JSON lines)
        relevant_cols: Relevant columns (features) to be used. Cells may hold a
                       plain value or a list of text pieces.
        one_class:
            default:- Empty string. Labels are encoded by label_encoding in its
                      multi-class mode.
                    - If a label is given, convert the data to binary-labeled,
                      with the given class is "1", the rest is "0".
    Returns:
        df: a Pandas DataFrame with the columns "text" and "tags"
    '''
    df = pd.read_json(file, lines=True)
    df["text"] = ""
    for col in relevant_cols:
        # Join list cells instead of exploding them: explode() repeats the row
        # label once per list element, so a cell with more than one element
        # could not be assigned back to the one-row-per-record "text" column.
        df["text"] += df[col].map(_cell_to_text) + ". "
    df = df.loc[:, ["text", "tags"]]
    # Unwrap the tag lists; this still relies on one tag per row.
    df.tags = df.tags.explode()
    df.tags = df.tags.apply(label_encoding, args=(one_class, ))
    return df


def label_encoding(text, one_class=""):
    '''
    Map a class label given as a String to its Integer code.

    Args:
        text: Label that needs to be encoded
        one_class:
            default:- Empty string. "phrase", "passage" and "multi" are encoded
                      to 0, 1 and 2 respectively.
                    - If a label is given, the encoding is binary: "1" when
                      text equals the given class, "0" otherwise.

    Returns:
        The integer code of the label. In the default mode a label that is
        not one of the three known classes is returned unchanged.
    '''
    # Binary mode: the selected class against everything else.
    if one_class:
        return 1 if text == one_class else 0

    # Multi-class mode: walk the fixed label -> code table.
    for label, code in (("phrase", 0), ("passage", 1), ("multi", 2)):
        if text == label:
            return code
    return text

#### Set up configurations for training transformer model
    - Because of time constraints, only two hyperparameters are tuned during validation: the learning rate and the warm-up ratio.
    - Warm-up is a way to reduce the primacy effect of the early training examples. The learning rate is increased linearly over the warm-up period. This often leads to faster training time.

In [3]:
def make_config(model_type, learn_rate, warmup, epochs, batch_size):
    '''
    Build one configuration per combination of the given hyper-parameter values.

    Combinations are generated with the learning rate varying slowest, then the
    warmup ratio, then the number of epochs, and the batch size varying fastest.

    Args:
        model_type: model type, used in the path of "best_model_dir"
        learn_rate: a list of learning rate
        warmup: a list of warmup ratio
        epochs: a list of numbers of training epochs
        batch_size: a list of training batch sizes

    Returns:
        configurations: a list of configurations (one dictionary each)
    '''
    return [
        {
            "overwrite_output_dir": True,
            "num_train_epochs": n_epochs,
            "learning_rate": rate,
            "warmup_ratio": ratio,
            "train_batch_size": size,
            "best_model_dir": fr"outputs/{model_type}/best_model",
            "output_dir": "outputs/",
        }
        for rate in learn_rate
        for ratio in warmup
        for n_epochs in epochs
        for size in batch_size
    ]

#### Train model with a given set of configurations.
    - The model being trained is the base version of DeBERTa, downloaded from Hugging Face using the simpletransformers library.
    - num_labels set to 3 for fine-tuning with our dataset.
    - ignore_mismatched_sizes=True is needed if the model_type and the model itself have some discrepancies.
    - Delete the model and flush CUDA memory after training each model to prevent out-of-memory errors.

In [4]:
def train_model(train_set, validation_set, model_type, model_name, configurations, num_labels=3):
    '''
    Train and evaluate one model per configuration.

    Each configuration dictionary is updated in place: its "output_dir" entry
    is replaced by a directory named after the model type, learning rate,
    warmup ratio, number of epochs and batch size.

    Args:
        train_set: training data in DataFrame format
        validation_set: validation data in DataFrame format
        model_type: type of model to train
        model_name: name of the model on Hugging Face
        configurations: a list of configurations to train the model on
        num_labels: number of class labels passed to the model (default 3)

    Returns:
        results: List with one tuple per configuration:
                 (learning rate, warmup ratio, first element returned by
                 eval_model on the training set, first element returned by
                 eval_model on the validation set)
    '''
    results = []

    for cfg in configurations:
        # One output directory per hyper-parameter combination.
        cfg["output_dir"] = (
            fr"outputs/{model_type}/{model_type}_"
            f"{cfg['learning_rate']!s}_{cfg['warmup_ratio']!s}"
            f"_e{cfg['num_train_epochs']!s}_b{cfg['train_batch_size']!s}"
        )

        classifier = ClassificationModel(
            model_type,
            model_name,
            num_labels=num_labels,
            args=cfg,
            ignore_mismatched_sizes=True,
        )
        classifier.train_model(train_set, eval_df=validation_set, acc=accuracy_score)

        # NOTE(review): the validation set is evaluated last on purpose --
        # presumably eval_model writes its results into output_dir, and the
        # validation results are the ones read back later. Confirm before
        # reordering.
        train_eval = classifier.eval_model(train_set, acc=accuracy_score)
        validation_eval = classifier.eval_model(validation_set, acc=accuracy_score)

        results.append(
            (cfg["learning_rate"], cfg["warmup_ratio"], train_eval[0], validation_eval[0])
        )

        # Release the model and flush the CUDA cache before the next run.
        del classifier
        gc.collect()
        torch.cuda.empty_cache()
    return results

#### Model Selection using Validation Set
    - The best model is selected using accuracy as validation metric.
    - All model results are then saved to a dataframe and a csv file.

In [5]:
def _parse_eval_results(lines):
    '''
    Parse the lines of an eval_results.txt file into a dictionary.

    Args:
        lines: iterable of text lines, each expected to look like "name = value"

    Returns:
        metrics: dictionary mapping each metric name to its value; values are
                 converted to float when possible and kept as text otherwise.
                 Lines without "=" are skipped.
    '''
    metrics = {}
    for line in lines:
        name, separator, value = line.partition("=")
        if not separator:
            continue
        value = value.strip()
        try:
            metrics[name.strip()] = float(value)
        except ValueError:
            metrics[name.strip()] = value
    return metrics


def result_table(model_type, configurations):
    '''
    Retrieve results from eval_results.txt in the output directory of every
    configuration (outputs/{model_type}/...), save them to
    "{model_type}_validation_results.csv" and display them with the best
    accuracy highlighted.

    Metrics are looked up by name rather than by line position, so the table
    stays correct when the file contains more metrics than "acc" and
    "eval_loss" or lists them in a different order.

    Args:
        model_type: model type that was trained
        configurations: configurations that were used to trained the model

    Returns:
        df: DataFrame containing the accuracy metric results

    Raises:
        KeyError: if a results file has no "acc" or no "eval_loss" entry
    '''
    results_dict = {"model_name" : [],
                    "learning_rate" : [],
                    "warmup_ratio" : [],
                    "num_epochs" : [],
                    "batch_size" : [],
                    "acc" : [],
                    "eval_loss" : []
                   }
    for config in configurations:
        eval_file = fr"outputs/{model_type}/{model_type}_" + \
                    str(config["learning_rate"]) + "_" + \
                    str(config["warmup_ratio"]) + "_e" + \
                    str(config["num_train_epochs"]) + "_b" + \
                    str(config["train_batch_size"]) +"/eval_results.txt"
        # The file is only read, so open it read-only.
        with open(eval_file, "r", encoding="utf-8") as file:
            metrics = _parse_eval_results(file)
        for required in ("acc", "eval_loss"):
            if required not in metrics:
                raise KeyError(f"'{required}' not found in {eval_file}")

        results_dict["model_name"].append(f"{model_type}")
        results_dict["learning_rate"].append(config["learning_rate"])
        results_dict["warmup_ratio"].append(config["warmup_ratio"])
        results_dict["num_epochs"].append(config["num_train_epochs"])
        results_dict["batch_size"].append(config["train_batch_size"])
        results_dict["acc"].append(metrics["acc"])
        results_dict["eval_loss"].append(metrics["eval_loss"])
    df = pd.DataFrame.from_dict(results_dict)
    df.to_csv(f"{model_type}_validation_results.csv")
    # Numeric "acc" values make highlight_max pick the numerically best row.
    df_style = df.style.highlight_max(subset="acc", color="aquamarine", axis=0)
    display(df_style)
    return df

#### Training and Evaluation Pipeline function:

In [6]:
def train_eval_pipeline(train_data,
                        validation_data,
                        columns,
                        model_type,
                        model_name,
                        num_labels=3,
                        epochs=[10],
                        batch_size=[8],
                        one_class="",
                        learn_rate = [4e-6, 1e-5, 4e-5, 1e-4],
                        warmup = [0.02, 0.06, 0.1],
                        ):
    '''
    Run the whole workflow: load the data, build the configurations, train
    one model per configuration and collect the results in a table.

    Args:
        - train_data: Path to training data file (Accept .jsonl file)
        - validation_data: Path to validation data file (Accept .jsonl file)
        - columns: Relevant columns (features) used to train models ("postText", "targetTitle" or both)
        - model_type: Model type for simpletransformers library ("bert", "distilbert", "deberta")
        - model_name: model name from Hugging Face (must be of the same type as model_type). For example:
            + model_type: "bert" - model_name:"bert-base-cased"
            + model_type: "distilbert" - model_name:"distilbert-base-cased"
            + model_type: "deberta" - model_name:"microsoft/deberta-base"
        - num_labels: Number of class labels; replaced by 2 whenever one_class is given.
        - epochs: A list of numbers of epochs to train the models on
        - batch_size: A list of batch sizes to train the models on
        - one_class: If a label is given, the data is loaded as binary-labeled
                     (that label against the rest). Empty string keeps all classes.
        - learn_rate: A list of learning rate to train the models on
        - warmup: A list of warm up ratio to train the models on

    Returns :
        - dataframe: DataFrame containing the results.
    '''
    # Binary labelling always means exactly two labels.
    label_count = 2 if one_class else num_labels

    # Training data is loaded first, then validation data.
    train_set, validation_set = (
        load_data(path, columns, one_class)
        for path in (train_data, validation_data)
    )

    configurations = make_config(
        model_type=model_type,
        learn_rate=learn_rate,
        warmup=warmup,
        epochs=epochs,
        batch_size=batch_size,
    )
    # The list returned by train_model is not needed here; the results are
    # read back by result_table.
    train_model(train_set, validation_set, model_type, model_name, configurations, label_count)
    return result_table(model_type, configurations)
