# Replication Pipeline for Fine-Tuning Results

This notebook contains the code to reproduce the fine-tuned LLM results reported in our paper. To run the code, execute all code blocks. Because each model is trained three times per dataset (once for each of three random seeds), execution will take a considerable amount of time. See our note on Google Colab in the accompanying `README` file for an option to speed up execution.

### Preliminaries:

In [None]:
# Install required packages (only required if not already installed)
# !pip install sentencepiece
# !pip install pandas
# !pip install numpy
# !pip install wandb
# !pip install scikit-learn
# !pip install torch
# !pip install torchmetrics
# !pip install transformers
# !pip install tqdm

In [None]:
%env CUBLAS_WORKSPACE_CONFIG=:4096:8
%env TOKENIZERS_PARALLELISM=false

# Import standard Python packages
import pandas as pd
import numpy as np
import pickle
import shutil
import glob
import time
import os
import gc

# Import deep learning packages
import torch
torch.backends.cuda.matmul.allow_tf32 = True
import wandb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

# Import pipeline code
from src.finetuning import train_and_predict_test, init_model, predict_y_from_trained_model, read_x_from_csv, init_misc, compute_and_print_metrics_for_dataset_b, set_seeds

### Loop through data sets and models to create results for tables 1-4

To run the code selectively for individual datasets and/or models, adjust the `range()` arguments for the `ds` and `lm` iterators (note that `range()` excludes its upper bound). For example, to run the code for only `DATASET = "02-twitter-stance"` and `LANGUAGE_MODEL = "DEB-V3"`, set `for ds in range(2,3)` and `for lm in range(3,4)`.

To run the German language model for case study 3, switch from `dataset_sentences = f"./data/{DATASET}/all-x-translated.csv"` to the non-translated dataset (the commented-out line directly preceding it), set `LANGUAGE_FOR_MODEL="de"`, and activate `LANGUAGE_MODEL = "ELE-BS-GER"`.

In [None]:
# Loop through datasets
#
# For every dataset/model pair this cell
#   1. loads the data, shuffles it and holds out a fixed test set (dataset B),
#   2. fine-tunes the model once per seed in TRAINING_SEEDS on the rest (dataset A),
#   3. passes the per-seed test predictions to compute_and_print_metrics_for_dataset_b,
#   4. moves the prediction files from ./output to ./results and removes
#      the temporary files in the working directory.

# Abbreviation of the language model selected by each value of `lm`
LANGUAGE_MODELS = {
    1: "ROB-BASE",    # https://huggingface.co/roberta-base
    2: "ROB-LRG",     # https://huggingface.co/roberta-large
    3: "DEB-V3",      # https://huggingface.co/microsoft/deberta-v3-large
    4: "ELE-LRG",     # https://huggingface.co/google/electra-large-discriminator
    5: "XLNET-LRG",   # https://huggingface.co/xlnet-large-cased
}

# One complete training run is performed for each of these seeds
TRAINING_SEEDS = (1234, 3456, 5678)

# ************************************************ #
# Training configuration shared by all runs
# ************************************************ #

N_EPOCHS = 10

GRADIENT_ACC_STEPS = 8

DROPOUT_RATE = 0.1

LEARNING_RATE = 1e-5

IMBALANCE_STRATEGY = 'loss_weight'

IS_DEBUG_ENABLED = True

DO_VALIDATION_SET = False

# test set - fix split with N=200
N_SUBSET_FOR_B = 200

# ************************************************ #

# Option 1: no external logging of the training metrics
IS_LOGGING_ENABLED = False
wandb_config = None

# Option 2: external logging of the training metrics (for finetuned analysis and optimization of hyperparameters)
# IS_LOGGING_ENABLED = True
# wandb_config = { "project": "ipz-nlp", "entity": "mnbucher" }

# ************************************************ #

# Predictions are written to ./output and later moved to ./results.
# Create both folders up front so that a fresh checkout does not fail
# with FileNotFoundError on the first np.savetxt / shutil.move.
os.makedirs("./output", exist_ok=True)
os.makedirs("./results", exist_ok=True)

for ds in range(1,5):

    # ************************************************ #
    # Choose dataset for reproduction of paper results
    # ************************************************ #

    # case study 1
    if ds == 1:
        DATASET = "01-nyt-sentiment"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # case study 2
    elif ds == 2:
        DATASET = "02-twitter-stance"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # case study 3
    elif ds == 3:
        DATASET = "03-emotion-angry"
        #dataset_sentences = f"./data/{DATASET}/all-x.csv"
        dataset_sentences = f"./data/{DATASET}/all-x-translated.csv"

    # case study 4
    elif ds == 4:
        DATASET = "04-brexit-stance"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # ************************************************ #

    LANGUAGE_FOR_MODEL="en"
    #LANGUAGE_FOR_MODEL="de"

    # ************************************************ #

    # Loop through language models
    for lm in range(1,6):

        LANGUAGE_MODEL = LANGUAGE_MODELS[lm]

        # LANGUAGE_MODEL = "ELE-BS-GER"
        # To use the electra base model in german,
        # set LANGUAGE_FOR_MODEL="de" above.
        # Also choose non-translated data for case study 3.
        # https://huggingface.co/german-nlp-group/electra-base-german-uncased

        # For BART and ChatGPT, see separate notebooks.

        # To use a custom model from Huggingface,
        # set the model ID with the following variable:

        CUSTOM_MODEL_NAME = None

        # ************************************************ #

        RUN_ID = DATASET + "-" + LANGUAGE_MODEL + "-" + LANGUAGE_FOR_MODEL

        # DEB-V3 (lm == 3) is trained with half the batch size of the other
        # models (presumably to fit into GPU memory - TODO confirm).
        BATCH_SIZE = 2 if lm == 3 else 4

        # ************************************************ #

        dataset_labels = f"./data/{DATASET}/all-y.csv"

        # The sentence file is parsed with a two-tab separator (needs the python engine)
        all_x = np.squeeze(np.array(pd.read_csv(dataset_sentences, header=None, sep='\t\t', engine='python')))
        all_y = np.squeeze(np.array(pd.read_csv(dataset_labels, dtype=np.float32, header=None)))

        os.makedirs(f'./data/{DATASET}/{RUN_ID}', exist_ok=True)

        print(all_x.shape, all_y.shape)

        # ************************************************ #

        # random shuffling of loaded dataset
        # (the split is always drawn with seed 1234, independent of the training seeds)
        RAND_SEED = 1234
        set_seeds(RAND_SEED)
        idxs_shuffle = np.arange(all_x.shape[0])
        np.random.shuffle(idxs_shuffle)
        all_x = all_x[idxs_shuffle]
        all_y = all_y[idxs_shuffle]

        # test set: N_SUBSET_FOR_B distinct examples drawn without replacement
        mask_b = np.zeros(all_x.shape[0], dtype=bool)
        idxs_b = np.random.choice(np.arange(all_x.shape[0]), replace=False, size=N_SUBSET_FOR_B)
        mask_b[idxs_b] = True
        dataset_B_unlabelled_x = all_x[mask_b]
        dataset_B_unlabelled_y = all_y[mask_b]
        np.savetxt(f"./data/{DATASET}/{RUN_ID}/dataset-b-x.csv", dataset_B_unlabelled_x, "%s", encoding="utf-8")
        np.savetxt(f"./data/{DATASET}/{RUN_ID}/dataset-b-y-true.csv", dataset_B_unlabelled_y, encoding="utf-8")

        # training set: everything that is not part of the test set
        mask_a = ~mask_b
        dataset_A_labelled_x = all_x[mask_a]
        dataset_A_labelled_y = all_y[mask_a]

        print("")
        print("dataset full size: ", all_x.shape)
        print("")
        print("dataset_B_unlabelled_x: ", dataset_B_unlabelled_x.shape)
        print("dataset_B_unlabelled_y: ", dataset_B_unlabelled_y.shape)
        print("")
        print("dataset_A_labelled_x: ", dataset_A_labelled_x.shape)
        print("dataset_A_labelled_y: ", dataset_A_labelled_y.shape)
        print("")


        ## A: Full Training Datasets

        # ************************************************ #

        # Keyword arguments that are identical for every seed
        train_kwargs = dict(
            learning_rate=LEARNING_RATE,
            dropout_rate=DROPOUT_RATE,
            batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACC_STEPS,
            language_model=LANGUAGE_MODEL,
            language_for_model=LANGUAGE_FOR_MODEL,
            custom_model_name=CUSTOM_MODEL_NAME,
            do_validation_set=DO_VALIDATION_SET,
            log_with_wandb=IS_LOGGING_ENABLED,
            is_debug=IS_DEBUG_ENABLED,
            wandb_config=wandb_config,
        )

        # Train once per seed. RAND_SEED is deliberately the loop variable so
        # that it holds the last training seed afterwards, as it did before.
        for RAND_SEED in TRAINING_SEEDS:
            set_seeds(RAND_SEED)
            y_pred_for_seed = train_and_predict_test(dataset_A_labelled_x, dataset_A_labelled_y, RUN_ID, N_EPOCHS, IMBALANCE_STRATEGY, dataset_B_unlabelled_x, rand_seed=RAND_SEED, **train_kwargs)

            # temporary copy in the working directory (read back below, then deleted)
            np.savetxt(f"y_pred_{RAND_SEED}.csv", y_pred_for_seed, delimiter=",")
            # permanent copy (moved to ./results at the end of this iteration)
            np.savetxt(f"./output/{RUN_ID}_predictions-{RAND_SEED}.csv", y_pred_for_seed, fmt='%f')

            gc.collect()

        # ************************************************ #

        # Read the per-seed predictions back from the temporary files
        y_preds = [
            np.squeeze(np.asarray(pd.read_csv(f"y_pred_{seed}.csv", sep=',', header=None)))
            for seed in TRAINING_SEEDS
        ]

        print(dataset_B_unlabelled_y.shape, *(y_pred.shape for y_pred in y_preds))

        compute_and_print_metrics_for_dataset_b(dataset_B_unlabelled_y, y_preds, None, RAND_SEED, False, True, dataset_name = RUN_ID)

        # clean up: move the prediction files from ./output to ./results
        for output_file in glob.glob('./output/*.csv'):
            shutil.move(output_file, os.path.join("./results", os.path.basename(output_file)))

        # WARNING: this removes *every* .csv and .txt file in the current working
        # directory, not only the temporary y_pred_*.csv files written above
        # (presumably to also drop files written by the pipeline helpers - TODO confirm).
        # Do not keep files of your own with these extensions next to the notebook.
        for temp_file in glob.glob('./*.csv') + glob.glob('./*.txt'):
            os.remove(temp_file)

### Create results for section "Fine-Tuning: The Effect of Training Set Size on Model Performance"

The following code loops through all datasets and produces results for `LANGUAGE_MODEL = "ROB-LRG"` based on varying training dataset sizes (50, 100, 200, 500, 1000). We preserve the option to activate any of the other models via the range arguments for `lm`.

Also see the notebook `plot_ablation_study.ipynb` in the `figures` folder, which creates the plots in Figure 4 based on the results of the below code.

In [None]:
# Loop through datasets
#
# For every dataset/model pair this cell holds out the same kind of fixed
# test set (dataset B) as the main experiment and then fine-tunes the model
# on training subsets of the sizes listed in SUBSET_SIZES, using a single seed.

# Abbreviation of the language model selected by each value of `lm`
LANGUAGE_MODELS = {
    1: "ROB-BASE",    # https://huggingface.co/roberta-base
    2: "ROB-LRG",     # https://huggingface.co/roberta-large
    3: "DEB-V3",      # https://huggingface.co/microsoft/deberta-v3-large
    4: "ELE-LRG",     # https://huggingface.co/google/electra-large-discriminator
    5: "XLNET-LRG",   # https://huggingface.co/xlnet-large-cased
}

# Number of training examples drawn for each ablation run
SUBSET_SIZES = [50, 100, 200, 500, 1000]

# ************************************************ #
# Training configuration shared by all runs
# ************************************************ #

N_EPOCHS = 10

GRADIENT_ACC_STEPS = 8

DROPOUT_RATE = 0.1

LEARNING_RATE = 1e-5

IMBALANCE_STRATEGY = 'loss_weight'

IS_DEBUG_ENABLED = True

DO_VALIDATION_SET = False

# test set
# fix split with N=200
N_SUBSET_FOR_B = 200

# ************************************************ #

# Option 1: no external logging of the training metrics
IS_LOGGING_ENABLED = False
wandb_config = None

# Option 2: external logging of the training metrics (for finetuned analysis and optimization of hyperparameters)
# IS_LOGGING_ENABLED = True
# wandb_config = { "project": "ipz-nlp", "entity": "mnbucher" }

# ************************************************ #

# Predictions are written to ./output; create the folder up front so that
# running this cell on its own does not fail with FileNotFoundError.
os.makedirs("./output", exist_ok=True)

for ds in range(1,5):

    # ************************************************ #
    # Choose dataset for reproduction of paper results
    # ************************************************ #

    # case study 1
    if ds == 1:
        DATASET = "01-nyt-sentiment"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # case study 2
    elif ds == 2:
        DATASET = "02-twitter-stance"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # case study 3
    elif ds == 3:
        DATASET = "03-emotion-angry"
        #dataset_sentences = f"./data/{DATASET}/all-x.csv"
        dataset_sentences = f"./data/{DATASET}/all-x-translated.csv"

    # case study 4
    elif ds == 4:
        DATASET = "04-brexit-stance"
        dataset_sentences = f"./data/{DATASET}/all-x.csv"

    # ************************************************ #

    LANGUAGE_FOR_MODEL="en"
    #LANGUAGE_FOR_MODEL="de"

    # ************************************************ #

    # Loop through language models (only ROB-LRG by default)
    for lm in range(2,3):

        LANGUAGE_MODEL = LANGUAGE_MODELS[lm]

        # LANGUAGE_MODEL = "ELE-BS-GER"
        # To use the electra base model in german,
        # set LANGUAGE_FOR_MODEL="de" above.
        # https://huggingface.co/german-nlp-group/electra-base-german-uncased

        # For BART and ChatGPT, see separate notebooks.

        # To use a custom model from Huggingface,
        # set the model ID with the following variable:

        CUSTOM_MODEL_NAME = None

        # ************************************************ #

        RUN_ID = DATASET + "-" + LANGUAGE_MODEL + "-" + LANGUAGE_FOR_MODEL

        # DEB-V3 (lm == 3) is trained with half the batch size of the other
        # models (presumably to fit into GPU memory - TODO confirm).
        BATCH_SIZE = 2 if lm == 3 else 4

        # ************************************************ #

        dataset_labels = f"./data/{DATASET}/all-y.csv"

        # The sentence file is parsed with a two-tab separator (needs the python engine)
        all_x = np.squeeze(np.array(pd.read_csv(dataset_sentences, header=None, sep='\t\t', engine='python')))
        all_y = np.squeeze(np.array(pd.read_csv(dataset_labels, dtype=np.float32, header=None)))

        os.makedirs(f'./data/{DATASET}/{RUN_ID}', exist_ok=True)

        print(all_x.shape, all_y.shape)

        # ************************************************ #

        # random shuffling of loaded dataset
        RAND_SEED = 1234
        set_seeds(RAND_SEED)
        idxs_shuffle = np.arange(all_x.shape[0])
        np.random.shuffle(idxs_shuffle)
        all_x = all_x[idxs_shuffle]
        all_y = all_y[idxs_shuffle]

        # test set: N_SUBSET_FOR_B distinct examples drawn without replacement
        mask_b = np.zeros(all_x.shape[0], dtype=bool)
        idxs_b = np.random.choice(np.arange(all_x.shape[0]), replace=False, size=N_SUBSET_FOR_B)
        mask_b[idxs_b] = True
        dataset_B_unlabelled_x = all_x[mask_b]
        dataset_B_unlabelled_y = all_y[mask_b]
        np.savetxt(f"./data/{DATASET}/{RUN_ID}/dataset-b-x.csv", dataset_B_unlabelled_x, "%s", encoding="utf-8")
        np.savetxt(f"./data/{DATASET}/{RUN_ID}/dataset-b-y-true.csv", dataset_B_unlabelled_y, encoding="utf-8")

        # training pool: everything that is not part of the test set
        mask_a = ~mask_b
        dataset_A_labelled_x = all_x[mask_a]
        dataset_A_labelled_y = all_y[mask_a]

        ## B: Ablation Study for Different Dataset Sizes

        # Keyword arguments that are identical for every subset size
        train_kwargs = dict(
            learning_rate=LEARNING_RATE,
            dropout_rate=DROPOUT_RATE,
            batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACC_STEPS,
            rand_seed=RAND_SEED,
            language_model=LANGUAGE_MODEL,
            language_for_model=LANGUAGE_FOR_MODEL,
            custom_model_name=CUSTOM_MODEL_NAME,
            do_validation_set=DO_VALIDATION_SET,
            is_debug=IS_DEBUG_ENABLED,
            log_with_wandb=IS_LOGGING_ENABLED,
            wandb_config=wandb_config,
        )

        for subset_size in SUBSET_SIZES:

            # Re-seed before every draw so each subset is reproducible on its own
            set_seeds(RAND_SEED)

            # NOTE(review): np.random.choice samples WITH replacement by default,
            # so a subset can contain the same training example more than once.
            # This is kept unchanged so that the reported results stay reproducible.
            idxs_sub = np.random.choice(dataset_A_labelled_x.shape[0], subset_size)

            # The full training pool is left untouched; only the subset is trained on
            subset_x = dataset_A_labelled_x[idxs_sub]
            subset_y = dataset_A_labelled_y[idxs_sub]

            print("")
            print("dataset full size: ", all_x.shape)
            print("")
            print("dataset_B_unlabelled_x: ", dataset_B_unlabelled_x.shape)
            print("dataset_B_unlabelled_y: ", dataset_B_unlabelled_y.shape)
            print("")
            print("dataset_A_labelled_x: ", subset_x.shape)
            print("dataset_A_labelled_y: ", subset_y.shape)
            print("")

            y_pred_for_subset = train_and_predict_test(subset_x, subset_y, RUN_ID, N_EPOCHS, IMBALANCE_STRATEGY, dataset_B_unlabelled_x, **train_kwargs)
            np.savetxt(f"./output/{RUN_ID}_predictions-{RAND_SEED}-sub-{subset_size}.csv", y_pred_for_subset, fmt='%f')

            compute_and_print_metrics_for_dataset_b(dataset_B_unlabelled_y, [ y_pred_for_subset ], None, RAND_SEED, False, True, dataset_name = f'{RUN_ID}_{subset_size}')