In [None]:
# connect to the local google driver
from google.colab import drive
drive.mount('/content/drive')

## BIDIRECTIONAL ENCODER REPRESENTATIONS FROM TRANSFORMERS (BERT)

 ## Parameter Efficient Fine-Tuning (PEFT):
  Parameter-efficient fine-tuning (PEFT) is a family of fine-tuning techniques that is much cheaper than full fine-tuning. Fully fine-tuning a large language model (LLM) demands significant computational resources: memory is needed not only to store the model weights but also for the gradients and optimizer states kept during training, which is a challenge on modest hardware. PEFT addresses this by updating only a small subset of parameters (or a small number of added ones) and “freezing” the rest. This reduces the number of trainable parameters, makes memory requirements more manageable, and helps prevent catastrophic forgetting. Unlike full fine-tuning, PEFT leaves the original LLM weights unchanged, so previously learned information is not overwritten. It also eases storage when fine-tuning for multiple tasks, because only the small set of task-specific parameters has to be saved per task. There are various ways of achieving parameter-efficient fine-tuning; Low-Rank Adaptation (LoRA) and QLoRA are among the most widely used and effective.

  ## QLoRA
  QLoRA is a more memory-efficient variant of LoRA. It quantizes the weights of the frozen pre-trained model to 4-bit precision (e.g., the NF4 data type) and trains LoRA adapters (small low-rank matrices) on top of it; the adapters themselves are kept in higher precision (e.g., 16-bit). In QLoRA the pre-trained model is therefore loaded into GPU memory with 4-bit weights, whereas plain LoRA usually keeps the base model in 16-bit (or 8-bit) precision. This further reduces the memory footprint and storage requirements. Despite the reduced bit precision, QLoRA maintains a level of effectiveness comparable to LoRA.

In [None]:
# install required libraries
# pip install -h
# https://pip.pypa.io/en/stable/cli/pip/ --- -q
# https://pip.pypa.io/en/stable/cli/pip_list/ ---- -u
!pip install -q -U bitsandbytes transformers datasets peft accelerate scipy einops evaluate trl rouge_score

In [None]:
!pip install python-dotenv

In [None]:
import pandas as pd
from huggingface_hub import login, whoami
from dotenv import load_dotenv
import os
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

from transformers import(
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    pipeline,
    set_seed,
    #whoami,
)
import time
from trl import SFTTrainer


In [None]:
import transformers
print(transformers.__version__)

In [None]:
from datasets import Dataset

In [None]:
import evaluate

In [None]:
seed = 42
set_seed(seed)

In [None]:
# # disable Weights and Biases
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Log in to the Hugging Face Hub.
# The token is read from the HUG_FACE_API_KEY entry of a .env file stored on the
# mounted Google Drive, so it is not written into the notebook itself.
load_dotenv(dotenv_path = "/content/drive/MyDrive/Thesis/env/.env")
api_key = os.getenv("HUG_FACE_API_KEY")  # None when the variable is not set
login(api_key)

# Confirm the login by querying the account the token belongs to; the message is
# only printed when the account reports a verified e-mail address.
user = whoami()
if user.get('emailVerified'):
  print(f"{user.get('name')}...you logged successfully!!")


In [None]:
# load the dataset, convert it into the hugging face dataset
def load_data_convert_dataset(seed = 42, data_dir = '/content/drive/MyDrive/Thesis/data'):
  """Load the CT24 check-worthiness files and return them as shuffled Hugging Face Datasets.

  Args:
    seed: Seed used to shuffle every split, so the order is reproducible.
    data_dir: Directory that contains the `train/`, `test/` and `dev/` sub-folders.
      Defaults to the Google Drive location used by this notebook.

  Returns:
    A 5-tuple `(train, test_gold, test, dev_test, dev)` of `datasets.Dataset` objects.
    Judging by how the later cells use them, every split except `test` carries a
    `class_label` column (Yes/No); `test` is the unlabeled version of `test_gold`.
  """
  def read_tsv(relative_path):
    # The CT24 test/dev files are tab-separated.
    return pd.read_csv(os.path.join(data_dir, relative_path), delimiter = '\t')

  # load the data
  train_df = pd.read_csv(os.path.join(data_dir, 'train/data_llm_fine_tune.csv'))
  test_gold_df = read_tsv('test/CT24_checkworthy_english_test_gold.tsv')  # with labels
  test_df = read_tsv('test/CT24_checkworthy_english_test.tsv')            # without labels
  dev_test_df = read_tsv('dev/CT24_checkworthy_english_dev-test.tsv')     # with labels
  dev_df = read_tsv('dev/CT24_checkworthy_english_dev.tsv')               # with labels

  # Drop the index column written by an earlier `to_csv`; tolerate files saved
  # without it instead of failing with a KeyError.
  train_df = train_df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

  # Convert each frame to a Dataset and shuffle it with the same seed.
  frames = (train_df, test_gold_df, test_df, dev_test_df, dev_df)
  return tuple(Dataset.from_pandas(frame).shuffle(seed = seed) for frame in frames)

In [None]:
# call the function and print the features of the dataest
# dataset quick start : https://huggingface.co/docs/datasets/en/quickstart
train_dataset, test_gold_dataset, test_dataset, eval_test_dataset, eval_dataset = load_data_convert_dataset()
print(train_dataset.features) ## Yes/No
print(test_gold_dataset.features) ## Yes/No
print(test_dataset.features) ## Without Lable
print(eval_test_dataset.features) ## Yes/No
print(eval_dataset.features) ## Yes/No

In [None]:
eval_dataset[1]

In [None]:
# 1. load dataset ---
# 2. create bitsandbytes configuration
# 3. load pretrained model
# 4. tokenization
# 5. preprocess dataset
# 6. prepare the model for QLoRA
# 7. set up PEFT for fine tuning
# 8. train PEFT adpater
# 9. evaluate the model

##########################

# 1. load and preprocess the dataset ----
# 2. tokenize the dataset
# 3. set up QLoRA
# 4. load model with QLoRA
# 5. Apply PEFT
# 6. Train PEFT Adpater
# 7. Evaluate the model and save

In [None]:
model_name = "bert-base-uncased"

In [None]:
# Quantization guide: https://huggingface.co/docs/peft/en/developer_guides/quantization
# bitsandbytes is a lightweight wrapper around custom CUDA functions (optimizers,
# matrix multiplication, quantization); this configuration asks it to load the
# model weights in 4-bit precision.
compute_dtype = torch.float16  # dtype used for computation on top of the 4-bit weights
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = True,  # nested (double) quantization
    bnb_4bit_quant_type = "nf4",       # 4-bit NormalFloat
    load_in_4bit = True,               # load the model in 4-bit format
)


In [None]:
# print(compute_dtype)

In [None]:
# https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained
# load  pretrained model  using 4-bit quantization
original_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2, # binary classification
    quantization_config= bnb_config, # load model  using 4-bit quantization
    )

In [None]:
# original_model.config

In [None]:
# getting max length for embedding
# for generating input sequence of cinsistent length, which is beneficial for fine-tuning the language model by optimizing efficiency and minimizing computational overhead. It is essential to ensure that these sequences do not surpass the model’s maximum token limit.
def get_max_length(model):
  """Return the maximum sequence length declared in `model.config`.

  The config attributes `n_positions`, `max_position_embeddings` and `seq_length`
  are checked in that order and the first truthy value wins. When none of them is
  set, 512 is used as a fallback. The chosen value is also printed.
  """
  config = model.config
  attribute_names = ("n_positions", "max_position_embeddings", "seq_length")
  candidates = (getattr(config, name, None) for name in attribute_names)
  # Lazily take the first attribute that holds a usable (truthy) value.
  found = next((value for value in candidates if value), None)
  if found:
    print(f"Found max length: {found}")
    return found
  fallback = 512
  print(f"Using declared max length: {fallback}")
  return fallback

In [None]:
get_max_length(original_model)

In [None]:
# https://huggingface.co/docs/transformers/v4.17.0/en/model_doc/bert#transformers.BertTokenizer
# tokenizer configuration
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BERT uses absolute position embeddings, so it is usually advised to pad the
# inputs on the right rather than the left.
tokenizer.padding_side = "right"
# BERT tokenizers define no EOS token, so copying `eos_token` into `pad_token`
# would only clear the pad token. Keep the tokenizer's own pad token and register
# '[PAD]' only if none is defined.
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [None]:
tokenizer('America stands tall again, and as a result, we are credible.')

In [None]:
# preprocess--- tokenize the text in batch
def preprocess_text_in_batch(batch, tokenizer, max_length):
  """Tokenize the "Text" column of `batch`.

  The tokenizer is called with truncation enabled and `max_length` as the limit;
  whatever it returns is handed back unchanged.
  """
  texts = batch["Text"]
  tokenizer_options = {"max_length": max_length, "truncation": True}
  return tokenizer(texts, **tokenizer_options)

In [None]:
# preprocess -- preprocess the dataset
from functools import partial
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset, seed):
  """Tokenize `dataset` in batches and return it shuffled.

  Args:
    tokenizer: Tokenizer applied to the "Text" column of every batch.
    max_length: Truncation limit forwarded to the tokenizer.
    dataset: Dataset to tokenize; it must expose `map` and `shuffle`.
    seed: Seed for the final shuffle.

  Returns:
    The mapped (tokenized) dataset after shuffling with `seed`.
  """
  print("start preprocessing.......")
  # Bind the fixed arguments so `map` only has to supply the batch.
  tokenize_batch = partial(preprocess_text_in_batch, max_length = max_length, tokenizer = tokenizer)
  tokenized = dataset.map(tokenize_batch, batched = True)
  return tokenized.shuffle(seed = seed)


In [None]:
# Tokenize every split, truncating to the maximum length read from the model config.
max_length = get_max_length(original_model)
print(max_length)
train_dataset = preprocess_dataset(tokenizer, max_length, train_dataset, seed)
eval_dataset = preprocess_dataset(tokenizer, max_length, eval_dataset, seed)
test_dataset =  preprocess_dataset(tokenizer, max_length, test_dataset, seed)# presumably unlabeled: the later class_label renaming skips this split
test_gold_dataset = preprocess_dataset(tokenizer, max_length, test_gold_dataset, seed) # labeled: its class_label column is renamed/mapped later
eval_test_dataset = preprocess_dataset(tokenizer, max_length, eval_test_dataset, seed)

In [None]:
print(f"Training dataset size{train_dataset.shape}")
print(f"Validation dataset size{eval_dataset.shape}")
print(f"Test dataset size{test_gold_dataset.shape}")

In [None]:
eval_dataset[1]

In [None]:
#train_dataset[1]

In [None]:
# prepare model for kbit training :https://huggingface.co/docs/peft/en/developer_guides/quantization
# essential for QLoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
kbit_model = prepare_model_for_kbit_training(original_model)


In [None]:
# create LoRA config
# https://github.com/huggingface/peft/blob/v0.15.0/src/peft/tuners/lora/config.py#L199
# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig
lora_config = LoraConfig(
    r = 8,  # rank of the low-rank update matrices
    lora_alpha = 16,  # scaling parameter for the LoRA update
    lora_dropout=0.1,  # dropout probability of the LoRA layers
    bias = "none",  # do not train any bias parameters
    task_type=TaskType.SEQ_CLS, # sequence classification: one label per input text (not next-word prediction)
)

In [None]:
# enable gradient checkpointing to reduce memory usage during fine tuning
kbit_model.gradient_checkpointing_enable()
# use the get_peft_model() function to create a PEFTModel from the quantized model and configuration
peft_model = get_peft_model(kbit_model, lora_config)

In [None]:
# see the trainable parameters
peft_model.print_trainable_parameters()

In [None]:
## rename column class_label to label and map the Yes/No values to 1/0
## https://huggingface.co/docs/datasets/v3.5.1/en/package_reference/main_classes#datasets.Dataset.rename_column
## https://huggingface.co/docs/datasets/en/about_map_batch
def encode_binary_labels(dataset):
  """Rename `class_label` to `label` and encode it as 1 ("Yes") or 0 (anything else).

  A dataset without a `class_label` column is returned unchanged. This keeps the
  cell safe to re-run: a second pass would otherwise fail on the rename, or turn
  the already encoded labels into zeros.
  """
  if "class_label" not in dataset.column_names:
    return dataset
  dataset = dataset.rename_column("class_label", "label")
  # the mapped function has to return a dictionary (column name -> new value)
  return dataset.map(lambda x: {"label": 1 if x["label"] == "Yes" else 0})

train_dataset = encode_binary_labels(train_dataset)
eval_dataset = encode_binary_labels(eval_dataset)
test_gold_dataset = encode_binary_labels(test_gold_dataset)
# eval_test_dataset used to be renamed only; encode it like the other labeled splits
eval_test_dataset = encode_binary_labels(eval_test_dataset)

In [None]:
# eval_dataset[0]
print(train_dataset.features)
print(eval_dataset.features)
print(test_gold_dataset.features)

In [None]:
print(f"Training dataset size{train_dataset.shape}")
print(f"Validation dataset size{eval_dataset.shape}")

In [None]:
# train PEFT adapter
# define training arguments and create Trainer instance
# it has saved in gdrive... next time should be in drive ...

def compute_metrics(eval_pred):
  """Compute the F1 score from the logits and labels of an evaluation pass.

  Args:
    eval_pred: Pair `(logits, labels)`; `logits` must support `argmax(axis=-1)`
      and yield one class index per example.

  Returns:
    The dictionary produced by the `f1` metric of the `evaluate` library, i.e.
    `{"f1": <score>}`.
  """
  predictions_logit, labels = eval_pred
  # pick the class with the highest logit for every example
  predictions = predictions_logit.argmax(axis = -1)
  metric = evaluate.load("f1")
  # `compute` already returns {"f1": value}; wrapping it in another {"f1": ...}
  # would nest the dictionaries and hide the score from the Trainer.
  return metric.compute(predictions=predictions, references=labels)

# Checkpoints go to a fresh, timestamped folder on Google Drive so that runs do not overwrite each other.
output_dir = f'/content/drive/MyDrive/Thesis/fine-tuning/checkworthy-binary-classification-training-{str(int(time.time()))}'
# Train peft Adapter- define training arguments and create trainer instance
args = TrainingArguments(
    output_dir = output_dir,
    # --- earlier experiment (kept for reference) ---
    # overwrite_output_dir = True,
    # do_eval = True,
    # eval_strategy = "steps",
    # gradient_accumulation_steps = 4,
    # max_steps = 1000,
    # warmup_steps = 1,
    # logging_steps = 25,
    # save_strategy = "steps",
    # save_steps = 25,
    # eval_steps = 25,
    # # for full set of optimizers: https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
    # optim = "paged_adamw_8bit",
    # group_by_length = True,
    # report_to = None,
    # gradient_checkpointing = True,
    # # group_by_length  = True,
    # logging_dir = "./logs",
    # learning_rate = 2e-5,

    #### Fact Finder #######
    # num_train_epochs=3,
    # per_device_train_batch_size=2,
    # gradient_accumulation_steps=2,
    # logging_steps=25,
    # optim="paged_adamw_32bit",
    # eval_strategy="epoch",
    # learning_rate=2e-4,
    # bf16=False,
    # fp16=False,
    # weight_decay=0.001,
    # max_grad_norm=0.3, max_steps=-1, warmup_ratio=0.03, group_by_length=True,
    # #run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    # lr_scheduler_type='constant',
    # label_names = ["label"],
    #############################
    # --- settings actually used: log, evaluate and save once per epoch ---
    learning_rate = 2e-4,
    num_train_epochs = 10,
    logging_strategy = "epoch",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    #load_best_model_at_end = True,
    #metric_for_best_model = "f1",
    # NOTE(review): DataCollatorWithPadding appears to rename the `label` column to
    # `labels` before the batch reaches the Trainer, so a label name of "label" may
    # never be found at evaluation time (no eval loss/metrics, and `predict` returns
    # a (loss, logits) tuple, which the evaluation cell relies on via predictions[1]).
    # Confirm, and if switching to ["labels"], adjust that cell together with this one.
    label_names = ["label"]
)
# https://huggingface.co/docs/peft/en/task_guides/lora_based_methods
peft_model.config.use_cache = False
# Build the Trainer for the PEFT (LoRA) model.
peft_trainer = Trainer(
    model = peft_model,
    args = args,
    train_dataset = train_dataset,
    # `args` sets eval_strategy="epoch"; the per-epoch evaluation needs a dataset to run on
    eval_dataset = eval_dataset,
    # it could solve the issue for compute metrics: https://discuss.huggingface.co/t/why-do-i-get-no-validation-loss-and-why-are-metrics-not-calculated/32373
    #compute_metrics = compute_metrics,
    # pads every batch dynamically to its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer,) # pad_to_multiple_of = 8)
)
# https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.Trainer.data_collator

In [None]:
from datetime import datetime
start_time = datetime.now()
print(f"Training started at: {start_time.strftime('%Y-%m-%d-%H-%M')}")
peft_trainer.train()
end_time = datetime.now()
print(f"Training ended at: {end_time.strftime('%Y-%m-%d-%H-%M')}")


In [None]:
output_dir = "/content/drive/MyDrive/Thesis/models/fine-tuned-bert-QLoRA"
peft_trainer.save_model(output_dir)

In [None]:
# change the test dataset text to tokenized text
# test_gold_dataset[0] # with label
# test_gold_dataset = test_gold_dataset.rename_column("class_label", "label")
# test_gold_dataset = test_gold_dataset.map(lambda x:{"label": 1 if x["label"] == "Yes" else 0})
# test_dataset[0] # without label
# eval_test_dataset # with label
# eval_dataset[0] # with label
# predictions = peft_trainer.predict(eval_dataset)
#eval_prediction
# when i test the dataest should i remove the label columns
# also do in the evaluation dataset
# make a seperate dataset with sentence id and labels for both validation and test
# then compare with the prediction labels
# test the model
# Just run trainer.predict on your eval/test dataset.


In [None]:
# eval_dataset[0]

In [None]:
# score on eval dataset
eval_prediction = peft_trainer.predict(eval_dataset)
# `predictions` is either the logits array itself or a tuple of model outputs.
# With the current training arguments it is a tuple whose second entry holds the
# logits, so keep that index for tuples and use the array directly otherwise.
raw_predictions = eval_prediction.predictions
logits = raw_predictions[1] if isinstance(raw_predictions, tuple) else raw_predictions
# predicted class = index of the largest logit
model_predictions_eval = logits.argmax(axis = -1)
print(model_predictions_eval)
# get the true label in array for validation dataset
true_labels_eval = eval_dataset['label']
print(true_labels_eval)


In [None]:
# test data
# drop the label column
# test_prediction = peft_trainer.predict(test_gold_dataset)
# logits = test_prediction.predictions[1]
# model_predictions_test = logits.argmax(axis = -1)
# print(model_predictions_test)

# true_labels_test = test_gold_dataset["label"]
# print(true_labels_test)

In [None]:


f1_score_eval = f1_score(true_labels_eval, model_predictions_eval)
accuracy_score_eval = accuracy_score(true_labels_eval, model_predictions_eval)
precision_score_eval = precision_score(true_labels_eval, model_predictions_eval)
recall_score_eval = recall_score(true_labels_eval, model_predictions_eval)

print("Scores in evaluation dataset")
print(f"Accuracy: {accuracy_score_eval}")
print(f"F1: {f1_score_eval}")
print(f"Precision: {precision_score_eval}")
print(f"Recall: {recall_score_eval}")
print("\n\n")
# 12:00

In [None]:
# # print(test_prediction.predictions[1])
# logits = test_prediction.predictions[1]
# model_predictions_test = logits.argmax(axis = -1)
# print(model_predictions_test)


# f1_score_test = f1_score(true_labels_test, model_predictions_test)
# accuracy_score_test = accuracy_score(true_labels_test, model_predictions_test)
# precision_score_test = precision_score(true_labels_test, model_predictions_test)
# recall_score_test = recall_score(true_labels_test, model_predictions_test)

# print("Scores in test dataset")
# print(f"Accuracy score in test dataset: {accuracy_score_test}")
# print(f"F1 Score in test dataest: {f1_score_test}")
# print(f"Precision score in test dataset: {precision_score_test}")
# print(f"Recall score in test dataset{recall_score_test}")

In [None]:
# Inspect the shape of every array returned by the prediction step.
# `test_prediction` is never created (the test-set prediction cell is commented
# out), so inspect the evaluation predictions instead of raising a NameError.
prediction_parts = eval_prediction.predictions
for part in prediction_parts:
  print(part.shape)
count = len(prediction_parts)
print(count)

In [None]:
print(np.argmax([-3.3208096,  3.2438753], axis = -1))

In [None]:
# array = np.array(eval_prediction.predictions)

In [None]:
# prediction_array = np.array(eval_prediction.predictions)

In [None]:
# pred_class = np.argmax(eval_prediction.predictions,axis = -1)