# Building a chatbot on UCF FAQ data by fine-tuning GPT2 model
### Installing accelerate and Parameter efficient fine-tuning library

In [1]:
!pip install --upgrade accelerate
!pip install peft

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.9.0


### Importing necessary packages

In [2]:
# we upgraded `accelerate` just because to import Trainer API
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, PeftModel, PeftConfig, LoraConfig, TaskType
from glob import glob
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

2024-03-19 02:38:56.386825: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 02:38:56.386946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 02:38:56.519982: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Initializing global constant variables

In [10]:
MODEL_NAME = "gpt2"
DATASET_NAME = "/kaggle/input/faq-ucf/faqsUcfDataset.csv"
SPLIT = 0.2
MAX_LENGTH = 256
EPOCHS = 150

### Defining a DataPreprocessor class that will:
#### 1. Load dataset
#### 2. Preprocess the dataset
#### 3. Initialize an AutoTokenizer and use it to tokenize the dataset
#### 4. Include labels to the dataset

In [4]:
class DataPreprocessor:
    def __init__(self, model_name, max_length):
        # Initialize the DataPreprocessor with the specified model name and max_length
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.pad_token = "<pad>"
    
    def load_dataset(self, dataset_name, split):
        # Load the dataset using the specified split
        try:
            print(f"Loading {dataset_name} dataset...")
            df = pd.read_csv(dataset_name)
            train_ds, validation_ds = train_test_split(df, test_size=SPLIT)
            train_ds = train_ds.reset_index()
            validation_ds = validation_ds.reset_index()
            print(f"Loaded {dataset_name} dataset.")
            return train_ds, validation_ds
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return None, None
    
    def preprocess(self, dataset):
        print("Preprocessing dataset...")
        try:
            # Preprocess the dataset by combining question and answer text
            dataset["text"] = dataset["Q"] + " " + dataset["A"]
            # Drop original columns
            preprocessed_dataset = dataset.drop(['Q', 'A'], axis=1)
            print("Dataset preprocessing completed.")
            return preprocessed_dataset
        except Exception as e:
            print(f"Error preprocessing dataset: {e}")
            return None
    
    def tokenize(self, dataset):
        # Tokenize the dataset
        print("Tokenizing dataset...")
        def tokenize_function(examples):
            return self.tokenizer(examples["text"], max_length=self.max_length, truncation=True, padding="max_length")
        
        try:
            tokenized_dataset = dataset.apply(tokenize_function, axis=1)
            print("Dataset tokenization completed.")
            return tokenized_dataset
        except Exception as e:
            print(f"Error tokenizing dataset: {e}")
            return None
    
    def add_labels(self, dataset):
        # Add labels to the dataset
        print("Adding labels to dataset...")
        def copy_input_ids(example):
            example["labels"] = example["input_ids"].copy()
            return example
        
        try:
            labeled_dataset = dataset.apply(copy_input_ids)
            print("Labels added to dataset.")
            return labeled_dataset
        except Exception as e:
            print(f"Error adding labels to dataset: {e}")
            return None
    
    def preprocess_pipeline(self, dataset_name, split):
        # Execute the preprocessing pipeline
        train_ds, validation_ds = self.load_dataset(dataset_name, split)
        if train_ds is None or validation_ds is None:
            # Dataset loading failed, return None
            print("Preprocessing pipeline aborted due to dataset loading error.")
            return None, None
        
        train_ds = self.preprocess(train_ds)
        validation_ds = self.preprocess(validation_ds)
        if train_ds is None or validation_ds is None:
            # Dataset preprocessing failed, return None
            print("Preprocessing pipeline aborted due to dataset preprocessing error.")
            return None, None
        
        train_ds = self.tokenize(train_ds)
        validation_ds = self.tokenize(validation_ds)
        if train_ds is None or validation_ds is None:
            # Dataset tokenization failed, return None
            print("Preprocessing pipeline aborted due to dataset tokenization error.")
            return None, None
        
        train_ds = self.add_labels(train_ds)
        validation_ds = self.add_labels(validation_ds)
        if train_ds is None or validation_ds is None:
            # Adding labels failed, return None
            print("Preprocessing pipeline aborted due to label addition error.")
            return None, None
        
        return train_ds, validation_ds

### Initializing a DataPreprocessor object and using it to load and preprocess UCF_FAQ dataset

In [5]:
# DataPreprocessor instance
preprocessor = DataPreprocessor(MODEL_NAME, MAX_LENGTH)
# Obtaining train, val datasets and tokenizing them
tokenized_train_ds, tokenized_validation_ds = preprocessor.preprocess_pipeline(DATASET_NAME, SPLIT)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading /kaggle/input/faq-ucf/faqsUcfDataset.csv dataset...
Loaded /kaggle/input/faq-ucf/faqsUcfDataset.csv dataset.
Preprocessing dataset...
Dataset preprocessing completed.
Preprocessing dataset...
Dataset preprocessing completed.
Tokenizing dataset...
Dataset tokenization completed.
Tokenizing dataset...
Dataset tokenization completed.
Adding labels to dataset...
Labels added to dataset.
Adding labels to dataset...
Labels added to dataset.


### Initializing pretrained GPT2 model

In [6]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Initializing Low Rank Adaptation configurations
### Advantages of using LoRA :
- Trainable parameter reduction -> Less data is required for fine-tuning
- Faster training
- Uses less memory
- PEFT library makes it look so easy!

In [7]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # task_type, token classification (TaskType.CAUSAL_LM)
    inference_mode=False,
    r=8,                           # r, the dimension of the low-rank matrices
    lora_alpha=16,                 # lora_alpha, scaling factor for the weight matrices
    lora_dropout=0.3,              # lora_dropout, dropout probability of the LoRA layers
    fan_in_fan_out=True,
    bias="lora_only"               # bias, set to only lora layers to train
    
)

### Converting our pretrained GPT2 model to lora_model

In [8]:
# Converting our model to lora
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.23643136409814364


### Initializing training arguments for fine-tuning job

In [11]:
training_args = TrainingArguments(
    "gpt2-on-ucf-faq",
    # Number of epochs to train
    num_train_epochs=EPOCHS,
    # Training batch size
    per_device_train_batch_size=32,
    # Validation batch size
    per_device_eval_batch_size=32,
    # Number of parallel workers
    dataloader_num_workers=2,

    # Evaluating our model based on number of steps
    evaluation_strategy = "steps",
    logging_strategy="steps",
    save_strategy="steps",
    # Evaluate, log, and save the model every 150 steps
    eval_steps=300,
    logging_steps=300,
    save_steps=300,

    learning_rate=1e-3,
    weight_decay=0.01,
    save_total_limit=10,
    report_to='none',

    # Enabling the model to rollback to best checkpoint
    load_best_model_at_end=True,
)

### Building a trainer object on our model and training_arguments

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_validation_ds,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


### Training/Fine-tuning our lora-gpt2 model on UCF_FAQ dataset

In [13]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
300,1.1907,0.9441
600,0.8468,0.91769
900,0.723,0.925606
1200,0.6601,0.932816
1500,0.6272,0.932495


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1500, training_loss=0.8095712788899739, metrics={'train_runtime': 1225.5114, 'train_samples_per_second': 36.475, 'train_steps_per_second': 1.224, 'total_flos': 5860125337190400.0, 'train_loss': 0.8095712788899739, 'epoch': 150.0})


## Inference

In [19]:
# Create a test prompt
prompt = "How long does it take for an electronic transcript to arrive at UCF?"
# Encode the prompt
encoded_prompt = preprocessor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
# Transform the encoded representation to the device the model was trained on
encoded_prompt = encoded_prompt.to(trainer.model.device)

# Generating a prediction
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    # Maximum length of predicted sequence
    max_length=MAX_LENGTH,
    min_length=1,
    # Temperature determines how creative/random or strict the model should be
    temperature=.9,
    # Determines the probability sum of tokens that should be kept
    top_p=.95,
    do_sample=True,
    # Predict 3 sequences
    num_return_sequences=3,
    pad_token_id=preprocessor.tokenizer.pad_token_id,
)

generated_sequences = []

# Decoding the predicted sequences
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = preprocessor.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
    generated_sequences.append(text.strip().split('?')[1])


In [20]:
generated_sequences

[' Electronic transcript processing takes approximately 5-10 business days for most UCF institutions. It takes approximately 5-10 business days to process an electronically submitted transcript.',
 ' Electronic transcripts typically arrive at the College of Graduate Studies within five to ten business days from the date of receipt of your application. If your transcript arrives at the College of Graduate Studies within five business days after receipt of your application, it may be delayed by up to 10 business days. If your transcript arrives at the College of Graduate Studies within seven business days after receipt of your application, it may be delayed by up to 10 business days.',
 ' Electronic transcripts usually arrive at the Office of Undergraduate Admissions at approximately five to ten business days after being sent from the institution of origin. If you receive an electronic transcript within five business days of being sent from the institution of origin, it may take up to fi

## Running our chatbot

In [21]:
while True:
    prompt = input("User > ")
    if prompt:
        if prompt == "bye":
            break
        encoded_prompt = preprocessor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
        encoded_prompt = encoded_prompt.to(trainer.model.device)

        # prediction
        output_sequences = trainer.model.generate(
            input_ids=encoded_prompt,
            max_length=MAX_LENGTH,
            min_length=1,
            temperature=.9,
            top_p=.95,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=preprocessor.tokenizer.pad_token_id,
        )

        generated_sequences = []

        # decode prediction
        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            generated_sequence = generated_sequence.tolist()
            text = preprocessor.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
            generated_sequences.append(text.strip().split("?")[1])
        print("Assistant > ", generated_sequences[0])

User >  When should I apply to UCF?


Assistant >   If you are applying to UCF, you should notify your Undergraduate Admissions Committee of any changes in your application, including an updated application, a revised transcript, and/or alternate transcript. Questions about transferring may be directed to the dean’s Undergraduate Admissions Committee at: http://ucf.edu/program/program-terms/.


User >  Does UCF have on-campus housing?


Assistant >   Campsites and housing are on campus and located in four geographic areas: East, North, South, Central and Western. Campus housing can also be located in the Central, Central and Western areas, while the Campus Center is located in central, Central and Western.


User >  What is the admission rate of UCF?


Assistant >   The U.S. Department of Higher Education (USD) requires a minimum admission fee of $12.75 for every class of admission, which includes all classes of students admitted to UCF during the same term (including graduation and transfer). If your class of admission meets the minimum admission criteria, you will receive a $8.75 fee. The minimum application fee is $50. After your application has been processed, a $20 fee is applied to your credit report. If you are applying for another term, the maximum application fee is $100. The minimum application fee is due at the close of term if approved. If you do not receive an acknowledgement from the administration within 15 business days of receipt of receipt of payment, you are not eligible for higher scholarships or fellowships. If you choose to receive a scholarship, the award must be sent or credited to your account within 15 business days after receipt of payment. Transfer students must submit an application electronically to UCF.

User >  bye
