# LLM Project To Build and Fine Tune a Large Language Model

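The project uses Retrieval Augmented Generation (RAG) with OpenAI to build a knowledge-based online shopping chatbot. RAG reduces hallucinations and delivers reliable answers by integrating external information. This notebook covers the fine-tuning part of the project: it adapts the `google/flan-t5-base` model to dialogue summarization on the `knkarthick/dialogsum` dataset, first with full fine-tuning and then with parameter-efficient fine-tuning (PEFT/LoRA), and compares the resulting models qualitatively and with ROUGE scores.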


## Package Requirements


In [None]:
# Suppress all warning messages
import warnings

warnings.filterwarnings("ignore")

# Import PyTorch for deep learning, Evaluate for model evaluation metrics, and Time for timestamping output directories.
import torch
import evaluate
import time

# Import Pandas for data manipulation.
import pandas as pd

# Import load_dataset from Datasets library for loading and managing datasets.
from datasets import load_dataset

# Import various classes from Transformers library for NLP model loading, tokenization, generation configuration, and training.
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig,
    TrainingArguments,
    Trainer,
)

# Import PEFT (Parameter-Efficient Fine-Tuning) classes for advanced model configuration and adaptation.
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

## Model Fine Tuning

### Load Dataset and LLM

In [None]:
# Run every PyTorch operation in this notebook on the CPU
DEVICE = "cpu"
torch_device = torch.device(DEVICE)

# Fetch the dialogue/summary dataset by its Hugging Face identifier
hugging_face_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(hugging_face_dataset_name)

# Fetch the FLAN-T5 base checkpoint: the tokenizer first, then the
# sequence-to-sequence model, which is placed on the selected device
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
original_model = original_model.to(torch_device)

def number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where every parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string holding the trainable parameter count, the total
        parameter count, and the trainable share expressed as a percentage.
        The percentage is reported as ``0.0`` when the model has no
        parameters at all.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a model without parameters would otherwise raise
    # ZeroDivisionError instead of producing a report.
    if all_model_params:
        percentage = (trainable_model_params / all_model_params) * 100
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"Percentage of model params: {percentage}"
    )

# Report the trainable parameter count, the total parameter count, and the
# trainable percentage for the freshly loaded base model.
param_report = number_of_trainable_model_parameters(original_model)
print(param_report)

### Test the Model With Zero Shot Inferencing

In [None]:
# Pick one example from the test split to summarize
index = 200
test_example = dataset["test"][index]
dialogue = test_example["dialogue"]
summary = test_example["summary"]

# Wrap the dialogue in a plain summarization instruction (no worked examples)
prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# Encode the prompt as PyTorch tensors
inputs = tokenizer(prompt, return_tensors="pt")

# Let the base model generate up to 200 new tokens, then decode the first
# (and only) returned sequence without special tokens
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Show prompt, human reference, and model output, each preceded by a
# 99-character dashed separator
dash_line = "-" * 99
for section in (
    f"Input Prompt:\n{prompt}",
    f"Baseline Human Summary:\n{summary}\n",
    f"Model Generation - Zero Shot: \n{output}",
):
    print(dash_line)
    print(section)

### Perform Full Fine-Tuning

#### Preprocess the Dialog-Summary Dataset


In [None]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

    Each dialogue is wrapped in a summarization instruction and tokenized
    into ``input_ids``; each summary is tokenized into ``labels``. Both are
    padded to the tokenizer's maximum length and truncated if longer.

    Args:
        example: A batch mapping where ``example["dialogue"]`` and
            ``example["summary"]`` are sequences of strings (the function is
            applied with ``batched=True``).

    Returns:
        The same mapping with ``"input_ids"`` and ``"labels"`` added.
    """
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompts = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    example["input_ids"] = tokenizer(
        prompts, padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids

    labels = tokenizer(
        example["summary"], padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids
    # Padding positions in the targets must not contribute to the training
    # loss. -100 is the label value the Hugging Face seq2seq loss ignores, so
    # replace every pad token id in the labels with it.
    labels[labels == tokenizer.pad_token_id] = -100
    example["labels"] = labels
    return example

# Run tokenize_function over the dataset in batches, then drop the raw
# text/metadata columns that are no longer needed after tokenization.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ["id", "topic", "dialogue", "summary"]
)

In [None]:
# Subsample the tokenized dataset: keep only rows whose position is a
# multiple of 100.
def _keep_every_hundredth(example, index):
    return index % 100 == 0


tokenized_datasets = tokenized_datasets.filter(
    _keep_every_hundredth, with_indices=True
)

In [None]:
# Report the shape of each split after subsampling
print("Shapes of the datasets:")
for split_label, split_name in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{split_label}: {tokenized_datasets[split_name].shape}")

# Show the full structure of the tokenized dataset
print(tokenized_datasets)

#### Fine-Tune the Model With the Preprocessed Dataset


In [None]:
# Timestamped directory so each run writes its checkpoints and logs to a
# fresh location
output_dir = f"../models/dialogue-summary-training-{int(time.time())}"

# Training configuration. max_steps=1 limits this run to a single training
# step (per the Transformers docs it takes precedence over num_train_epochs).
# NOTE(review): use_cpu/bf16 are active here, not commented out; the run is
# CPU-only with bf16 enabled.
training_args = TrainingArguments(
    output_dir=output_dir,
    use_cpu=True,
    bf16=True,
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

# NOTE(review): the Trainer receives `original_model` itself, so the training
# step presumably updates that object's weights; later cells still use it as
# the "original" baseline. Confirm this is intended.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Run the training loop
trainer.train()

In [None]:
# Load the fully fine-tuned ("instruct") checkpoint stored under
# ../models/full/ and place it on the selected device
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("../models/full/")
instruct_model = instruct_model.to(torch_device)

# Make sure the baseline model lives on the same device
original_model = original_model.to(torch_device)

### Evaluate the Model Qualitatively

In [None]:
# Pick one example from the test split to compare the two models on
index = 200
test_example = dataset["test"][index]
dialogue = test_example["dialogue"]
human_baseline_summary = test_example["summary"]

# Wrap the dialogue in the summarization instruction
prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# Encode the prompt as PyTorch tensors and keep only the token ids
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids

# Baseline model: single-beam generation, up to 200 new tokens
original_outputs = original_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
original_text_output = tokenizer.decode(original_outputs[0], skip_special_tokens=True)

# Fine-tuned model: identical generation settings
instruct_outputs = instruct_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
instruct_text_output = tokenizer.decode(instruct_outputs[0], skip_special_tokens=True)

# Show prompt, human reference, and both generations, each preceded by a
# 99-character dashed separator
dash_line = "-" * 99
for section in (
    f"Input Prompt:\n{prompt}",
    f"Baseline Human Summary:\n{human_baseline_summary}\n",
    f"Original Model Generation - Zero Shot: \n{original_text_output}",
    f"Instruct Model Generation - Fine Tune: \n{instruct_text_output}",
):
    print(dash_line)
    print(section)

### Evaluate the Model Quantitatively (With Rouge Metric)

In [None]:
# Load the ROUGE metric through the Evaluate library; the resulting `rouge`
# object is reused by the scoring cells further down.
rouge = evaluate.load("rouge")

In [None]:
# Take the first ten test dialogues and their human-written summaries
dialogue = dataset["test"][0:10]["dialogue"]
human_baseline_summaries = dataset["test"][0:10]["summary"]

# Collect one generated summary per dialogue for each model
original_model_summaries = []
instruct_model_summaries = []

# The loop variable gets its own name (`dialogue_text`) so it no longer
# rebinds `dialogue`, the list being iterated.
for dialogue_text in dialogue:
    # Wrap the dialogue in the summarization instruction
    prompt = f"""
Summarize the following conversation

{dialogue_text}

Summary:
    """
    # Encode the prompt as PyTorch tensors and keep only the token ids
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Summary from the original model (up to 200 new tokens)
    original_outputs = original_model.generate(
        input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200)
    )
    original_text_output = tokenizer.decode(
        original_outputs[0], skip_special_tokens=True
    )
    original_model_summaries.append(original_text_output)

    # Summary from the fine-tuned model with the same settings
    instruct_outputs = instruct_model.generate(
        input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200)
    )
    instruct_text_output = tokenizer.decode(
        instruct_outputs[0], skip_special_tokens=True
    )
    instruct_model_summaries.append(instruct_text_output)

# Line the three summaries up side by side, one row per dialogue
zipped_summaries = list(
    zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries)
)
df = pd.DataFrame(zipped_summaries, columns=["human", "original", "instruct"])

# Display the dataframe.
df

In [None]:
# Score the original model's summaries against the human-written references
original_model_results = rouge.compute(
    references=human_baseline_summaries,  # Human baseline summaries
    predictions=original_model_summaries,  # Summaries generated by the original model
    use_stemmer=True,  # Stem words before matching
    use_aggregator=True,  # Return scores aggregated over all examples
)

In [None]:
# Use exactly as many human references as there are instruct-model summaries
instruct_references = human_baseline_summaries[: len(instruct_model_summaries)]

# Score the fine-tuned model's summaries against those references
instruct_model_results = rouge.compute(
    references=instruct_references,
    predictions=instruct_model_summaries,
    use_stemmer=True,  # Stem words before matching
    use_aggregator=True,  # Return scores aggregated over all examples
)

# Show the ROUGE results of both models, original first
print(f"Original Model: \n{original_model_results}")
print(f"Instruct Model: \n{instruct_model_results}")

## Parameter Efficient Fine-Tuning With LoRA



### Set Up the PEFT/LoRA Model for Fine-Tuning

In [None]:
# LoRA adapter configuration for a sequence-to-sequence language model
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence LM task
    target_modules=["q", "v"],  # Names of the modules that receive LoRA adapters
    r=32,  # Rank of the LoRA update matrices
    lora_alpha=32,  # LoRA scaling parameter
    lora_dropout=0.05,  # Dropout probability applied in the LoRA layers
    bias="none",  # Bias setting passed to LoRA
)

In [None]:
# Wrap the base model with the LoRA configuration to obtain the PEFT model.
# NOTE(review): the PEFT docs state that get_peft_model modifies the base
# model in place, so `original_model` presumably carries the LoRA layers
# after this call. Confirm before using it as an untouched baseline.
peft_model = get_peft_model(model=original_model, peft_config=lora_config)

# Report how many of the PEFT model's parameters are trainable
peft_param_report = number_of_trainable_model_parameters(peft_model)
print(peft_param_report)

In [None]:
# Timestamped directory so each PEFT run writes its checkpoints and logs to a
# fresh location
output_dir = f"../models/peft-dialogue-summary-training-{int(time.time())}"

# Training configuration. max_steps=1 limits this run to a single training
# step (per the Transformers docs it takes precedence over num_train_epochs).
# NOTE(review): use_cpu/bf16 are active here, not commented out; the run is
# CPU-only with bf16 enabled.
training_args = TrainingArguments(
    output_dir=output_dir,  # Where checkpoints and logs are written
    use_cpu=True,
    bf16=True,
    auto_find_batch_size=True,  # Let the Trainer search for a workable batch size
    learning_rate=1e-3,  # Learning rate for training
    weight_decay=0.01,  # Weight decay for regularization
    num_train_epochs=100,  # Number of training epochs
    max_steps=1,  # Maximum number of training steps
    logging_steps=1,  # Log training metrics every step
)

# Trainer for the PEFT model, using the subsampled tokenized splits
peft_trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Run the training loop
peft_trainer.train()

In [None]:
# Load a fresh copy of the base checkpoint to attach the saved adapter to
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Reload the tokenizer for the base model (fast implementation)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Attach the PEFT checkpoint stored under ../models/peft/ to the base model
# and place the result on the selected device
peft_model = PeftModel.from_pretrained(peft_model_base, "../models/peft/")
peft_model = peft_model.to(torch_device)

# Make sure the baseline model lives on the same device
original_model = original_model.to(torch_device)

# Pick one example from the test split to compare the two models on
index = 200
test_example = dataset["test"][index]
dialogue = test_example["dialogue"]
human_baseline_summary = test_example["summary"]

# Wrap the dialogue in the summarization instruction
prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# Encode the prompt as PyTorch tensors and keep only the token ids
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids

# Baseline model: single-beam generation, up to 200 new tokens
original_outputs = original_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
original_text_output = tokenizer.decode(original_outputs[0], skip_special_tokens=True)

# PEFT model: identical generation settings
peft_outputs = peft_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
peft_text_output = tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

# Show prompt, human reference, and both generations, each preceded by a
# 99-character dashed separator
dash_line = "-" * 99
for section in (
    f"Input Prompt:\n{prompt}",
    f"Baseline Human Summary:\n{human_baseline_summary}\n",
    f"Original Model Generation - Zero Shot: \n{original_text_output}",
    f"PEFT Model Generation - Zero Shot: \n{peft_text_output}",
):
    print(dash_line)
    print(section)

In [None]:

# Take the first ten test dialogues and their human-written summaries (the
# same slice the earlier ROUGE evaluation cell used)
dialogue = dataset["test"][0:10]["dialogue"]
human_baseline_summaries = dataset["test"][0:10]["summary"]

# `original_model_summaries` is intentionally NOT reset here. This cell only
# generates PEFT summaries, so resetting the list to [] left it empty and made
# the zip() below produce a DataFrame with zero rows. The summaries generated
# by the earlier quantitative-evaluation cell (same ten dialogues) are reused.
peft_model_summaries = []

# Generate one PEFT summary per dialogue
for dialogue_text in dialogue:
    # Wrap the dialogue in the summarization instruction
    prompt = f"""
Summarize the following conversation

{dialogue_text}

Summary:
    """
    # Encode the prompt as PyTorch tensors and keep only the token ids
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Summary from the PEFT fine-tuned model (up to 200 new tokens)
    peft_outputs = peft_model.generate(
        input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200)
    )
    peft_text_output = tokenizer.decode(peft_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_text_output)

# Line the human, original-model, and PEFT summaries up side by side
zipped_summaries = list(
    zip(human_baseline_summaries, original_model_summaries, peft_model_summaries)
)
df = pd.DataFrame(zipped_summaries, columns=["human", "original", "peft"])

# Score the PEFT summaries against as many human references as were generated
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0 : len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# Print the ROUGE scores of all three models; the original and instruct
# results come from the earlier evaluation cells
print(f"Original Model Results: \n{original_model_results}")
print(f"Instruct Model Results: \n{instruct_model_results}")
print(f"PEFT Model Results: \n{peft_model_results}")