In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

W0901 17:04:45.527000 94866 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


# Loading HF dataset

In [2]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
example_indices = [90, 270]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: What's wrong with you, Mr. Polly?
#Person2#: What's wrong? I want a break from this horrible job.
#Person1#: Then, buy a bottle of soft drink.
#Person2#: Would you like to buy a bottle for me in the shop?
#Person1#: It's a problem, because my boss is in that shop now.
#Person2#: Ok, I will go there myself.
#Person1#: Sorry, Mr. Polly.
#Person2#: It doesn't matter. Oh, God, I have only four dollars in my wallet. Is that possible for me to buy one?
#Person1#: Have a try.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Mr. Polly is tired and wants a break from work. #Person1# cannot buy a bottle of soft drink for him.
---------------------------------------------------------------

# Loading base hf model

In [4]:
model_name='google/flan-t5-small'

original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float32, 
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


# Summarization with base hf model

In [5]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

device = torch.device("cpu")
original_model = original_model.to(device)

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

# Tokenizing the loaded dataset

In [6]:
# preprocess the prompt-response dataset into tokens and pull out their input_ids (1 per token).
def tokenize_function(example):
    
    start_prompt = 'Summarize the following article.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for story in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

#To save some time in the lab, you will subsample the dataset
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 10 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (1246, 2)
Test: (150, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1246
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 150
    })
})


# Full Fine-Tuning of the Original Model

In [7]:
# Fully fine-tuning the entire original_model
fully_output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=fully_output_dir,
    learning_rate=1e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=20,
    save_strategy='epoch'
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)
trainer.train() #this will throw out of memory error due to memory constraints

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,57.3701
2,53.5271
3,50.5227
4,51.1259
5,48.7397
6,47.2531
7,47.0078
8,46.2312
9,44.2352
10,43.8756


TrainOutput(global_step=20, training_loss=45.55965805053711, metrics={'train_runtime': 221.5144, 'train_samples_per_second': 0.722, 'train_steps_per_second': 0.09, 'total_flos': 29742480752640.0, 'train_loss': 45.55965805053711, 'epoch': 0.1282051282051282})

# LoRA Fine-Tuning of the Original Model

In [8]:
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig

# Define a LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],  # for T5, "q" and "v" usually work well
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap your base model with LoRA
model_with_lora = get_peft_model(original_model, lora_config)
model_with_lora.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


# LoRA params setting

| Parameter                             | Description                                                                                                                                                                                                                                                                                                          |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **`r=8`**                             | **Rank of the LoRA matrix.** This determines the size of the low-rank adaptation. A smaller `r` means fewer trainable parameters. Higher values can give better performance but increase memory use.                                                                                                                 |
| **`lora_alpha=32`**                   | **Scaling factor** for the LoRA weights. The actual adaptation is scaled by `lora_alpha / r`. This controls how much the LoRA weights affect the model.                                                                                                                                                              |
| **`target_modules=["q", "v"]`**       | **Names of submodules in the base model to which LoRA will be applied.** <br>For T5 (or similar Transformer models), `"q"` = query projection, `"v"` = value projection in attention. These are often most effective for adaptation.                                                                                 |
| **`lora_dropout=0.1`**                | **Dropout rate applied to LoRA layers during training.** Helps prevent overfitting. This dropout is **only applied to the LoRA weights**, not the full model.                                                                                                                                                        |
| **`bias="none"`**                     | Controls how biases are handled. Options: <br>– `"none"` (no bias adaptation) <br>– `"all"` (adapt all biases) <br>– `"lora_only"` (adapt only biases in LoRA layers). `"none"` is often sufficient.                                                                                                                 |
| **`task_type=TaskType.SEQ_2_SEQ_LM`** | Specifies the **type of task**, which tells PEFT how to apply LoRA properly. Options include: <br>– `SEQ_CLS` (sequence classification) <br>– `SEQ_2_SEQ_LM` (e.g., T5, BART) <br>– `CAUSAL_LM` (e.g., GPT) <br>– `TOKEN_CLS`, `QUESTION_ANS`, etc. <br>This ensures LoRA adapts the right components for your task. |


| Parameter            | Typical Range / Advice                                                                                    | When to Increase                                                      | When to Decrease                                                  |
| -------------------- | --------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- | ----------------------------------------------------------------- |
| **`r`** (rank)       | `4 – 16` (default: 8) <br> `8` is a good balance for most T5-based models.                                | If your model is large and your dataset is complex.                   | If you're memory-constrained or dataset is small.                 |
| **`lora_alpha`**     | `r × 4` (default: 32 when `r=8`) <br> Rule of thumb: set it proportional to `r`.                          | If the model underfits (too conservative changes).                    | If you observe overfitting or instability.                        |
| **`target_modules`** | For **T5**, use `["q", "v"]` (query, value). <br> Optional: include `"k"` and `"o"` for more flexibility. | If LoRA impact is too small. Try `["q", "k", "v", "o"]`.              | Keep minimal (`["q", "v"]`) to save memory and avoid overfitting. |
| **`lora_dropout`**   | `0.0 – 0.3` <br> Default: `0.1`. Use small dropout for stable training.                                   | If dataset is noisy or very small (to prevent overfitting).           | If underfitting on clean and large datasets.                      |
| **`bias`**           | `"none"` (most common) <br> `"lora_only"` if you suspect bias is important.                               | Rarely needed. Try `"all"` if performance is poor.                    | Stick with `"none"` to avoid unnecessary complexity.              |
| **`task_type`**      | Depends on model type: <br>Use `SEQ_2_SEQ_LM` for T5, BART; <br>`CAUSAL_LM` for GPT.                      | ❌ Never change this arbitrarily — must match your model architecture. | N/A                                                               |


In [9]:
peft_output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=peft_output_dir,
    auto_find_batch_size=True,
    save_strategy='epoch',
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=25,
    logging_steps=1,
    max_steps=20    
)
    
peft_trainer = Trainer(
    model=model_with_lora,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets['validation']
)

peft_trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,41.6174
2,39.1059
3,38.125
4,36.6704
5,34.4656
6,33.4901
7,32.3168
8,31.4906
9,30.3273
10,28.5352


TrainOutput(global_step=20, training_loss=29.77527208328247, metrics={'train_runtime': 229.1088, 'train_samples_per_second': 0.698, 'train_steps_per_second': 0.087, 'total_flos': 29911595089920.0, 'train_loss': 29.77527208328247, 'epoch': 0.1282051282051282})

# Comparision between Original vs Full Fine-Tuning vs LoRA Fine-Tuning model

In [10]:
index = 0
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""
###################
# input Processing 
###################
input_ids = tokenizer(prompt, return_tensors='pt')["input_ids"]

###############################
# Original Model summarization
###############################
device = torch.device("cpu")
original_model = original_model.to(device)

original_model_outputs = original_model.generate(
    input_ids, 
    max_new_tokens=200,
)
original_model_text_output = tokenizer.decode(
    original_model_outputs[0], 
    skip_special_tokens=True
)

#######################################
# (Fully) Instruct Model summarization
#######################################
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"./{fully_output_dir}/checkpoint-20",
    torch_dtype=torch.float32,
    local_files_only=True
)

instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids, 
    max_new_tokens=200,
)
instruct_model_text_output = tokenizer.decode(
    instruct_model_outputs[0], 
    skip_special_tokens=True
)

#######################################
# (LoRA) Instruct Model summarization
#######################################
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float32, 
    device_map='auto'
)
device = torch.device("cpu")
base_model = base_model.to(device)

peft_model = PeftModel.from_pretrained(
    base_model,
    f'./{peft_output_dir}/checkpoint-20', 
    torch_dtype=torch.float32,
    is_trainable=False
)
peft_model_outputs = peft_model.generate(
    input_ids=input_ids, 
    max_new_tokens=200,
)
peft_model_text_output = tokenizer.decode(
    peft_model_outputs[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: \n{peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Employees should be allowed to use instant messaging and instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. Employees should be allowed to use instant messaging to communicate with clients. 

In [11]:
rouge = evaluate.load('rouge')
dialogues = dataset['test'][0:20]['dialogue']
human_baseline_summaries = dataset['test'][0:20]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Original model
    original_model_outputs = original_model.generate(
        input_ids, 
        max_new_tokens=200,
    )
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    # Fully Instruct model
    instruct_model_outputs = instruct_model.generate(
        input_ids=input_ids, 
        max_new_tokens=200,
    )
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

    # PEFT model
    peft_model_outputs = peft_model.generate(
        input_ids=input_ids, 
        max_new_tokens=200,
    )
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)
    
# Combine summaries
zipped_summaries = list(zip(
    human_baseline_summaries,
    original_model_summaries,
    instruct_model_summaries,
    peft_model_summaries
))

df = pd.DataFrame(
    zipped_summaries,
    columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries']
)
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,I'm apprehensive about the new policy.,Is this the first time you have a dictation fo...,I am going to send you a memo to all employees.
1,In order to prevent employees from wasting tim...,Taking a dictation for the department heads.,Is this the first time you have a dictation fo...,I am going to send you a memo to all employees.
2,Ms. Dawson takes a dictation for #Person1# abo...,"Ms. Dawson, I need you to take a dictation for...",Is this the first time you have a dictation fo...,I am going to send you a memo to all employees.
3,#Person2# arrives late because of traffic jam....,Taking the subway would be a good idea for me.,Talk to your boss.,Talk to your boss.
4,#Person2# decides to follow #Person1#'s sugges...,I'm a little worried about the traffic jam.,Talk to your boss.,Talk to your boss.
5,#Person2# complains to #Person1# about the tra...,A lot of people are driving to work.,Talk to your boss.,Talk to your boss.
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,"Kate, you know, I'm not sure.",Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,I'm not sure if they're going to divorce the n...,"Kate, you know, I'm not sure.",Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are divorced.,"Kate, you know, I'm not sure.",Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"Happy Birthday, Brian.","Brian, how are you?","Brian, how are you?"


In [12]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print("ORIGINAL MODEL:")
print(original_model_results)

print("INSTRUCT MODEL:")
print(instruct_model_results)

print("PEFT MODEL:")
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.16791136583794442), 'rouge2': np.float64(0.04915458937198068), 'rougeL': np.float64(0.14176375516917944), 'rougeLsum': np.float64(0.14187898607250704)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.12371533120463277), 'rouge2': np.float64(0.014646464646464647), 'rougeL': np.float64(0.1051203074619996), 'rougeLsum': np.float64(0.10536452327395669)}
PEFT MODEL:
{'rouge1': np.float64(0.16548854096663573), 'rouge2': np.float64(0.05218840579710145), 'rougeL': np.float64(0.15946482954824476), 'rougeLsum': np.float64(0.15938968175376172)}


# Vector Database

In [13]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example texts
texts = [
    "I love machine learning.",
    "FAISS is great for vector similarity search.",
    "Let's store text embeddings in a database.",
    "Python is a versatile language.",
]

# Convert to vectors
embeddings = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
import faiss
import numpy as np

dimension = embeddings.shape[1]  # e.g., 384
index = faiss.IndexFlatL2(dimension)  # L2 = Euclidean distance

# Add vectors to index
index.add(np.array(embeddings))

In [15]:
query = "How do I find similar texts?"
query_vector = model.encode([query])

# Search for top 3 similar texts
distances, indices = index.search(query_vector, k=3)

print("Query:", query)
print("\nTop matches:")
for i, idx in enumerate(indices[0]):
    print(f"{i+1}. {texts[idx]} (Distance: {distances[0][i]:.4f})")

Query: How do I find similar texts?

Top matches:
1. FAISS is great for vector similarity search. (Distance: 0.9444)
2. Let's store text embeddings in a database. (Distance: 1.2533)
3. I love machine learning. (Distance: 1.7097)


In [16]:
query = "How to embedded data?"
query_vector = model.encode([query])

# Search for top 3 similar texts
distances, indices = index.search(query_vector, k=3)

print("Query:", query)
print("\nTop matches:")
for i, idx in enumerate(indices[0]):
    print(f"{i+1}. {texts[idx]} (Distance: {distances[0][i]:.4f})")

Query: How to embedded data?

Top matches:
1. Let's store text embeddings in a database. (Distance: 0.9141)
2. I love machine learning. (Distance: 1.7842)
3. FAISS is great for vector similarity search. (Distance: 1.8112)


In [17]:
texts

['I love machine learning.',
 'FAISS is great for vector similarity search.',
 "Let's store text embeddings in a database.",
 'Python is a versatile language.']

In [19]:
# Save index
faiss.write_index(index, "vector_and_bm25_dbs/text_index.faiss")

# Later load it
index = faiss.read_index("vector_and_bm25_dbs/text_index.faiss")