In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install rouge_score
!pip install loralib
!pip install peft

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, Trainer, TrainingArguments
import torch
import time
from tqdm import tqdm
import evaluate
import pandas as pd
import numpy as np

In [3]:
#--load dataset
dataset_name="knkarthick/dialogsum"
dataset = load_dataset(dataset_name)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
#--load model
model_name="google/flan-t5-base"

original_model=AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.bfloat16)
tokenizer=AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [5]:
#--get trainable parameters
def print_model_trainable_parameters(model):

  trainable_model_params=0
  all_model_params=0
  for _,param in model.named_parameters():
    all_model_params+=param.numel()
    if param.requires_grad:
      trainable_model_params+=param.numel()
  print(f"trainable model parameters: {trainable_model_params}")
  print(f"all model parameters: {all_model_params}")
  print(f"percentage of trainable parameters: {100*trainable_model_params/all_model_params}")

In [6]:
print_model_trainable_parameters(original_model)

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable parameters: 100.0


## Test the model with zero shot inference

In [8]:
index=10

dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation

{dialogue}

Summary:
"""

inputs=tokenizer(prompt,return_tensors="pt")
inputs

{'input_ids': tensor([[12198,  1635,  1737,     8,   826,  3634,  1713,   345, 13515,   536,
          4663,    10,  5574, 13753,     6,    48,    19,    21,    25,     6,
          7798,     5,  1713,   345, 13515,   357,  4663,    10,    27,    31,
            51,    78,  1095,    25,  1423,     6,   754,   369,    16,    11,
           777,     8,  1088,     5,  6656,    31,     7,   270,     6,    27,
            31,    51,   417,    25,    43,     3,     9,   207,    97,     5,
          1713,   345, 13515,   536,  4663,    10,  7798,     6,   164,    27,
            43,     3,     9,  5565,    12,    43,     3,     9,  2595,    28,
            25,    58,  1713,   345, 13515,   357,  4663,    10,  8872,     5,
          1713,   345, 13515,   536,  4663,    10,   100,    19,   310,  1627,
          1088,     5,  1713,   345, 13515,   357,  4663,    10,  2163,     6,
            25,    33,   373,  1012,    28,   921,     5,    11,    25,   320,
           182,  1134,   469,     5,  

In [9]:
# original_model.generate(
#     inputs['input_ids'],
#     max_length=200)[0]

In [10]:
output=tokenizer.decode(original_model.generate(
    inputs['input_ids'],
    max_length=200)[0],
                        skip_special_tokens=True
)

dash_line="-".join("" for x in range(100))

print(dash_line)
print(f'input prompt: \n{prompt}')
print(dash_line)
print(f'input human summary: \n{summary}')
print(dash_line)
print(f'model summary: \n{output}')

---------------------------------------------------------------------------------------------------
input prompt: 

Summarize the following conversation

#Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday

Summary:

---------------------------------------------------------------------------------------------------
input human summary: 
#Person1# attends Brian's birthday party. Bri

## Perform Full fine-tuning

In [7]:
#--preprocess text tokenization

def tokenize_function(example):

  start_prompt="Summarize the following conversation.\n\n"
  end_prompt="\n\nSummary :"
  prompt=[start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]

  example['input_ids']=tokenizer(prompt,padding='max_length',truncation=True,return_tensors='pt').input_ids
  example['labels']=tokenizer(example['summary'],padding='max_length',truncation=True,return_tensors='pt').input_ids


  return example

In [8]:
tokenize_datasets=dataset.map(tokenize_function,batched=True)
tokenize_datasets=tokenize_datasets.remove_columns(['id','topic','dialogue','summary',])
tokenize_datasets

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [9]:
tokenize_dataset=tokenize_datasets.filter(lambda example, index: index%100==0,with_indices=True)
tokenize_dataset

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})

In [10]:
print(f'shapes of datasets')
print(f'train: {tokenize_dataset["train"].shape}')
print(f'validation: {tokenize_dataset["validation"].shape}')
print(f'test: {tokenize_dataset["test"].shape}')

shapes of datasets
train: (125, 2)
validation: (5, 2)
test: (15, 2)


In [16]:
#-fine tuning the model with preprocessed dataset
output_dir=f'./dialogue_summary_full_fine_tuning_training-{str(int(time.time()))}'

# Training arguments with additional save parameters
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    save_steps=10,  # Save every 10 steps (or adjust as needed)
    save_total_limit=2,  # Only keep the last 2 checkpoints (or adjust as needed)
    max_steps=20  # Assuming you only want to train for 1 step for testing purposes
)

trainer=Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenize_dataset['train'],
    eval_dataset=tokenize_dataset['validation']
)

max_steps is given, it will override any value given in num_train_epochs


In [17]:
trainer.train()

Step,Training Loss
1,48.25
2,50.0
3,48.5
4,48.5
5,47.75
6,48.25
7,48.75
8,48.0
9,48.0
10,48.0


TrainOutput(global_step=20, training_loss=48.25, metrics={'train_runtime': 73.452, 'train_samples_per_second': 2.178, 'train_steps_per_second': 0.272, 'total_flos': 107506904334336.0, 'train_loss': 48.25, 'epoch': 1.25})

In [18]:
#--load full fine tuned model and evaluate
full_fine_model_path="/content/dialogue_summary_full_fine_tuning_training-1716531917/checkpoint-20"

full_fine_tuned_model=AutoModelForSeq2SeqLM.from_pretrained(full_fine_model_path)


## Test Full fine tuned model Qualitatively and compare with base model

In [19]:
index=200

dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation

{dialogue}

Summary:
"""

inputs=tokenizer(prompt,return_tensors="pt")

In [20]:
inputs['input_ids'].to('cuda').device,original_model.device,full_fine_tuned_model.to('cuda').device

(device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0))

In [21]:
base_model_output=tokenizer.decode(original_model.generate(
    inputs['input_ids'].to('cuda'),
    max_length=200)[0],
                        skip_special_tokens=True
)

full_fine_tuned_model_output=tokenizer.decode(full_fine_tuned_model.generate(
    inputs['input_ids'].to('cuda'),
    max_length=200)[0],
                        skip_special_tokens=True
)


dash_line="-".join("" for x in range(100))

# print(dash_line)
# print(f'input prompt: \n{prompt}')
print(dash_line)
print(f'input human summary: \n{summary}')
print(dash_line)
print(f'base model summary: \n{base_model_output}')
print(dash_line)
print(f'full fine tuned model summary: \n{full_fine_tuned_model_output}')

---------------------------------------------------------------------------------------------------
input human summary: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
base model summary: 
You might want to upgrade your system.
---------------------------------------------------------------------------------------------------
full fine tuned model summary: 
#Person1#: I'm thinking of upgrading my computer.


## Quantitative Evaluation

In [12]:
rouge=evaluate.load('rouge')

In [23]:
dialogues=dataset['test'][0:10]['dialogue']
human_baseline_summaries=dataset['test'][0:10]['summary']

base_model_summaries=[]
full_fine_tuned_model_summaries=[]

for _,dialogue in enumerate(dialogues):

    prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:  """

    input_ids=tokenizer(prompt,return_tensors='pt').input_ids.to('cuda')

    original_model_outputs=original_model.generate(input_ids,
                                                   max_length=200)
    original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)
    base_model_summaries.append(original_model_text_output)


    full_fine_tuned_model_outputs=full_fine_tuned_model.generate(input_ids,
                                                   max_length=200)
    full_fine_tuned_model_text_output=tokenizer.decode(full_fine_tuned_model_outputs[0],skip_special_tokens=True)
    full_fine_tuned_model_summaries.append(full_fine_tuned_model_text_output)


zipped_summaries=list(zip(human_baseline_summaries,base_model_summaries,full_fine_tuned_model_summaries))

df=pd.DataFrame(zipped_summaries,columns=['human_baseline_summaries','base_model_summaries','full_fine_tuned_model_summaries'])

df

Unnamed: 0,human_baseline_summaries,base_model_summaries,full_fine_tuned_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I want to make sure that all employ...,The memo is to be distributed to all employees...
1,In order to prevent employees from wasting tim...,The memo will go to all employees by 4 pm.,The memo is to be distributed to all employees...
2,Ms. Dawson takes a dictation for #Person1# abo...,This memo should go as an intra-office memo to...,The memo is to be distributed to all employees...
3,#Person2# arrives late because of traffic jam....,#Person1: I'm so glad to hear that you finally...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic in this city is getting worse.,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,People are talking about their commute to work.,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"#Person1: Thanks Brian, I'm glad you like it. ...",Brian's birthday is coming up.


In [24]:
original_model_results=rouge.compute(
    predictions=base_model_summaries,
    references=human_baseline_summaries[0:len(base_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

full_fine_tuned_model_results=rouge.compute(
    predictions=full_fine_tuned_model_summaries,
    references=human_baseline_summaries[0:len(full_fine_tuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

print("original base model results :")
print(original_model_results)

print("full fine tuned model results :")
print(full_fine_tuned_model_results)

original base model results :
{'rouge1': 0.2581157109443554, 'rouge2': 0.09142028985507247, 'rougeL': 0.2046998512522809, 'rougeLsum': 0.2090927960927961}
full fine tuned model results :
{'rouge1': 0.26109650997150996, 'rouge2': 0.11055072463768116, 'rougeL': 0.2302777777777778, 'rougeLsum': 0.2339245014245014}


In [32]:
del original_model

In [34]:
del full_fine_tuned_model

## Perform PEFT

In [13]:
#--set peft parameters
from peft import get_peft_model, LoraConfig, TaskType

lora_config=LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model=get_peft_model(original_model,lora_config)

print(print_model_trainable_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable parameters: 1.4092820552029972
None


In [14]:
#--train peft model
peft_output_dir=f'./dialogue_summary_peft_training-{str(int(time.time()))}'

peft_train_config=TrainingArguments(
    output_dir=peft_output_dir,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    save_steps=10,  # Save every 10 steps (or adjust as needed)
    save_total_limit=2,  # Only keep the last 2 checkpoints (or adjust as needed)
    max_steps=20
)

peft_trainer=Trainer(
    model=peft_model,
    args=peft_train_config,
    train_dataset=tokenize_dataset['train'],
    eval_dataset=tokenize_dataset['validation']
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
peft_trainer.train()

Step,Training Loss
1,49.25
2,48.0
3,51.25
4,48.5
5,49.0
6,49.5
7,48.25
8,48.0
9,51.0
10,50.25




TrainOutput(global_step=20, training_loss=49.4375, metrics={'train_runtime': 47.5381, 'train_samples_per_second': 3.366, 'train_steps_per_second': 0.421, 'total_flos': 109213751181312.0, 'train_loss': 49.4375, 'epoch': 1.25})

In [18]:
# peft_trainer.model.save_pretrained(peft_output_dir)
peft_trainer.model.save_pretrained("/content/dialogue_summary_peft_training-1716534128/checkpoint-20")




In [17]:
tokenizer.save_pretrained(peft_output_dir)

('./dialogue_summary_peft_training-1716534128/tokenizer_config.json',
 './dialogue_summary_peft_training-1716534128/special_tokens_map.json',
 './dialogue_summary_peft_training-1716534128/spiece.model',
 './dialogue_summary_peft_training-1716534128/added_tokens.json',
 './dialogue_summary_peft_training-1716534128/tokenizer.json')

In [25]:
#--load peft and original model
from peft import PeftModel, PeftConfig

# peft_model_path="/content/dialogue_summary_peft_training-1716531917/checkpoint-20"

base_model_for_peft=AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.bfloat16)
tokenizer=AutoTokenizer.from_pretrained(model_name)


peft_model=PeftModel.from_pretrained(base_model_for_peft,"/content/dialogue_summary_peft_training-1716534128/checkpoint-20",torch_dtype=torch.bfloat16,is_trainable=False)

In [26]:
print(print_model_trainable_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable parameters: 0.0
None


In [29]:
# peft_model

In [27]:
dialogues=dataset['test'][0:10]['dialogue']
human_baseline_summaries=dataset['test'][0:10]['summary']

base_model_summaries=[]
peft_model_summaries=[]

for _,dialogue in enumerate(dialogues):

    prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:  """

    input_ids=tokenizer(prompt,return_tensors='pt').input_ids.to('cuda')

    original_model_outputs=original_model.generate(input_ids,
                                                   max_length=200)
    original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)
    base_model_summaries.append(original_model_text_output)


    peft_model_outputs=peft_model.generate(input_ids)
    peft_model_text_output=tokenizer.decode(peft_model_outputs[0],skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)


zipped_summaries=list(zip(human_baseline_summaries,base_model_summaries,peft_model_summaries))

df=pd.DataFrame(zipped_summaries,columns=['human_baseline_summaries','base_model_summaries','peft_model_summaries'])

df

TypeError: PeftModelForSeq2SeqLM.generate() takes 1 positional argument but 2 were given

In [None]:
original_model_results=rouge.compute(
    predictions=base_model_summaries,
    references=human_baseline_summaries[0:len(base_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

full_fine_tuned_model_results=rouge.compute(
    predictions=full_fine_tuned_model_summaries,
    references=human_baseline_summaries[0:len(full_fine_tuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

print("original base model results :")
print(original_model_results)

print("full fine tuned model results :")
print(full_fine_tuned_model_results)

In [23]:
index=200

dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation

{dialogue}

Summary:
"""

inputs=tokenizer(prompt,return_tensors="pt")

peft_model_output=tokenizer.decode(peft_model.generate(
    inputs['input_ids'].to('cuda'),
    max_length=200)[0],
                        skip_special_tokens=True
)


dash_line="-".join("" for x in range(100))

# print(dash_line)
# print(f'input prompt: \n{prompt}')
print(dash_line)
print(f'input human summary: \n{summary}')
print(dash_line)
print(f'peft model summary: \n{peft_model_output}')

TypeError: PeftModelForSeq2SeqLM.generate() takes 1 positional argument but 2 were given