<a href="https://colab.research.google.com/github/mosahle7/ML/blob/main/W2L1_Finetune_LLM_for_Dialogue_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

**Load Dataset and LLM**

In [None]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
model_name = 'google/flan-t5-base'
og_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [None]:
def num_train_paras(model):
  train_paras  = 0
  all_model_paras = 0
  for _, param in model.named_parameters():
    all_model_paras+=param.numel()
    if param.requires_grad:
      train_paras+=param.numel()
  return f"trainable parameters: {train_paras}\nall model parameters: {all_model_paras}"

In [None]:
print(num_train_paras(og_model))

trainable parameters: 247577856
all model parameters: 247577856


**Test Model with ZSI**

In [None]:
ind = 200
dialogue = dataset['test'][ind]['dialogue']
summary = dataset['test'][ind]['summary']

prompt = f"""
Summarize the following conversation:

{dialogue}

Summary:
"""

In [None]:
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    og_model.generate(
      inputs["input_ids"],
      max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation:

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [None]:
inputs

{'input_ids': tensor([[12198,  1635,  1737,     8,   826,  3634,    10,  1713,   345, 13515,
           536,  4663,    10,  2114,    25,  1702, 21066,    39,   358,    58,
          1713,   345, 13515,   357,  4663,    10,  2163,     6,    68,    27,
            31,    51,    59,   417,   125,  1776,    27,   133,   174,     5,
          1713,   345, 13515,   536,  4663,    10,   148,   228,  1099,  2651,
             3,     9,  3924,   478,    12,    39,   889,     5,    94,   133,
           995,    25,    12,   143,    95,    39,   293,  3971,   277,    11,
         11662,     7,    21,  3662,     5,  1713,   345, 13515,   357,  4663,
            10,   466,   133,    36,     3,     9,     3, 14339,  4023,     5,
          1713,   345, 13515,   536,  4663,    10,   148,   429,    92,   241,
            12,  5941,    39,  4214,   250,    34,    19,  1134, 21643,   230,
             5,  1713,   345, 13515,   357,  4663,    10,   571,    54,    62,
           103,    24,    58,  1713,  

**Full Fine-Tuning**

**Preprocess Dataset**

In [None]:
def tokenize_fun(eg):
  start_prompt = 'Summarize the following conversation: \n\n'
  end_prompt = '\n\nSummary: '
  prompt = [start_prompt+dialogue+end_prompt for dialogue in eg["dialogue"]]
  eg['input_ids'] = tokenizer(prompt, padding="max_length",truncation=True, return_tensors="pt").input_ids
  eg['labels'] = tokenizer(eg["summary"], padding="max_length",truncation=True, return_tensors="pt").input_ids
  return eg

In [None]:
tokenized_datasets = dataset.map(tokenize_fun, batched=True)
tokenized_datasets

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
})

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(['id','topic','dialogue','summary'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [None]:
tokenized_datasets.filter(
    lambda example, ind: ind%100 == 0,
    with_indices=True
)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})

**Fine-Tune Model with Preprocessed Dataset**

In [8]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# output_dir =  f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir = '/content/drive/MyDrive/dialogue-summary-training-checkpoints'

training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 1e-5,
    num_train_epochs = 1,
    weight_decay = 0.01,
    logging_steps = 1,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    per_device_train_batch_size=4,
    max_steps=1,
    report_to="none"
)

trainer = Trainer(
    model=og_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,45.75


TrainOutput(global_step=1, training_loss=45.75, metrics={'train_runtime': 14.9049, 'train_samples_per_second': 0.268, 'train_steps_per_second': 0.067, 'total_flos': 2739029409792.0, 'train_loss': 45.75, 'epoch': 0.00032102728731942215})

In [None]:
from pathlib import Path

In [None]:
checkpoint_path = Path("/content/drive/MyDrive/dialogue-summary-training-checkpoints/checkpoint-1")

In [None]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint_path,
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
)

In [None]:
!ls "/content/drive/MyDrive/dialogue-summary-training-checkpoints/checkpoint-1"

config.json		model.safetensors  rng_state.pth  trainer_state.json
generation_config.json	optimizer.pt	   scheduler.pt   training_args.bin


In [4]:
import nbformat; nb = nbformat.read(open("W2L1_Finetune_LLM_for_Dialogue_Summarization.ipynb"), as_version=4); nb['metadata'].pop('widgets', None); nbformat.write(nb, open("W2L1_Finetune_LLM_for_Dialogue_Summarization.ipynb", "w"))


FileNotFoundError: [Errno 2] No such file or directory: 'W2L1_Finetune_LLM_for_Dialogue_Summarization.ipynb'

In [10]:
import nbformat

# Path to your notebook saved in Google Drive
notebook_path = "/content/drive/MyDrive/Colab Notebooks/W2L1 Finetune LLM for Dialogue Summarization.ipynb"

# Load the notebook
nb = nbformat.read(open(notebook_path), as_version=4)

# Remove 'widgets' metadata
nb['metadata'].pop('widgets', None)

# Save it back
nbformat.write(nb, open(notebook_path, "w"))

print("Notebook cleaned. You can now push it to GitHub.")


Notebook cleaned. You can now push it to GitHub.


In [11]:
import nbformat

# Path with spaces
notebook_path = "/content/drive/MyDrive/Colab Notebooks/W2L1 Finetune LLM for Dialogue Summarization.ipynb"

# Load the notebook
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

# Remove 'widgets' metadata if it exists
nb['metadata'].pop('widgets', None)

# Save it back cleanly
with open(notebook_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print("✅ Cleaned and saved without widgets metadata.")


✅ Cleaned and saved without widgets metadata.


In [12]:
import json

with open(notebook_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print(data["metadata"].keys())  # Should NOT include 'widgets'


dict_keys(['accelerator', 'colab', 'kernelspec', 'language_info'])
