In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import PeftModel, PeftConfig


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
lab_dataset_name = "knkarthick/dialogsum"
samsum_dataset = "samsum"
# dataset = load_dataset(huggingface_dataset_name)
dataset = load_dataset(samsum_dataset)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [3]:
from transformers import BartTokenizer, BartModel, AutoModelForSeq2SeqLM
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base',torch_dtype=torch.bfloat16).to('cuda')

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 139420416
all model parameters: 139420416
percentage of trainable model parameters: 100.00%


In [5]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue.replace('#','').replace('\n',' ') + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").to('cuda').input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").to('cuda').input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 14732/14732 [00:14<00:00, 1038.69 examples/s]


In [6]:
# tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 5 == 0, with_indices=True)
# tokenized_datasets

Filter: 100%|██████████| 14732/14732 [00:05<00:00, 2577.73 examples/s]
Filter: 100%|██████████| 819/819 [00:00<00:00, 2558.72 examples/s]
Filter: 100%|██████████| 818/818 [00:00<00:00, 2409.90 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 2947
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 164
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 164
    })
})

In [7]:

tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary',])

In [8]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [9]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiklpuerto69[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,15.625


TrainOutput(global_step=1, training_loss=15.625, metrics={'train_runtime': 5.2914, 'train_samples_per_second': 1.512, 'train_steps_per_second': 0.189, 'total_flos': 4877891665920.0, 'train_loss': 15.625, 'epoch': 0.0})

In [10]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries))#, instruct_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Summarize the following conversation.Hannah: H...
1,Eric and Rob are going to watch a stand-up on ...,SumGates:Summarize the following. warr;;;;;;;;...
2,Lenny can't decide which trousers to buy. Bob ...,Summarize the following conversation. Palestin...
3,Emma will be home soon and she will let Will k...,Summarize the following conversation. ________...
4,Jane is in Warsaw. Ollie and Jane has a party....,": we have a party, we have to be there, we mus..."
5,Hilary has the keys to the apartment. Benjamin...,Hebrewing: I'm going to take the keys. I'm goi...
6,Payton provides Max with websites selling clot...,"maxwell: I'll be there, I'll be there, I'll be..."
7,Rita and Tina are bored at work and have still...,Summarize the following conversation. <@ <@ <@...
8,"Beatrice wants to buy Leo a scarf, but he does...",Summarize the following conversation.Sumbrace:...
9,Eric doesn't know if his parents let him go to...,Healso: you can't resist the new-Summarize the...


In [11]:
rouge = evaluate.load('rouge')
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)

ORIGINAL MODEL:
{'rouge1': 0.1286948901871337, 'rouge2': 0.03782576591786982, 'rougeL': 0.09377177230202544, 'rougeLsum': 0.09504498814000252}


In [12]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [13]:
peft_model = get_peft_model(model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1769472
all model parameters: 141189888
percentage of trainable model parameters: 1.25%


In [14]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [15]:
peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss
1,15.25


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/vocab.json',
 './peft-dialogue-summary-checkpoint-local/merges.txt',
 './peft-dialogue-summary-checkpoint-local/added_tokens.json')

In [16]:
from peft import PeftModel, PeftConfig

peft_model = PeftModel.from_pretrained(model, 
                                       './peft-dialogue-summary-checkpoint-local/', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [17]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)


peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
dash_line = "-----------"*10
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

--------------------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Sam won't finish work till 5. Sam is bringing him over about 9 am. Sam will see Abdellilah in the morning. 
--------------------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Summarize the following conversation. exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting extingAbdellilah: Where are you? exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting extingAbdellilah: What time you finish? exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting exting extingAbdellilah: Are your bringing him over tonight: exting exting exting exting exting exting exting exting ext

In [18]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    # instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Hannah: ______________________________________...,Hannah: ______________________________________...
1,Eric and Rob are going to watch a stand-up on ...,Summarize the following conversation. exting e...,Summarize the following conversation. exting e...
2,Lenny can't decide which trousers to buy. Bob ...,Summarize the following conversation. exting e...,Summarize the following conversation. exting e...
3,Emma will be home soon and she will let Will k...,Emma: ...,Emma: ...
4,Jane is in Warsaw. Ollie and Jane has a party....,Summarize the following conversation. exting e...,Summarize the following conversation. exting e...
5,Hilary has the keys to the apartment. Benjamin...,______________________________________________...,______________________________________________...
6,Payton provides Max with websites selling clot...,Summarize the following conversation.=========...,Summarize the following conversation.=========...
7,Rita and Tina are bored at work and have still...,----------------------------------------------...,----------------------------------------------...
8,"Beatrice wants to buy Leo a scarf, but he does...",Summarize the following conversation. Heh. Heh...,Summarize the following conversation. Heh. Heh...
9,Eric doesn't know if his parents let him go to...,Summarize the following conversation. exting e...,Summarize the following conversation. exting e...


In [19]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print(dash_line)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.12187749464158132, 'rouge2': 0.0264697567841862, 'rougeL': 0.1007889150831274, 'rougeLsum': 0.10057937832803249}
--------------------------------------------------------------------------------------------------------------
PEFT MODEL:
{'rouge1': 0.12187749464158132, 'rouge2': 0.0264697567841862, 'rougeL': 0.1007889150831274, 'rougeLsum': 0.10057937832803249}
