In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import PeftModel, PeftConfig


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForSeq2SeqLM 

config = AutoConfig.from_pretrained("google/flan-t5-base",torch_dtype=torch.bfloat16)

with init_empty_weights():
  model_config = AutoModelForSeq2SeqLM.from_config(config)


use this when working with large models. For example Flan T5 xxl is 11b, this would split the model layers on different devices (gpu, cpu, disk..)

In [3]:
device_map = infer_auto_device_map(model_config) 

In [4]:
device_map

{'': 0}

In [5]:

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base",torch_dtype=torch.bfloat16, device_map=device_map, resume_download=True,offload_folder="offload")
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# lab_dataset_name = "knkarthick/dialogsum"
samsum_dataset = "samsum"
dataset = load_dataset(samsum_dataset)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# Let's see what the baseline is

In [7]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries))#, instruct_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Amanda can't find Betty's number. Amanda will ...
1,Eric and Rob are going to watch a stand-up on ...,Eric and Rob are watching a stand-up. Eric and...
2,Lenny can't decide which trousers to buy. Bob ...,Lenny wants to buy two pairs of purple trouser...
3,Emma will be home soon and she will let Will k...,Emma will be home soon. Will will pick her up.
4,Jane is in Warsaw. Ollie and Jane has a party....,Jane lost her calendar. Ollie and Jane have lu...
5,Hilary has the keys to the apartment. Benjamin...,Hilary and Elliot are meeting at the conferenc...
6,Payton provides Max with websites selling clot...,Payton likes shopping but he doesn't always bu...
7,Rita and Tina are bored at work and have still...,Rita is tired and is not able to concentrate a...
8,"Beatrice wants to buy Leo a scarf, but he does...","Beatrice is in town, shopping. She has a scarf..."
9,Eric doesn't know if his parents let him go to...,Eric is coming to Ivan's brother's wedding. Er...


In [8]:
rouge = evaluate.load('rouge')
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)

ORIGINAL MODEL:
{'rouge1': 0.4679849216912261, 'rouge2': 0.22526047821224032, 'rougeL': 0.382630836947357, 'rougeLsum': 0.37794076244345265}


# Now we move onto tokenizing the data and preparing it for fine tuning

In [9]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [10]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").to(device_map).input_ids 
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").to(device_map).input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Map:   7%|▋         | 1000/14732 [00:00<00:04, 2989.55 examples/s]Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Map:  14%|█▎        | 2000/14732 [00:00<00:03, 3473.46 examples/s]Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Map:  20%|██        | 3000/14732 [00:00<00:03, 3708.84 examples/s]Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Map:  27%|██▋       | 4000/14732 [00:01<00:02, 3823.14 examples/s]Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is n

filtering the dataset, just to verify everything is working. You could skip this step. 

In [11]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 2 == 0, with_indices=True)
tokenized_datasets

Filter: 100%|██████████| 14732/14732 [00:03<00:00, 4758.12 examples/s]
Filter: 100%|██████████| 819/819 [00:00<00:00, 4769.11 examples/s]
Filter: 100%|██████████| 818/818 [00:00<00:00, 4685.33 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 7366
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 410
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 409
    })
})

we only need to feed input ids and labels to llm

In [12]:

tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary',]) # we only need to feed input ids and labels to llm

In [13]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

## I run out of memory, so I skip this part and build a peft model. You can omit the peft part if you have enough memory, but don't forget to start the training.
### if wandb doesn't automatically track this job, you can import wandb package. wandb.login(<api_key>) and within the training args, report_to="wandb"

In [14]:
# trainer.train() #will run out of mem on my local machine, 16gb 4080 Nividia. 

# using peft to fine tune

In [15]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # seq 2 seq :)
)

In [16]:
peft_model = get_peft_model(model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))# updating 1.2% of the model

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [17]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir, # where to store checkpoints 
    auto_find_batch_size=True,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_steps=1,
    max_steps=20    
) # could use  save_total_limit=2,overwrite_output_dir=True to limit number of checkpoints saved.
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

### # if wandb doesn't automatically track this job, you can import wandb package. wandb.login(<api_key>) and within the training args, report_to="wandb"

In [18]:
peft_trainer.train()
# if wandb doesn't automatically track this job, you can import wandb package. wandb.login(<api_key>) and within the training args, report_to="wandb"

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiklpuerto69[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,44.25
2,44.75
3,45.25
4,47.0
5,43.0
6,46.0
7,45.25
8,44.25
9,43.25
10,47.0


TrainOutput(global_step=20, training_loss=45.7125, metrics={'train_runtime': 11.2898, 'train_samples_per_second': 14.172, 'train_steps_per_second': 1.772, 'total_flos': 111300638146560.0, 'train_loss': 45.7125, 'epoch': 0.02})

You can use these to access the tuned models without saving. But for completeness, I show how to save and load peft adapter below. 

In [19]:
#  trainer.model.generate() 
# peft_trainer.model.generate()

In [20]:

# peft_model_path="./peft-dialogue-summary-checkpoint-local" # where to store model
# peft_trainer.model.save_pretrained(peft_model_path)
# tokenizer.save_pretrained(peft_model_path)

Using this to load model back in. Notice you need load the base model, then your own fine tuned peft adapter, with the base model, using the PeftModel function. 

In [21]:

# from peft import PeftModel, PeftConfig
# peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-large")
# peft_model = PeftModel.from_pretrained(peft_model_base, 
#                                        './peft-dialogue-summary-checkpoint-local/', 
#                                        torch_dtype=torch.bfloat16,
#                                        is_trainable=False).to('cuda')

In [22]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Using the finetuned model right after training, didn't even save it. 
peft_model_outputs = peft_trainer.model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)) # NOTICE THAT YOU CAN YOUS THE MODEL RIGHT AFTER TRAINING. Traine.Model.Generate <---
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
dash_line = "-----------"*10
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

--------------------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Sam won't finish work till 5. Sam is bringing him over about 9 am. Sam will see Abdellilah in the morning. 
--------------------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Sam is bringing Abdellilah over tonight at about 9.
--------------------------------------------------------------------------------------------------------------
PEFT MODEL: Abdellilah and Sam are meeting tonight at 9 am.


In [23]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_trainer.model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    # instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Amanda needs Hannah's number. Hannah wants to ...,Hannah wants to text her boyfriend. Amanda wil...
1,Eric and Rob are going to watch a stand-up on ...,Eric likes the train part of the show. Eric wi...,Eric and Rob are going to watch a Russian stan...
2,Lenny can't decide which trousers to buy. Bob ...,Lenny wants to buy purple trousers. Bob advise...,Bob recommends Lenny to buy a pair of purple t...
3,Emma will be home soon and she will let Will k...,Emma is going home tonight. Will will pick her...,Will and Emma are going to have dinner tonight...
4,Jane is in Warsaw. Ollie and Jane has a party....,Jane is in Warsaw. Ollie reminds Jane that the...,Jane forgot about the party on the 18th and 19...
5,Hilary has the keys to the apartment. Benjamin...,Hilary will meet Hilary at lunchtime and they ...,"Hilary, Elliot, Hilary and Hilary will meet Hi..."
6,Payton provides Max with websites selling clot...,Payton recommends buying clothes from a lot of...,Max wants to buy clothes from Payton.
7,Rita and Tina are bored at work and have still...,Rita is tired and is nay nauseous at work.,Rita is very tired and is tired at work. Tina ...
8,"Beatrice wants to buy Leo a scarf, but he does...",Leo doesn't need a scarf. Beatrice will buy on...,Beatrice is in town. She is looking for a scar...
9,Eric doesn't know if his parents let him go to...,Eric is coming to the wedding. Eric has a lot ...,Ivan is going to the wedding. Eric is not sure...


In [24]:
rouge = evaluate.load('rouge')



original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print(dash_line)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.38409263171848673, 'rouge2': 0.10797583164329998, 'rougeL': 0.31090430499732824, 'rougeLsum': 0.30698142167909614}
--------------------------------------------------------------------------------------------------------------
PEFT MODEL:
{'rouge1': 0.4149934904657543, 'rouge2': 0.1887847474803997, 'rougeL': 0.3265277737241755, 'rougeLsum': 0.3270087484292882}
