In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import PeftModel, PeftConfig


In [3]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForSeq2SeqLM 

config = AutoConfig.from_pretrained("google/flan-t5-base",torch_dtype=torch.bfloat16)

with init_empty_weights():
  model_config = AutoModelForSeq2SeqLM.from_config(config)


In [4]:
device_map = infer_auto_device_map(model_config) #use this when working with large models. For example Flan T5 xxl is 11b, this would split the model layers on different devices (gpu, cpu, disk..)

In [5]:
device_map

{'': 0}

In [6]:

model = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large",torch_dtype=torch.bfloat16, device_map=device_map, resume_download=True,offload_folder="offload")
tokenizer = AutoTokenizer.from_pretrained('declare-lab/flan-alpaca-large')

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-large and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# lab_dataset_name = "knkarthick/dialogsum"
samsum_dataset = "samsum"
dataset = load_dataset(samsum_dataset)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# Let's see what the baseline is

In [23]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries))#, instruct_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Amanda can't find Betty's phone number. She su...
1,Eric and Rob are going to watch a stand-up on ...,Eric and Rob are discussing a Russian stand-up...
2,Lenny can't decide which trousers to buy. Bob ...,Bob helps Lenny choose the best pair of trouse...
3,Emma will be home soon and she will let Will k...,Emma is not hungry and will be home soon.
4,Jane is in Warsaw. Ollie and Jane has a party....,Ollie and Jane are discussing their plans for ...
5,Hilary has the keys to the apartment. Benjamin...,Benjamin will join Hilary and Elliot for the d...
6,Payton provides Max with websites selling clot...,Max will check out some good sites to buy clot...
7,Rita and Tina are bored at work and have still...,Rita is feeling tired and is struggling to sta...
8,"Beatrice wants to buy Leo a scarf, but he does...",Beatrice is shopping for a scarf. Leo doesn't ...
9,Eric doesn't know if his parents let him go to...,Eric is not sure if he will be able to attend ...


In [24]:
rouge = evaluate.load('rouge')
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)

ORIGINAL MODEL:
{'rouge1': 0.47702423811341355, 'rouge2': 0.15552013016300872, 'rougeL': 0.32264984325924867, 'rougeLsum': 0.3200369799320997}


# Now we move onto tokenizing the data and preparing it for fine tuning

In [8]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 783150080
all model parameters: 783150080
percentage of trainable model parameters: 100.00%


In [9]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").to(device_map).input_ids 
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").to(device_map).input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Attempting to cast a BatchEncoding to type {'': 0}. This is not supported.
Map: 100%|██████████| 819/819 [00:00<00:00, 2966.46 examples/s]


In [10]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 2 == 0, with_indices=True) # filtering the dataset, just to verify everything is working. You could skip this step. 
tokenized_datasets

Filter: 100%|██████████| 819/819 [00:00<00:00, 4696.12 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 7366
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 410
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
        num_rows: 409
    })
})

In [11]:

tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary',]) # we only need to feed input ids and labels to llm

In [12]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

## I run out of memory, so I skip this part and build a peft model. You can omit the peft part if you have enough memeory, but don't forget to start the training. 

In [12]:
# trainer.train() #will run out of mem on my local machine, 16gb 4080 Nividia. 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiklpuerto69[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 15.99 GiB total capacity; 30.01 GiB already allocated; 0 bytes free; 30.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# going to use peft to fine tune

In [16]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # seq 2 seq :)
)

In [17]:
peft_model = get_peft_model(model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))# updating 1.2% of the model

trainable model parameters: 9437184
all model parameters: 792587264
percentage of trainable model parameters: 1.19%


In [19]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir, # where to store checkpoints 
    auto_find_batch_size=True,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_steps=1,
    max_steps=20    
) # could use  save_total_limit=2,overwrite_output_dir=True to limit number of checkpoints saved.
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [20]:
peft_trainer.train()


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiklpuerto69[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,35.0
2,36.0
3,34.25
4,35.25
5,36.0
6,33.75
7,36.25
8,36.25
9,35.75
10,34.75


TrainOutput(global_step=20, training_loss=35.5125, metrics={'train_runtime': 66.9286, 'train_samples_per_second': 1.195, 'train_steps_per_second': 0.299, 'total_flos': 186700718407680.0, 'train_loss': 35.5125, 'epoch': 0.01})

In [None]:
# You can use these to access the tuned models without saving. But for completeness, I show how to save and load peft adapter below. 
#  trainer.model.generate() 
# peft_trainer.model.generate()

In [None]:

# peft_model_path="./peft-dialogue-summary-checkpoint-local" # where to store model
# peft_trainer.model.save_pretrained(peft_model_path)
# tokenizer.save_pretrained(peft_model_path)

In [30]:
# Using this to load model back in. Notice you need load the base model, then your own fine tuned peft adapter, with the base model, using the PeftModel function. 
# from peft import PeftModel, PeftConfig
# peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-large")
# peft_model = PeftModel.from_pretrained(peft_model_base, 
#                                        './peft-dialogue-summary-checkpoint-local/', 
#                                        torch_dtype=torch.bfloat16,
#                                        is_trainable=False).to('cuda')

In [21]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Using the finetuned model right after training, didn't even save it. 
peft_model_outputs = peft_trainer.model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)) # NOTICE THAT YOU CAN YOUS THE MODEL RIGHT AFTER TRAINING. Traine.Model.Generate <---
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
dash_line = "-----------"*10
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

--------------------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Sam won't finish work till 5. Sam is bringing him over about 9 am. Sam will see Abdellilah in the morning. 
--------------------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Sam is at work and he finishes at 5 pm. He is bringing over his friend, who he hasn't met yet, at about 9 pm.
--------------------------------------------------------------------------------------------------------------
PEFT MODEL: Sam is finishing work at 5pm and will bring him over to Abdellilah's place in the morning.


In [22]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").to('cuda').input_ids

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_trainer.model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    # instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,"Hannah is looking for Betty's number, but Aman...",Amanda hasn't found Betty's number. She sugges...
1,Eric and Rob are going to watch a stand-up on ...,Eric and Rob are discussing the funny video th...,Eric and Rob are discussing a stand-up by a Ru...
2,Lenny can't decide which trousers to buy. Bob ...,Lenny is looking for a new pair of trousers. B...,Bob suggests buying two pairs of black trouser...
3,Emma will be home soon and she will let Will k...,Emma is not hungry and will be home soon.,Emma is going to be home soon and will tell Wi...
4,Jane is in Warsaw. Ollie and Jane has a party....,Ollie and Jane will meet on Friday for tea. Ol...,Jane is in Warsaw. Ollie and Jane are having a...
5,Hilary has the keys to the apartment. Benjamin...,Benjamin and Hilary are going to meet at the c...,Benjamin and Hilary are going to meet at 2 pm ...
6,Payton provides Max with websites selling clot...,Payton recommends two or three good websites t...,Payton will check out the different sites Max ...
7,Rita and Tina are bored at work and have still...,Rita and Tina are both exhausted and are strug...,Rita and Tina are both exhausted and frustrate...
8,"Beatrice wants to buy Leo a scarf, but he does...",Beatrice is shopping in the town. She is looki...,Beatrice is shopping in the shop next to the c...
9,Eric doesn't know if his parents let him go to...,Eric is not sure if he will be coming to his b...,Ivan will take care of Eric's parents's needs ...


In [34]:
rouge = evaluate.load('rouge')



original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print(dash_line)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.3217981141980334, 'rouge2': 0.10913502599809125, 'rougeL': 0.25168251357213534, 'rougeLsum': 0.25222310023454186}
--------------------------------------------------------------------------------------------------------------
PEFT MODEL:
{'rouge1': 0.35288076534605095, 'rouge2': 0.11493664411813231, 'rougeL': 0.2844235151687794, 'rougeLsum': 0.283610472781488}
