In [5]:
!pip install torch torchdata transformers datasets evaluate rouge_score loralib peft --quiet

In [None]:
#from datasets import load_dataset
#from transformers import AutoModelForSeq2SeqLM, -- python library for transformers
#AutoTokenizer, 
#GenerationConfig, 
#TrainingArguments -- ways that simplifies code when fine tuning lllm
#Trainer -- ways that simplifies code when fine tuning lllm
#import torch
#import time
#import evaluate
#import pandas as pd
#import numpy as np

In [6]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [7]:
ds_hf = "knkarthick/dialogsum"

ds = load_dataset(ds_hf)

Found cached dataset csv (C:/Users/natalr2/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [9]:
# load the model using Flan t5 -- flan t5 is a general model that can do wide variety of tasks

model_name= 'google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) #torch_dtype=torch.bfloat16 specify the memory type to be used in this model

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params} \npercentage of trainable model parameters: ({all_model_params}/{trainable_model_params})\n"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856 
percentage of trainable model parameters: (247577856/247577856)



In [17]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation = True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding = "max_length", truncation = True, return_tensors="pt").input_ids
    
    return example
    
# dataset contains train, validation, test
# tokenize_function is handling all data accross splits in batches
tokenized_datasets = ds.map(tokenize_function, batched= True)
tokenized_datasets = tokenized_datasets.remove_columns(['id','topic','dialogue', 'summary',])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices = True) #subsampling to save time

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [22]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(125, 2)
(5, 2)
(15, 2)


In [23]:
print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [24]:
#utilizing the hf TRainer class, passing the pre processed dataset with reference to the original model, adding some training parameters experimentally

output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1, #increase the number
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1 #increase size
)

trainer = Trainer(
    model=original_model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
)

In [25]:
trainer.train()

Step,Training Loss
1,47.5


TrainOutput(global_step=1, training_loss=47.5, metrics={'train_runtime': 175.8005, 'train_samples_per_second': 0.046, 'train_steps_per_second': 0.006, 'total_flos': 5478058819584.0, 'train_loss': 47.5, 'epoch': 0.06})

In [None]:
# download checkpoint of fully fine tuned model to use in the rest of the note book
#!aws s3 cp --recursive s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/ ./flan-dialogue-summary-checkpoint/

In [None]:
#!ls -alh ./flan-dialogue-summary-checkpoint/pytorch_model.bin

In [30]:
trainer.save_model("/dialogue-summary-training-1702507973")

In [36]:
#create a instance of the Automodelfor seq2seqlmclass for the instrument model:

instruct_model = AutoModelForSeq2SeqLM.from_pretrained("/dialogue-summary-training-1702507973/", torch_dtype=torch.bfloat16)

In [42]:
#evaluate the model quantitativelly -- ROUGE

index = 200
dialogue = ds['test'][index]['dialogue']
human_baseline_summary = ds['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""


input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))                                               
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
                                                 

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)                                                


dash_line = '-'.join('' for x in range(100))

                                                 
print(dash_line)
print(f'BASELINE HUMAN SUMMARY: \n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL: \n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT PROMPT:\n{instruct_model_text_output}')                                 

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL: 
#Person1#: I'm not sure what exactly I would need to upgrade my software.
---------------------------------------------------------------------------------------------------
INSTRUCT PROMPT:
#Person1: I'm thinking of upgrading your system.


In [43]:
#load rouge
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [47]:
dialogues = ds['test'][0:10]['dialogue']
human_baseline_summaries = ds['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)
    
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ["human_baseline_summaries", "original_model_summaries", "instruct_model_summaries"])

In [49]:
original_model_results = rouge.compute(
    predictions = original_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True,
)

instruct_model_results = rouge.compute(
    predictions = instruct_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True,
)


print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.21948246198246202, 'rouge2': 0.06779380040249605, 'rougeL': 0.18491961741961746, 'rougeLsum': 0.18630221630221633}
INSTRUCT MODEL:
{'rouge1': 0.24089921652421653, 'rouge2': 0.11769053708439897, 'rougeL': 0.22001958689458687, 'rougeLsum': 0.22134175465057818}


In [50]:
#PEFT/lora for fine
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, #rank -- high
    lora_alpha = 32,
    target_modules = ["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM #FLAN-T5
)
    

In [52]:
peft_model = get_peft_model(original_model, 
                            lora_config)

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800 
percentage of trainable model parameters: (251116800/3538944)



In [53]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size = True,
    learning_rate=1e-5, #higher than full fine-tuning
    num_train_epochs=1, #increase the number
    logging_steps=1,
    max_steps=1 #increase size
)

peft_trainer = Trainer(
    model=peft_model,
    args = peft_training_args,
    train_dataset = tokenized_datasets['train'],
)

In [54]:
peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,48.0


('./peft-dialogue-summary-checkpoint-local\\tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local\\special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local\\tokenizer.json')

In [60]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       "./peft-dialogue-summary-checkpoint-local/",
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False #setting to false will tell torch that I'm not interested in train the model, just get the model -- basically minimize the footprint
                                      )


In [61]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800 
percentage of trainable model parameters: (251116800/0)



In [62]:
#evaluate the model quantitativelly -- ROUGE

index = 200
dialogue = ds['test'][index]['dialogue']
human_baseline_summary = ds['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""


input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))                                               
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
                                                 

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)  

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)



dash_line = '-'.join('' for x in range(100))

                                                 
print(dash_line)
print(f'BASELINE HUMAN SUMMARY: \n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL: \n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT PROMPT:\n{instruct_model_text_output}')      
print(dash_line)
print(f'PEFT PROMPT:\n{peft_model_text_output}')   

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL: 
#Person1: I'd like to upgrade my computer hardware. #Person1: I'm not sure what I'd need. #Person1: I'd like to upgrade my computer. #Person1: I'd like to make my own flyers and banners. #Person2: I'd like to add a CD-ROM drive. #Person1: I'm not sure what I'd need. #Person2: I'd like to add a CD-ROM drive.
---------------------------------------------------------------------------------------------------
INSTRUCT PROMPT:
#Person1#: I'm thinking of upgrading my computer.
---------------------------------------------------------------------------------------------------
PEFT PROMPT:
#Person1#: I'm thinking of upgrading my computer.


In [67]:
dialogues = ds['test'][0:10]['dialogue']
human_baseline_summaries = ds['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)
    
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)
    
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ["human_baseline_summaries", "original_model_summaries", "instruct_model_summaries", "peft_model_summaries"])

In [68]:
original_model_results = rouge.compute(
    predictions = original_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True,
)

instruct_model_results = rouge.compute(
    predictions = instruct_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True,
)


peft_model_results = rouge.compute(
    predictions = peft_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True,
)



print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results) #despite the similar results, PEFT uses way less resources in a big scale

ORIGINAL MODEL:
{'rouge1': 0.2657862258847701, 'rouge2': 0.11158190601668863, 'rougeL': 0.22359027281627897, 'rougeLsum': 0.2275186552353735}
INSTRUCT MODEL:
{'rouge1': 0.24089921652421653, 'rouge2': 0.11769053708439897, 'rougeL': 0.22001958689458687, 'rougeLsum': 0.22134175465057818}
PEFT MODEL:
{'rouge1': 0.241950545026632, 'rouge2': 0.1179539641943734, 'rougeL': 0.22166387959866218, 'rougeLsum': 0.22283940294809862}
