In [2]:
%pip install torch==1.13.1 --quiet
%pip install transformers==4.27.2 datasets==2.11.0 evaluate==0.4.0 peft==0.3.0 --quiet

In [3]:
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import evaluate
import pandas as pd
import numpy as np
import json

In [5]:
jsonl_file = '/data/custom_ds.jsonl'

data = []
file = open(jsonl_file, 'r',encoding='utf-8')
for line in file:
    data.append(json.loads(line))

In [6]:
data[0]

{'dialogue': '#C1#:Hey, do you know the secret to perfectly cooked rice?\n#C2#:Oh, absolutely! Its all about the water-to-rice ratio. I go for two cups of water for every cup of rice.\n#C1#: Really? Ive always used one and a half cups.\n#C2#: Well, that works too. Its about finding what works best for your rice and your taste.\n#C1#: True. And how about the type of rice? Long grain or short grain?\n#C2#: It depends on the dish. Long grain for pilaf, short grain for sushi. What are you cooking?\n#C1#: Im thinking of a stir-fry tonight. Maybe Ill go with jasmine rice.\n#C2#: Great choice! It has such a fragrant aroma.Just make sure to rinse it thoroughly before cooking.\n',
 'summary': '#C2# tells #C1# how to cook rice perfectly'}

In [7]:
data = pd.DataFrame(data)
data.head()

Unnamed: 0,dialogue,summary
0,"#C1#:Hey, do you know the secret to perfectly ...",#C2# tells #C1# how to cook rice perfectly
1,#C1#: I can never decide which rice to buy. Th...,#C1# and #C2# discuss about types of rice spec...
2,"#C1#: Hosting a dinner party, and rice is on t...",#C1# and #C2# discuss about how to cook rice i...
3,"#C1#: I tried making sushi rice at home, but i...",#C1# asks #C2# how to make sushi and #C2# teac...
4,#C1#: Have you ever tried black rice?\n#C2#: Y...,"The characters discuss black rice, highlightin..."


In [8]:
dataset = Dataset.from_pandas(data)

In [16]:
model_name='google/flan-t5-base'

#original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map = "auto", torch_dtype=torch.float16)
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained(model_name,device_map="auto")

# or
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto", torch_dtype=torch.float16)

In [10]:
# print(original_model.config)
print(tokenizer)

T5TokenizerFast(name_or_path='google/flan-t5-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>'

In [11]:
hidden_layers = original_model.config.num_hidden_layers
activation_function = original_model.config.dense_act_fn
hidden_layers,activation_function

(12, 'gelu_new')

In [None]:
# original_model.get_encoder()  # Encoder Config
# original_model.get_decoder()  # Decoder Config

In [12]:
def check_trainable_params(model):
    trainable_params = 0
    total_params = 0
    for layer_name, layer_wghts in model.named_parameters():
        total_params += layer_wghts.numel()
        if layer_wghts.requires_grad:
            trainable_params += layer_wghts.numel()
    return trainable_params,total_params

print(check_trainable_params(original_model))

(247577856, 247577856)


In [13]:
## CHECK Tokenizer

dialogue = dataset[0]['dialogue']
summary = dataset[0]['summary']

tokens = tokenizer(dialogue, return_tensors='pt')
tokens['input_ids'].shape,tokens['input_ids'].ndim

(torch.Size([1, 198]), 2)

In [17]:
## CHECK prediction on Original Model

dialogue = dataset[0]['dialogue']
summary = dataset[0]['summary']

prompt = f"""
Summarize the following conversation:
{dialogue}
Summary:
"""
inputs = tokenizer(prompt, return_tensors='pt')
#inputs = inputs['input_ids'].to("cuda")
inputs = inputs['input_ids']
model_summ = original_model.generate(inputs,max_new_tokens=20,temperature=0.1)
output = tokenizer.decode(model_summ[0])

print("\nInput Prompt:",prompt)
print("-------------------------")
print("\nHuman :",summary)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation:
#C1#:Hey, do you know the secret to perfectly cooked rice?
#C2#:Oh, absolutely! Its all about the water-to-rice ratio. I go for two cups of water for every cup of rice.
#C1#: Really? Ive always used one and a half cups.
#C2#: Well, that works too. Its about finding what works best for your rice and your taste.
#C1#: True. And how about the type of rice? Long grain or short grain?
#C2#: It depends on the dish. Long grain for pilaf, short grain for sushi. What are you cooking?
#C1#: Im thinking of a stir-fry tonight. Maybe Ill go with jasmine rice.
#C2#: Great choice! It has such a fragrant aroma.Just make sure to rinse it thoroughly before cooking.

Summary:

-------------------------

Human : #C2# tells #C1# how to cook rice perfectly
-------------------------
Model Output : <pad> #C1#: Hey, do you know the secret to perfectly cooked rice?</s>


In [26]:
# One Shot

dialogue21 = dataset[21]['dialogue']
summary21 = dataset[21]['summary']

prompt = f"""
Summarize the following conversation:
{dialogue21}
Summary:
{summary21}

Summarize the following conversation:
{dialogue}
"""
inputs = tokenizer(prompt, return_tensors='pt')
#inputs = inputs['input_ids'].to("cuda")
inputs = inputs['input_ids']
model_summ = original_model.generate(inputs,max_new_tokens=20,temperature=0.5)
output = tokenizer.decode(model_summ[0])

print("\nInput Prompt:",prompt)
print("-------------------------")
print("\nHuman :",summary)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation:
#C1#: Should we stay in a lodge within the jungle or a hotel arby?
#C2#: A jungle lodge for sure. Its part of the experience, waking up to the sounds of the wild. Hotels cant compete with that.

Summary:
The characters discuss accommodation preferences for the jungle safari, favoring the immersive experience of staying in a jungle lodge.

Summarize the following conversation:
#C1#:Hey, do you know the secret to perfectly cooked rice?
#C2#:Oh, absolutely! Its all about the water-to-rice ratio. I go for two cups of water for every cup of rice.
#C1#: Really? Ive always used one and a half cups.
#C2#: Well, that works too. Its about finding what works best for your rice and your taste.
#C1#: True. And how about the type of rice? Long grain or short grain?
#C2#: It depends on the dish. Long grain for pilaf, short grain for sushi. What are you cooking?
#C1#: Im thinking of a stir-fry tonight. Maybe Ill go with jasmine rice.
#C2#: Great ch

The speaker is going to stir-fry rice tonight : LOL :P:P !

In [27]:
# FEW SHOTS

prompt = f"""
Summarize the following conversation:
{dataset[25]['dialogue']}
Summary:
{dataset[25]['summary']}

Summarize the following conversation:
{dataset[27]['dialogue']}
Summary:
{dataset[27]['summary']}

Summarize the following conversation:
{dataset[30]['dialogue']}
Summary:
{dataset[30]['summary']}


Summarize the following conversation:
{dialogue}
"""
inputs = tokenizer(prompt, return_tensors='pt')
#inputs = inputs['input_ids'].to("cuda")
inputs = inputs['input_ids']
model_summ = original_model.generate(inputs,max_new_tokens=20,temperature=0.5)
output = tokenizer.decode(model_summ[0])

print("\nInput Prompt:",prompt)
print("-------------------------")
print("\nHuman :",summary)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation:
#C1#: Any ideas for the theme of our fashion show this year?
#C2#: How about a retro-inspired theme? Vintage fashion is making a comeback, and it allows for creative interpretations.

Summary:
The characters discuss potential themes for the fashion show, considering a retro-inspired theme for its versatility and current trendiness.

Summarize the following conversation:
#C1#: The dress rehearsal is tomorrow. Any last-minute concerns?
#C2#: Make sure the models are comfortable in their outfits and that the runway is well-lit. Small details can make a big difference.

Summary:
The characters discuss preparations for the dress rehearsal, focusing on model comfort and the importance of proper lighting on the runway.

Summarize the following conversation:
#C1#: We need sponsors for the fashion show. Any progress?
#C2#: Ive been in talks with a few local businesses. Theyre interested, especially if we can offer visibility and exclusive pa

The secret to perfectly cooked rice is to use two cups of water for every cup of rice: FAARR BETTER NOW

### PEFT/LoRA model for Fine-Tuning

In [28]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

Add LoRA adapter layers/parameters to the original LLM to be trained.

In [29]:
peft_model = get_peft_model(original_model,lora_config)
print(check_trainable_params(peft_model))

(3538944, 251116800)


In [45]:
3538944/251116800

0.014092820552029971

In [30]:
def tokenize_function(convo):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in convo["dialogue"]]
    convo['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    convo['labels'] = tokenizer(convo["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return convo

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['dialogue', 'summary'])

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [None]:
output_dir = f'/content/drive/My Drive/LLM practical/dialogue-summary-training/'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=5)

peft_trainer = Trainer(model=peft_model,args=peft_training_args,train_dataset=tokenized_dataset)

In [32]:
# peft_trainer.train()
peft_model_path="/content/drive/My Drive/LLM practical/dialogue-summary-training/peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)

In [34]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       device_map = "auto",
                                       is_trainable=False)

In [35]:
print(check_trainable_params(peft_model))

(0, 251116800)


In [37]:
## EVALUATION

dialogue = dataset[0]['dialogue']
baseline_human_summary = dataset[0]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

inputs = tokenizer(prompt, return_tensors="pt")
#input = inputs["input_ids"].to("cuda")
input = inputs["input_ids"]

output = peft_model.generate(input_ids = input, max_new_tokens=20)
output = tokenizer.decode(output[0], skip_special_tokens=True)

print("\nInput Prompt:",prompt)
print("-------------------------")
print("\nHuman :",summary)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation.

#C1#:Hey, do you know the secret to perfectly cooked rice?
#C2#:Oh, absolutely! Its all about the water-to-rice ratio. I go for two cups of water for every cup of rice.
#C1#: Really? Ive always used one and a half cups.
#C2#: Well, that works too. Its about finding what works best for your rice and your taste.
#C1#: True. And how about the type of rice? Long grain or short grain?
#C2#: It depends on the dish. Long grain for pilaf, short grain for sushi. What are you cooking?
#C1#: Im thinking of a stir-fry tonight. Maybe Ill go with jasmine rice.
#C2#: Great choice! It has such a fragrant aroma.Just make sure to rinse it thoroughly before cooking.


Summary: 
-------------------------

Human : #C2# tells #C1# how to cook rice perfectly
-------------------------
Model Output : You can use a few different types of rice.


In [42]:
## ONE SHOT PROMPTING

dialogue21 = dataset[21]['dialogue']
summary21 = dataset[21]['summary']

prompt = f"""
Summarize the following conversation:
{dialogue21}
Summary:
{summary21}

Summarize the following conversation:
{dialogue}
Summary:
"""
inputs = tokenizer(prompt, return_tensors='pt')
#inputs = inputs['input_ids'].to("cuda")
inputs = inputs['input_ids']
model_summ = peft_model.generate(input_ids=inputs,max_new_tokens=20,temperature=0.5)
output = tokenizer.decode(model_summ[0])

print("\nInput Prompt:",prompt)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation:
#C1#: Should we stay in a lodge within the jungle or a hotel arby?
#C2#: A jungle lodge for sure. Its part of the experience, waking up to the sounds of the wild. Hotels cant compete with that.

Summary:
The characters discuss accommodation preferences for the jungle safari, favoring the immersive experience of staying in a jungle lodge.

Summarize the following conversation:
#C1#:Hey, do you know the secret to perfectly cooked rice?
#C2#:Oh, absolutely! Its all about the water-to-rice ratio. I go for two cups of water for every cup of rice.
#C1#: Really? Ive always used one and a half cups.
#C2#: Well, that works too. Its about finding what works best for your rice and your taste.
#C1#: True. And how about the type of rice? Long grain or short grain?
#C2#: It depends on the dish. Long grain for pilaf, short grain for sushi. What are you cooking?
#C1#: Im thinking of a stir-fry tonight. Maybe Ill go with jasmine rice.
#C2#: Great ch

In [43]:
# FEW SHOTS

prompt = f"""
Summarize the following conversation:
{dataset[25]['dialogue']}
Summary:
{dataset[25]['summary']}

Summarize the following conversation:
{dataset[27]['dialogue']}
Summary:
{dataset[27]['summary']}

Summarize the following conversation:
{dataset[30]['dialogue']}
Summary:
{dataset[30]['summary']}


Summarize the following conversation:
{dialogue}
"""
inputs = tokenizer(prompt, return_tensors='pt')
#inputs = inputs['input_ids'].to("cuda")
inputs = inputs['input_ids']
model_summ = peft_model.generate(input_ids=inputs,max_new_tokens=30,temperature=0.7)
output = tokenizer.decode(model_summ[0])

print("\nInput Prompt:",prompt)
print("-------------------------")
print("\nHuman :",summary)
print("-------------------------")
print("Model Output :",output)


Input Prompt: 
Summarize the following conversation:
#C1#: Any ideas for the theme of our fashion show this year?
#C2#: How about a retro-inspired theme? Vintage fashion is making a comeback, and it allows for creative interpretations.

Summary:
The characters discuss potential themes for the fashion show, considering a retro-inspired theme for its versatility and current trendiness.

Summarize the following conversation:
#C1#: The dress rehearsal is tomorrow. Any last-minute concerns?
#C2#: Make sure the models are comfortable in their outfits and that the runway is well-lit. Small details can make a big difference.

Summary:
The characters discuss preparations for the dress rehearsal, focusing on model comfort and the importance of proper lighting on the runway.

Summarize the following conversation:
#C1#: We need sponsors for the fashion show. Any progress?
#C2#: Ive been in talks with a few local businesses. Theyre interested, especially if we can offer visibility and exclusive pa

SO FINE TUNING WITH LORA AND WITH FEW SHOTS PROMPTING, GAVE THE BEST RESULT!!

In [None]:
# rouge = evaluate.load('rouge')

# peft_model_results = rouge.compute(
#     predictions=summary,
#     references=output,
#     use_aggregator=True,
#     use_stemmer=True,
# )