In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet
%pip install evaluate
%pip install rouge_score
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0  --quiet

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, Trainer, TrainingArguments
import torch
import time
import pandas as pd
import numpy as np
import evaluate

In [2]:
# !pip install --upgrade datasets
# !pip install awscli
# !pip install peft

In [2]:
huggingface_dataset_name = 'knkarthick/dialogsum'
dataset = load_dataset(huggingface_dataset_name)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
model_name = 'google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [4]:
def print_number_of_trainable_model_parameters(model):
  all_model_params = 0
  trainable_model_params = 0

  for _, param in model.named_parameters():
    all_model_params += param.numel()
    if param.requires_grad:
      trainable_model_params += param.numel()

  return f"trainable model parameters : {trainable_model_params}\nall model parameter : {all_model_params}\npercentage of trainable model parameters : {trainable_model_params/all_model_params*100 :.2f}%"


print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters : 247577856
all model parameter : 247577856
percentage of trainable model parameters : 100.00%


In [5]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation :

{dialogue}

Summary :
"""

inputs = tokenizer(prompt, return_tensors = 'pt')
output = tokenizer.decode(
                      original_model.generate(inputs['input_ids'],
                                              max_new_tokens = 200)[0],
                      skip_special_tokens = True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation :

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary :

-----------------------------------------------------------------

In [6]:
def tokenize_function(example):
  start_prompt = 'Summarize the following conversation. \n\n'
  end_prompt = '\n\nSummary: '
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, padding = 'max_length', truncation = True ,return_tensors = 'pt').input_ids
  example['labels'] = tokenizer(example['summary'], padding = 'max_length', truncation = True ,return_tensors = 'pt').input_ids

  return example

tokenized_datasets = dataset.map(tokenize_function, batched = True)
tokenized_datasets = tokenized_datasets.remove_columns(['id','topic','dialogue','summary'])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
tokenized_datasets = tokenized_datasets.filter(lambda example,index : index%100 == 0, with_indices = True)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [9]:
# output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     learning_rate=1e-5,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_steps=1,
#     max_steps=1
# )

# trainer = Trainer(
#     model=original_model,
#     args=training_args,
#     train_dataset=tokenized_datasets['train'],
#     eval_dataset=tokenized_datasets['validation']
# )

In [10]:
# trainer.train()


In [11]:
!aws configure

AWS Access Key ID [None]: AKIAS2VS4JARWX4ES65B
AWS Secret Access Key [None]: 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
Default region name [None]: us-east-2
Default output format [None]: json


AWS Access Key ID [None]: AKIAS2VS4JARWX4ES65B
AWS Secret Access Key [None]: 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
Default region name [None]: us-east-2
Default output format [None]: json


AWS Access Key ID [None]: AKIAS2VS4JARWX4ES65B
AWS Secret Access Key [None]: 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
Default region name [None]: us-east-2
Default output format [None]: json

In [12]:
!aws s3 cp --recursive s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/ ./flan-dialogue-summary-checkpoint/

Completed 627 Bytes/2.8 GiB (291 Bytes/s) with 8 file(s) remainingdownload: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/scheduler.pt to flan-dialogue-summary-checkpoint/scheduler.pt
Completed 627 Bytes/2.8 GiB (291 Bytes/s) with 7 file(s) remainingCompleted 769 Bytes/2.8 GiB (355 Bytes/s) with 7 file(s) remainingdownload: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/generation_config.json to flan-dialogue-summary-checkpoint/generation_config.json
download: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/config.json to flan-dialogue-summary-checkpoint/config.json
download: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/trainer_state.json to flan-dialogue-summary-checkpoint/trainer_state.json
download: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/training_args.bin to flan-dialogue-summary-checkpoint/training_args.bin
download: s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/rng_s

In [13]:
!ls -alh ./flan-dialogue-summary-checkpoint/pytorch_model.bin

-rw-r--r-- 1 root root 945M May 15  2023 ./flan-dialogue-summary-checkpoint/pytorch_model.bin


In [14]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)

In [23]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation :

{dialogue}

Summary :
"""

generate_config = GenerationConfig(max_new_tokens = 200, num_beams = 1)
input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids

original_model_output = tokenizer.decode(
                      original_model.generate(input_ids,
                                              generation_config = generate_config)[0],
                      skip_special_tokens = True
)

instruct_model_output = tokenizer.decode(
                      instruct_model.generate(input_ids,
                                              generation_config = generate_config)[0],
                      skip_special_tokens = True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_output}\n')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_output}\n')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation :

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary :

-----------------------------------------------------------------

In [15]:
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
dialogues = dataset['test'][8:10]['dialogue']
summaries = dataset['test'][8:10]['summary']
generate_config = GenerationConfig(max_new_tokens = 200, num_beams = 1)

original_model_summaries = []
instruct_model_summaries = []

for _,dialogue in enumerate(dialogues):
  prompt = f"""
  Summarize the following conversation :

  {dialogue}

  Summary :
  """

  input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids
  original_model_output = tokenizer.decode(
                        original_model.generate(input_ids,
                                                generation_config = generate_config)[0],
                        skip_special_tokens = True
  )

  instruct_model_output = tokenizer.decode(
                        instruct_model.generate(input_ids,
                                                generation_config = generate_config)[0],
                        skip_special_tokens = True
  )

  original_model_summaries.append(original_model_output)
  instruct_model_summaries.append(instruct_model_output)

zipped_summaries = list(zip(summaries,original_model_summaries,instruct_model_summaries))
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries','original_model_summaries','instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can'...
1,#Person1# and Brian are at the birthday party ...,Brian's birthday is coming up.,Brian's birthday is coming. #Person1# invites ...


In [17]:
original_model_scores = rouge.compute(
    predictions = original_model_summaries,
    references = summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

instruct_model_scores = rouge.compute(
    predictions = instruct_model_summaries,
    references = summaries[0:len(instruct_model_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

print(f'ORIGINAL MODEL RESULTS :\n{original_model_scores}')
print(f'INSTRUCT MODEL RESULTS :\n{instruct_model_scores}')

ORIGINAL MODEL RESULTS :
{'rouge1': 0.3101851851851852, 'rouge2': 0.08000000000000002, 'rougeL': 0.27314814814814814, 'rougeLsum': 0.27314814814814814}
INSTRUCT MODEL RESULTS :
{'rouge1': 0.43788041532402433, 'rouge2': 0.18220502901353966, 'rougeL': 0.288936627282492, 'rougeLsum': 0.288936627282492}


In [18]:
import pandas as pd
results = pd.read_csv("https://docs.google.com/spreadsheets/d/1mkRUGT5N4WFFOvysfGgCXo2ZYAZe8qVWWQIEtaCU5DQ/export?format=csv")
results.head()

Unnamed: 0.1,Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,0,Ms. Dawson helps #Person1# to write a memo to ...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
1,1,In order to prevent employees from wasting tim...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
2,2,Ms. Dawson takes a dictation for #Person1# abo...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
3,3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic again. #Person1...,#Person2# got stuck in traffic and got stuck i...
4,4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic again. #Person1...,#Person2# got stuck in traffic and got stuck i...


In [19]:
results.shape

(1500, 5)

In [20]:
summaries = results.human_baseline_summaries.values
original_model_summaries = results.original_model_summaries.values
instruct_model_summaries = results.instruct_model_summaries.values


original_model_scores = rouge.compute(
    predictions = original_model_summaries,
    references = summaries[0:len(original_model_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

instruct_model_scores = rouge.compute(
    predictions = instruct_model_summaries,
    references = summaries[0:len(instruct_model_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

print(f'ORIGINAL MODEL RESULTS :\n{original_model_scores}')
print(f'INSTRUCT MODEL RESULTS :\n{instruct_model_scores}')

ORIGINAL MODEL RESULTS :
{'rouge1': 0.23322885043181377, 'rouge2': 0.07600361713497218, 'rougeL': 0.20114846015422058, 'rougeLsum': 0.2015983575552877}
INSTRUCT MODEL RESULTS :
{'rouge1': 0.4212788823794692, 'rouge2': 0.18008655721611452, 'rougeL': 0.33820027705838673, 'rougeLsum': 0.3383688277552966}


In [21]:
improvement = ((np.array(list(instruct_model_scores.values())))-(np.array(list(original_model_scores.values()))))

for key,value in zip(instruct_model_scores.keys(),improvement):
  print(f'{key} : {value*100:.2f}%')

rouge1 : 18.81%
rouge2 : 10.41%
rougeL : 13.71%
rougeLsum : 13.68%


In [27]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM
)

In [28]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters : 3538944
all model parameter : 251116800
percentage of trainable model parameters : 1.41%


In [29]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    auto_find_batch_size = True,
    learning_rate = 1e-3,
    num_train_epochs = 1,
    logging_steps = 1,
    max_steps = 1
)

peft_trainer = Trainer(model = peft_model,
                       args = peft_training_args,
                       train_dataset = tokenized_datasets["train"])

In [30]:
%%time
peft_trainer.train()

peft_model_path = "./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



KeyboardInterrupt: 

In [32]:
!aws s3 cp --recursive s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/ ./peft-dialogue-summary-checkpoint-from-s3/

Completed 334 Bytes/15.9 MiB (370 Bytes/s) with 5 file(s) remainingdownload: s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/adapter_config.json to peft-dialogue-summary-checkpoint-from-s3/adapter_config.json
Completed 334 Bytes/15.9 MiB (370 Bytes/s) with 4 file(s) remainingCompleted 2.5 KiB/15.9 MiB (2.6 KiB/s) with 4 file(s) remaining    download: s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/special_tokens_map.json to peft-dialogue-summary-checkpoint-from-s3/special_tokens_map.json
Completed 2.5 KiB/15.9 MiB (2.6 KiB/s) with 3 file(s) remainingCompleted 4.9 KiB/15.9 MiB (5.1 KiB/s) with 3 file(s) remainingdownload: s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/tokenizer_config.json to peft-dialogue-summary-checkpoint-from-s3/tokenizer_config.json
download: s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/tokenizer.json to peft-dialogue-summary-checkpoint-from-s3/tokenizer.json
download: s3://dlai-generative-ai/

In [33]:
!ls -al ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin

-rw-r--r-- 1 root root 14208525 May 15  2023 ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin


In [34]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './peft-dialogue-summary-checkpoint-from-s3/',
                                       torch_dtype = torch.bfloat16,
                                       is_trainable = False)

In [35]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters : 0
all model parameter : 251116800
percentage of trainable model parameters : 0.00%


In [39]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation :

{dialogue}

Summary :
"""

generate_config = GenerationConfig(max_new_tokens = 200, num_beams = 1)
input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids

original_model_output = tokenizer.decode(
                      original_model.generate(input_ids,
                                              generation_config = generate_config)[0],
                      skip_special_tokens = True
)

instruct_model_output = tokenizer.decode(
                      instruct_model.generate(input_ids,
                                              generation_config = generate_config)[0],
                      skip_special_tokens = True
)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_output}\n')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_output}\n')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_text_output}\n')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation :

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary :

-----------------------------------------------------------------