<a href="https://colab.research.google.com/github/mertcan-basut/nlp/blob/main/generative_ai_and_llms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install \
  datasets \
  accelerate -U \
  evaluate \
  rouge_score \
  peft \
  loralib --quiet

import time

import datasets
import evaluate
import pandas as pd
import torch
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer

In [None]:
# Load the DialogSum dataset; later cells read its 'train', 'validation' and 'test' splits.
data = datasets.load_dataset(path="knkarthick/dialogsum")

In [None]:
# Preview a couple of test dialogues next to their human-written reference summaries.
example_indices = [40, 200]
dash_line = '-' * 99  # 99 dashes: what joining 100 empty strings with '-' yields

for index in example_indices:
  sample = data['test'][index]

  print(dash_line)
  print(f"Example {sample['id']}")
  print(dash_line)

  print('INPUT DIALOGUE:', sample['dialogue'], dash_line, sep='\n')

  print('BASELINE HUMAN SUMMARY:', sample['summary'], dash_line, sep='\n')
  print()

---------------------------------------------------------------------------------------------------
Example test_13_2
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------

In [None]:
# The model and its tokenizer are loaded from the same checkpoint.
checkpoint = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)

In [None]:
# in-context learning

def zero_shot_inference(sample):
  """Build a zero-shot prompt for one sample.

  Args:
    sample: mapping with a 'dialogue' entry.

  Returns:
    The dialogue framed by a "Dialogue:" header and the question
    "What was going on?", with no worked example in front of it.
  """
  header = "\nDialogue:\n\n"
  question = "\n\nWhat was going on?\n"
  return f"{header}{sample['dialogue']}{question}"

def one_shot_inference(sample_target, sample_example):
  """Build a one-shot prompt: one solved example followed by the target dialogue.

  Args:
    sample_target: mapping with a 'dialogue' entry; its summary is left open.
    sample_example: mapping with 'dialogue' and 'summary' entries, shown as
      the worked example.

  Returns:
    The example dialogue with its summary, three blank lines, then the
    target dialogue ending on the open question.
  """
  question = "What was going on?"
  solved = (
      f"\nDialogue:\n\n{sample_example['dialogue']}\n\n"
      f"{question}\n{sample_example['summary']}\n"
  )
  pending = f"Dialogue:\n\n{sample_target['dialogue']}\n\n{question}\n"
  # Three blank lines separate the solved example from the target.
  return solved + "\n\n\n" + pending

def few_shot_inference(sample_target, samples_example):
  """Build a few-shot prompt: every solved example, then the target dialogue.

  Args:
    sample_target: mapping with a 'dialogue' entry; its summary is left open.
    samples_example: iterable of mappings with 'dialogue' and 'summary'
      entries, rendered in iteration order.

  Returns:
    The concatenated solved examples followed by the target dialogue ending
    on the open question. With no examples this is just the target part.
  """
  solved_examples = "".join(
      f"\nDialogue:\n\n{shot['dialogue']}\n\nWhat was going on?\n{shot['summary']}\n\n\n\n"
      for shot in samples_example
  )
  target_part = f"\nDialogue:\n\n{sample_target['dialogue']}\n\nWhat was going on?\n"
  return solved_examples + target_part

In [None]:
# Few-shot generation: three solved test dialogues are prepended to each target dialogue.
target_indices = [200]
example_indices = [40, 80, 120]
dash_line = '-' * 99  # 99-dash separator

# Sampling with a single beam at temperature 1.0, capped at 200 new tokens.
# Later cells reuse this same configuration object.
generation_config = GenerationConfig(
    max_new_tokens=200,
    do_sample=True,
    temperature=1.0,
    num_beams=1
)

for index in target_indices:
  sample = data['test'][index]
  summary = sample['summary']
  prompt = few_shot_inference(sample, data['test'].select(example_indices))

  inputs = tokenizer(prompt, return_tensors='pt')
  generated_ids = model.generate(inputs['input_ids'], generation_config=generation_config)
  output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

  print(dash_line)
  print(f"Example {sample['id']}")
  print(dash_line)

  print('INPUT PROMPT:')
  print(prompt)
  print(dash_line)

  print('BASELINE HUMAN SUMMARY:')
  print(summary)
  print(dash_line)

  print('MODEL GENERATION:')
  print(output)
  print()

Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
Example test_66_3
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.




Dialogue:

#Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No,

In [None]:
# instruction prompts
def tokenize_function(batch):
  """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

  Each dialogue is wrapped in the instruction prompt
  "Summarize the following conversation.\\n\\n<dialogue>\\n\\nSummary: ".
  Prompts and summaries are padded/truncated to the tokenizer's maximum length.

  Args:
    batch: mapping with parallel 'dialogue' and 'summary' lists (a batched
      `datasets.map` input).

  Returns:
    The same mapping with three added columns:
      * 'input_ids': token ids of the prompts.
      * 'attention_mask': 1 for real prompt tokens, 0 for padding, so the
        model does not attend to padding.
      * 'labels': token ids of the summaries, with every padded position
        replaced by -100 so the loss ignores padding.
  """
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompts = [start_prompt + dialogue + end_prompt for dialogue in batch['dialogue']]

  # Plain Python lists are enough here: `map` stores the columns as lists anyway.
  encoded_prompts = tokenizer(prompts, padding='max_length', truncation=True)
  encoded_summaries = tokenizer(batch['summary'], padding='max_length', truncation=True)

  batch['input_ids'] = encoded_prompts['input_ids']
  batch['attention_mask'] = encoded_prompts['attention_mask']

  # Use the tokenizer's own mask (not a comparison against the pad id) to find
  # padded label positions, then mark them with the ignore index -100.
  batch['labels'] = [
      [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
      for ids, mask in zip(encoded_summaries['input_ids'], encoded_summaries['attention_mask'])
  ]

  return batch

# Subsample first (keep every 100th example of each split) so that only the
# retained rows are tokenized. Every row is padded to the same fixed length, so
# tokenizing before or after the filter yields the same rows.
subsampled_data = data.filter(lambda sample, index: index % 100 == 0, with_indices=True)

# Tokenize the kept rows, then drop the raw columns that are no longer needed.
tokenized_data = subsampled_data.map(tokenize_function, batched=True)
tokenized_data = tokenized_data.remove_columns(['id', 'topic', 'dialogue', 'summary'])

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Full fine-tuning: every weight of `model` is updated.
run_stamp = int(time.time())  # timestamp gives each run its own output directory
output_dir = f"./dialogue-summary-training-{run_stamp}"

# max_steps=1 limits this to a single optimizer step (it takes precedence over
# num_train_epochs), so the cell is a quick demonstration rather than a real run.
training_args = TrainingArguments(
    output_dir=output_dir,
    max_steps=1,
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=1,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
)

trainer.train()

In [None]:
# Parameter-efficient fine-tuning (PEFT) with LoRA adapters.
# Note: this is low-rank adaptation, not prompt tuning.

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],  # adapters are attached to modules with these names
    bias="none",
)

peft_model = get_peft_model(model, lora_config)

run_stamp = int(time.time())  # timestamp gives each run its own output directory
output_dir = f"./peft-dialogue-summary-training-{run_stamp}"

# As in the full fine-tuning cell, max_steps=1 keeps this to one optimizer step.
training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    max_steps=1,
    num_train_epochs=1,
    learning_rate=1e-3,  # higher learning rate than full fine-tuning
    logging_steps=1,
)

trainer = Trainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_data["train"],
)

trainer.train()

# Save the trained model and the tokenizer side by side; the next cell reloads
# from this path.
peft_model_path = "./peft-dialogue-summary-checkpoint-local"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Reload the adapter saved at `peft_model_path` on top of `model`.
# is_trainable=False loads it for inference only.
# `PeftModel` must be imported from `peft` alongside the other PEFT names;
# without that import this cell raises NameError on a fresh kernel.
peft_model = PeftModel.from_pretrained(
    model,
    peft_model_path,
    torch_dtype=torch.bfloat16,
    is_trainable=False,
)

In [None]:
# Summarize one test dialogue with the instruction prompt and compare it to the human summary.
index = 200
sample = data['test'][index]

prompt = (
    "\nSummarize the following conversation.\n\n"
    f"{sample['dialogue']}"
    "\n\nSummary:\n"
)

inputs = tokenizer(prompt, return_tensors="pt")
generated_ids = model.generate(inputs['input_ids'], generation_config=generation_config)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print('BASELINE HUMAN SUMMARY:')
print(sample['summary'])
print(dash_line)
print('INSTRUCT MODEL SUMMARY:')
print(output)

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL SUMMARY:
Person1: I've discussed what kind of hardware you want to upgrade to.


In [None]:
# Generate summaries for the first ten test dialogues and tabulate them next to
# the human references.
data_subset = data['test'][:10]
dialogues, summaries = data_subset['dialogue'], data_subset['summary']

model_summaries = []

for dialogue in dialogues:
  prompt = f"\nSummarize the following conversation.\n\n{dialogue}\n\nSummary: "
  inputs = tokenizer(prompt, return_tensors="pt")
  generated_ids = model.generate(inputs['input_ids'], generation_config=generation_config)
  output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  model_summaries.append(output)

df = pd.DataFrame({'human-baseline-summary': summaries, 'model-summary': model_summaries})
df.head()

In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load(path='rouge')

In [None]:
# Score the model summaries against the human references: stemmed tokens,
# aggregated over all rows of `df`.
model_results = rouge.compute(
    references=df['human-baseline-summary'].to_numpy(),
    predictions=df['model-summary'].to_numpy(),
    use_stemmer=True,
    use_aggregator=True,
)

print(model_results)

{'rouge1': 0.20335338881391513, 'rouge2': 0.09349766573295985, 'rougeL': 0.18996682411156096, 'rougeLsum': 0.18653752436647172}
