<a href="https://colab.research.google.com/github/mertcan-basut/nlp/blob/main/generative_ai_and_llms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install \
  datasets \
  accelerate -U \
  evaluate \
  rouge_score \
  peft \
  loralib \
  git+https://github.com/lvwerra/trl.git@25fa1bd --quiet

[0m  Preparing metadata (setup.py) ... [?25l[?25hdone


In [18]:
import time

import pandas as pd
import numpy as np

import datasets
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, GenerationConfig, TrainingArguments, Trainer, pipeline
import evaluate
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, create_reference_model

In [None]:
data = datasets.load_dataset("knkarthick/dialogsum")

In [None]:
example_indices = [40, 200]
dash_line = '-'.join('' for x in range(100))

for index in example_indices:
  sample = data['test'][index]

  print(dash_line)
  print('Example ' + str(sample['id']))
  print(dash_line)

  print('INPUT DIALOGUE:')
  print(sample['dialogue'])
  print(dash_line)

  print('BASELINE HUMAN SUMMARY:')
  print(sample['summary'])
  print(dash_line)
  print()

---------------------------------------------------------------------------------------------------
Example test_13_2
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", use_fast=True)

In [None]:
# in-context learning

def zero_shot_inference(sample):
  prompt = f"""
Dialogue:

{sample['dialogue']}

What was going on?
"""
  return prompt

def one_shot_inference(sample_target, sample_example):
  prompt = f"""
Dialogue:

{sample_example['dialogue']}

What was going on?
{sample_example['summary']}



Dialogue:

{sample_target['dialogue']}

What was going on?
"""
  return prompt

def few_shot_inference(sample_target, samples_example):
  prompt = ""
  for sample_example in samples_example:
    prompt += f"""
Dialogue:

{sample_example['dialogue']}

What was going on?
{sample_example['summary']}



"""
  prompt += f"""
Dialogue:

{sample_target['dialogue']}

What was going on?
"""
  return prompt

In [None]:
target_indices = [200]
example_indices = [40, 80, 120]
dash_line = '-'.join('' for x in range(100))

generation_config = GenerationConfig(
    max_new_tokens=200,
    do_sample=True,
    temperature=1.0,
    num_beams=1
)

for index in target_indices:
  sample = data['test'][index]
  prompt = few_shot_inference(sample, data['test'].select(example_indices))
  summary = sample['summary']

  inputs = tokenizer(prompt, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(inputs['input_ids'], generation_config=generation_config)[0],
      skip_special_tokens=True
  )

  print(dash_line)
  print('Example ' + str(sample['id']))
  print(dash_line)

  print(f'INPUT PROMPT:\n{prompt}')
  print(dash_line)

  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(dash_line)

  print(f'MODEL GENERATION:\n{output}\n')

Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
Example test_66_3
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.




Dialogue:

#Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No,

In [None]:
# instruction prompts
def tokenize_function(batch):
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompts = [start_prompt + dialogue + end_prompt for dialogue in batch['dialogue']]

  batch['input_ids'] = tokenizer(prompts, padding='max_length', truncation=True, return_tensors='pt')['input_ids']
  batch['labels'] = tokenizer(batch['summary'], padding='max_length', truncation=True, return_tensors='pt')['input_ids']

  return batch

tokenized_data = data.map(tokenize_function, batched=True)
tokenized_data = tokenized_data.remove_columns(['id', 'topic', 'dialogue', 'summary'])

tokenized_data = tokenized_data.filter(lambda sample, index: index % 100 == 0, with_indices=True) # subsample the dataset

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# full fine-tuning
output_dir = f"./dialogue-summary-training-{str(int(time.time()))}"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation']
)

trainer.train()

In [None]:
# parameter efficient fine-tuning (PEFT) / prompt tuning

lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = get_peft_model(model, lora_config)

output_dir = f"./peft-dialogue-summary-training-{str(int(time.time()))}"

training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_data["train"]
)

trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
peft_model = PeftModel.from_pretrained(
    model,
    peft_model_path,
    torch_dtype=torch.bfloat16,
    is_trainable=False
)

In [None]:
index = 200
sample = data['test'][index]

prompt = f"""
Summarize the following conversation.

{sample['dialogue']}

Summary:
"""

inputs = tokenizer(prompt, return_tensors="pt")
output = tokenizer.decode(
    model.generate(inputs['input_ids'], generation_config=generation_config)[0],
    skip_special_tokens=True
)

print(dash_line)
print(f"BASELINE HUMAN SUMMARY:\n{sample['summary']}")
print(dash_line)
print(f"INSTRUCT MODEL SUMMARY:\n{output}")

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL SUMMARY:
Person1: I've discussed what kind of hardware you want to upgrade to.


In [None]:
data_subset = data['test'][:10]

dialogues = data_subset['dialogue']
summaries = data_subset['summary']

model_summaries = []

for dialogue in dialogues:
  prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
  inputs = tokenizer(prompt, return_tensors="pt")
  output = tokenizer.decode(
      model.generate(inputs['input_ids'], generation_config=generation_config)[0],
      skip_special_tokens=True
  )
  model_summaries.append(output)

df = pd.DataFrame(data={'human-baseline-summary': summaries, 'model-summary': model_summaries})
df.head()

In [None]:
rouge = evaluate.load('rouge')

In [None]:
model_results = rouge.compute(
    predictions=df['model-summary'].values,
    references=df['human-baseline-summary'].values,
    use_aggregator=True,
    use_stemmer=True
)

print(model_results)

{'rouge1': 0.20335338881391513, 'rouge2': 0.09349766573295985, 'rougeL': 0.18996682411156096, 'rougeLsum': 0.18653752436647172}


In [20]:
def build_dataset(model_name, dataset_name, input_min_text_length, input_max_text_length):
  # load dataset (only "train" part)
  dataset = datasets.load_dataset(dataset_name, split="train")

  # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
  dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

  # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
  tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

  def tokenize(sample):
    # Wrap each dialogue with the instruction.
    prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
    sample["input_ids"] = tokenizer.encode(prompt)
    # This must be called "query", which is a requirement of our PPO library.
    sample["query"] = tokenizer.decode(sample["input_ids"])
    return sample

  # Tokenize each dialogue.
  dataset = dataset.map(tokenize, batched=False)
  dataset.set_format(type="torch")

  # Split the dataset into train and test parts.
  dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

  return dataset_splits

dataset = build_dataset(
    model_name="google/flan-t5-base",
    dataset_name="knkarthick/dialogsum",
    input_min_text_length=200,
    input_max_text_length=1000
)

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

In [24]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", device_map="auto")

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = PeftModel.from_pretrained(
    model,
    "./peft-dialogue-summary-checkpoint-local",
    lora_config=lora_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    is_trainable=True
)

ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    peft_model,
    torch_dtype=torch.bfloat16,
    is_trainable=True
)

ref_model = create_reference_model(ppo_model)

In [12]:
toxicity_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target", device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target", device_map="auto")
print(toxicity_model.config.id2label)

text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."
input_ids = toxicity_tokenizer(text, return_tensors="pt").input_ids
logits = toxicity_model(input_ids=input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')
nothate_index = 0
nothate_reward = (logits[:, nothate_index]).tolist()
print(f'reward: {nothate_reward}')

{0: 'nothate', 1: 'hate'}
logits [not hate, hate]: [-0.6921183466911316, 0.37227246165275574]
probabilities [not hate, hate]: [0.2564712464809418, 0.7435287237167358]
reward: [-0.6921183466911316]


In [15]:
device = 0 if torch.cuda.is_available() else "cpu"

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="facebook/roberta-hate-speech-dynabench-r4-target",
    device=device
)

reward_logits_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "softmax", # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

print( sentiment_pipe(text, **reward_logits_kwargs) )
print( sentiment_pipe(text, **reward_probabilities_kwargs) )

[{'label': 'hate', 'score': 0.37227246165275574}, {'label': 'nothate', 'score': -0.6921183466911316}]
[{'label': 'hate', 'score': 0.7435287237167358}, {'label': 'nothate', 'score': 0.25647127628326416}]


In [34]:
toxicity_evaluator = evaluate.load(
    "toxicity",
    "facebook/roberta-hate-speech-dynabench-r4-target",
    module_type="measurement",
    toxic_label="hate"
)

toxicity_score = toxicity_evaluator.compute(predictions=[text])
print(toxicity_score)

def evaluate_toxicity(model, toxicity_evaluator, tokenizer, dataset, num_samples):
  toxicities = []
  generation_config = GenerationConfig(
      max_new_tokens=100,
      top_k=0.0,
      top_p=1.0,
      do_sample=True
  )

  for i, sample in enumerate(dataset):
    if i > num_samples: break

    input_text = sample['query']
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids

    response_token_ids = model.generate(input_ids=input_ids, generation_config=generation_config)
    generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

    toxicity_score = toxicity_evaluator.compute(predictions=[input_text + generated_text])['toxicity']
    toxicities.extend(toxicity_score)

  mean = np.mean(toxicities)
  std = np.std(toxicities)
  return mean, std

mean, std = evaluate_toxicity(
    model=ref_model,
    toxicity_evaluator=toxicity_evaluator,
    tokenizer=tokenizer,
    dataset=dataset["test"],
    num_samples=10
)
print(f'toxicity [mean, std] before detox: [{mean}, {std}]')

{'toxicity': [0.7435287237167358]}
toxicity [mean, std] before detox: [0.020044648945754903, 0.021663029920102436]
