In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score

In [2]:
from datasets import load_dataset
import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    EarlyStoppingCallback,
    DataCollatorForSeq2Seq
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
import os
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import random
import evaluate

In [3]:
os.environ['WANDB_DISABLED']="true"

In [4]:
dataset = load_dataset("microsoft/ms_marco", "v2.1", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
dataset = dataset.select(range(200))

In [6]:
def clean_text(text):
    return text.strip().lower()

In [7]:
dataset = dataset.map(lambda example: {
    'query': clean_text(example['query']),
    'answers': [clean_text(ans) for ans in example['answers']]
}, remove_columns=dataset.column_names)

In [8]:
def compute_query_difficulty(example):
  query = example['query']

  word_count = len(query.split())
  punctuation_count = sum(1 for c in query if c in [',', '.', '?', '!', ':', ';'])

  length_score = len(query)

  difficulty = word_count + punctuation_count + (length_score / 50)

  example['difficulty'] = difficulty

  return example

In [9]:
dataset = dataset.map(compute_query_difficulty)

In [10]:
dataset = dataset.sort("difficulty")

In [11]:
train_dataset = dataset.select(range(80))

In [12]:
eval_dataset = dataset.select(range(80, 100))

In [13]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )


In [14]:
model_name='google/flan-t5-large'

device_map = {"": 0}

In [15]:
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
  )

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False
  )

tokenizer.pad_token = tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [17]:
def gen(model, prompt, length):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=length, do_sample=True, top_p=0.95, temperature=0.8)

    return [tokenizer.decode(outputs[0], skip_special_tokens=True)]

In [18]:
%%time

index = 2

prompt = dataset[index]['query']
summary = dataset[index]['answers'][0]

formatted_prompt = f"Instruct: Refine this user search query.\n{prompt}"

res = gen(original_model,formatted_prompt,100,)

output = res[0]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Refine this user search query.
define traumatic
---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
it is extremely upsetting or something that causes great harm or damage to the mind or body.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
<pad> definition of traumatic
CPU times: user 733 ms, sys: 200 ms, total: 933 ms
Wall time: 1.82 s


In [19]:
def create_prompt_formats(sample):
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    prompt_templates = [
      "### Instruct: Refine this user search query:",
      "### Task: Improve the clarity of the following search query:",
      "### Instruction: Fix the grammar and phrasing of this e-commerce search input:",
      "### Command: Clean up this product search term:",
      "### Request: Make this user query more natural and readable:",
      "### Action: Rephrase this customer search for better understanding:",
    ]

    instruction = random.choice(prompt_templates) + f"\n{sample['query'].strip()}"
    target = sample['answers'][0].strip()

    return {
        "input": f"{INTRO_BLURB}\n\n{instruction}",
        "target": f"{target}"
    }

In [20]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [21]:
def preprocess_batch(batch, tokenizer, max_length):
    model_inputs = tokenizer(
        batch["input"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_attention_mask=True
    )

    labels = tokenizer(
        batch["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [22]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
    )

    dataset = dataset.remove_columns(['answers', 'query', 'difficulty', "input", "target"])

    return dataset

In [23]:
max_length = get_max_length(original_model)

Found max lenth: 512


In [24]:
train_dataset = preprocess_dataset(tokenizer, max_length, train_dataset)

Preprocessing dataset...


In [25]:
eval_dataset = preprocess_dataset(tokenizer, max_length, eval_dataset)

Preprocessing dataset...


In [26]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

In [27]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})

In [28]:
print(tokenizer.decode(train_dataset[0]['input_ids'], skip_special_tokens=True))

Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Fix the grammar and phrasing of this e-commerce search input: depona ab


In [29]:
original_model = prepare_model_for_kbit_training(original_model)

In [30]:
config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    bias="none",
    lora_dropout=0.05,
    task_type="SEQ_2_SEQ_LM",
)

In [31]:
original_model.gradient_checkpointing_enable()

In [32]:
peft_model = get_peft_model(original_model, config)

In [33]:
def print_number_of_trainable_model_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params}")
    print(f"All params: {all_params}")
    print(f"Trainable%: {100 * trainable_params / all_params:.2f}%")


In [34]:
print_number_of_trainable_model_parameters(peft_model)

Trainable params: 9437184
All params: 503180288
Trainable%: 1.88%


In [35]:
output_dir = f'./peft-flan-t5-training-{str(int(time.time()))}'

In [36]:
peft_model.config.use_cache = False

In [37]:
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    max_steps=5,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=5,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=5,
    eval_strategy="steps",
    eval_steps=5,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
    load_best_model_at_end=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    fp16=True
)

In [38]:
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0
    )

In [39]:
data_collator=DataCollatorForSeq2Seq(
    tokenizer,
    model=peft_model,
    padding="longest",
    pad_to_multiple_of=8
)

In [40]:
print("Sample processed batch:")
sample_batch = data_collator([train_dataset[0], train_dataset[1]])
print("Input shapes:", sample_batch["input_ids"].shape)
print("Label shapes:", sample_batch["labels"].shape)

Sample processed batch:
Input shapes: torch.Size([2, 512])
Label shapes: torch.Size([2, 512])


In [41]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

In [42]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    print(type(preds), type(preds[0]))
    print(preds[0])

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_result["bleu"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"]
    }


In [43]:
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=data_collator,
    # callbacks=[early_stopping],
    # compute_metrics=compute_metrics
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [44]:
peft_trainer.train()

Step,Training Loss,Validation Loss
5,0.0,


TrainOutput(global_step=5, training_loss=0.0, metrics={'train_runtime': 54.0214, 'train_samples_per_second': 1.481, 'train_steps_per_second': 0.093, 'total_flos': 186700718407680.0, 'train_loss': 0.0, 'epoch': 1.0})