# Install Required Packages

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m

# Import Packages

In [3]:
from datasets import load_dataset
import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
import os
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import random
import nltk
import evaluate

# Define Environment Variables

In [4]:
os.environ['WANDB_DISABLED']="true"

# Load Dataset

In [5]:
dataset = load_dataset("microsoft/ms_marco", "v2.1", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/101093 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/808731 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/101092 [00:00<?, ? examples/s]

In [6]:
dataset = dataset.select(range(60))

# Preprocess Queries String

In [7]:
def clean_text(text):
    return text.strip().lower()

In [8]:
dataset = dataset.map(lambda example: {
    'query': clean_text(example['query']),
    'answers': [clean_text(ans) for ans in example['answers']]
}, remove_columns=dataset.column_names)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

# Ranking With Query Complexity

In [9]:
def compute_query_difficulty(example):
  query = example['query']

  word_count = len(query.split())
  punctuation_count = sum(1 for c in query if c in [',', '.', '?', '!', ':', ';'])

  length_score = len(query)

  difficulty = word_count + punctuation_count + (length_score / 50)

  example['difficulty'] = difficulty

  return example

In [10]:
dataset = dataset.map(compute_query_difficulty)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [11]:
dataset = dataset.sort("difficulty")

# Spliting Train And Eval Dataset

In [12]:
train_dataset = dataset.select(range(40))

In [13]:
eval_dataset = dataset.select(range(40, 60))

# Configure Quantization

In [14]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

# Download Model From Hugging Face

In [15]:
model_name='google/flan-t5-base'

In [16]:
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
  )

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Define Embedding

In [17]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False
  )

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [18]:
tokenizer.model_max_length = 128
tokenizer.pad_token = tokenizer.eos_token

# Model Output Compare Function

In [22]:
def generate_and_compare(
    model, tokenizer, prompt, summary, length=100,
    prefix="Instruct: Refine this user search query.",
    temperature=0.8, top_p=0.95
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    formatted_prompt = f"{prefix}\n{prompt}"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs, max_length=length,
        do_sample=True, top_p=top_p, temperature=temperature
    )

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    dash_line = '-' * 100
    print(dash_line)
    print(f'INPUT PROMPT:\n{formatted_prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN ANSWER:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{decoded_output}')
    print(dash_line)

    return decoded_output

In [23]:
%%time

prompt = dataset[2]['query']
summary = dataset[2]['answers'][0]

test_output = generate_and_compare(
    model=original_model,
    tokenizer=tokenizer,
    prompt=prompt,
    summary=summary,
)

test_output

----------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Refine this user search query.
nyu tuition cost
----------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
$43,746 for the 2014-2015 academic year.

----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
<pad> Nyu tuition cost
----------------------------------------------------------------------------------------------------
CPU times: user 277 ms, sys: 1.93 ms, total: 279 ms
Wall time: 298 ms


'<pad> Nyu tuition cost'

# Prompt Engineering

In [24]:
def create_prompt_formats(sample):
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    prompt_templates = [
      "### Instruct: Refine this user search query:",
      "### Task: Improve the clarity of the following search query:",
      "### Instruction: Fix the grammar and phrasing of this e-commerce search input:",
      "### Command: Clean up this product search term:",
      "### Request: Make this user query more natural and readable:",
      "### Action: Rephrase this customer search for better understanding:",
    ]

    instruction = random.choice(prompt_templates) + f"\n{sample['query'].strip()}"
    target = sample['answers'][0].strip()

    return {
        "input": f"{INTRO_BLURB}\n\n{instruction}",
        "target": f"{target}"
    }

# Get Model Max Length

In [25]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

# Process Text To Embedding

In [26]:
def preprocess_batch(batch, tokenizer, max_length):
    model_inputs = tokenizer(
        batch["input"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_attention_mask=True
    )

    labels = tokenizer(
        batch["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [27]:
max_length = get_max_length(original_model)

Found max lenth: 512


# Process Dataset For Model

In [28]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
    )

    dataset = dataset.remove_columns(['answers', 'query', 'difficulty', "input", "target"])

    return dataset

In [29]:
train_dataset = preprocess_dataset(tokenizer, 128, train_dataset)

Preprocessing dataset...


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [30]:
eval_dataset = preprocess_dataset(tokenizer, 128, eval_dataset)

Preprocessing dataset...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Config Low Rank Adaptation

In [31]:
original_model = prepare_model_for_kbit_training(original_model)

In [32]:
config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q", "v"],
    bias="none",
    lora_dropout=0.0,
    task_type="SEQ_2_SEQ_LM",
)

In [33]:
config.inference_mode = False
original_model.gradient_checkpointing_enable()

In [34]:
peft_model = get_peft_model(original_model, config)

In [35]:
peft_model.config.use_cache = False
peft_model.config.pretraining_tp = 1
peft_model.generation_config.pad_token_id = tokenizer.pad_token_id
peft_model.generation_config.eos_token_id = tokenizer.eos_token_id
peft_model.config.max_length = 128

# Compare QLoRA Model Parameter

In [36]:
def print_number_of_trainable_model_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params}")
    print(f"All params: {all_params}")
    print(f"Trainable%: {100 * trainable_params / all_params:.2f}%")

In [37]:
print_number_of_trainable_model_parameters(peft_model)

Trainable params: 442368
All params: 167804160
Trainable%: 0.26%


# Define Model Training Arguments

In [38]:
output_dir = f'./peft-flan-t5-training-{str(int(time.time()))}'

In [39]:
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    max_steps=50,
    learning_rate=1e-4,
    fp16=False,
    optim="paged_adamw_8bit",
    logging_steps=5,
    eval_steps=5,
    save_total_limit=1,
    gradient_checkpointing=True,
    dataloader_num_workers=2,
    remove_unused_columns=True,
    do_eval=True,
    eval_strategy="steps",
    load_best_model_at_end=True,
    disable_tqdm=False
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Model Data Collection

In [40]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=original_model,
)

# Making Traing Callbacks

In [41]:
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.1,
    )

# Model Evaluation Metrics

In [42]:
def compute_metrics(eval_preds):
    rouge = evaluate.load('rouge')
    bleu = evaluate.load('bleu')
    meteor = evaluate.load('meteor')

    preds = eval_preds.predictions
    labels = eval_preds.label_ids

    if isinstance(preds, tuple):
      preds = preds[0]

    pred_ids = np.argmax(preds, axis=-1)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_results = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    rouge_results = {k: round(v * 100, 4) for k, v in rouge_results.items()}

    bleu_results = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )
    bleu_results = {'bleu': round(bleu_results['bleu'] * 100, 4)}

    meteor_results = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    meteor_results = {'meteor': round(meteor_results['meteor'] * 100, 4)}

    metrics = {
        **rouge_results,
        **bleu_results,
        **meteor_results
    }

    return metrics

# Define Model Trainer

In [43]:
peft_trainer = Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator = data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# Fitting The Model

In [44]:
peft_trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor
5,4.3988,4.749502,9.8272,2.22,8.365,8.4785,0.3091,5.291
10,3.6883,4.535947,10.0281,2.2747,8.5759,8.6935,0.3006,5.5168
15,3.6092,4.281003,9.1584,2.0092,7.8268,7.8665,0.2826,5.1012
20,3.7927,4.015524,8.8615,1.7115,7.8632,7.9229,0.2626,4.8538
25,3.2555,3.74932,8.8292,1.6915,7.4492,7.4751,0.257,4.6608
30,3.0913,3.50386,9.2017,1.731,7.6547,7.6734,0.236,4.4842
35,3.3039,3.309818,9.2316,1.7535,7.6366,7.5792,0.2375,4.6907
40,2.9514,3.167719,9.5369,1.8326,7.7948,7.7204,0.2316,4.8773
45,2.6865,3.083063,9.5234,1.8655,7.468,7.3464,0.2461,5.0179
50,3.056,3.04946,9.5435,1.9286,7.653,7.5011,0.3235,5.0705


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[n

TrainOutput(global_step=50, training_loss=3.3833750915527343, metrics={'train_runtime': 323.4153, 'train_samples_per_second': 2.474, 'train_steps_per_second': 0.155, 'total_flos': 115267539566592.0, 'train_loss': 3.3833750915527343, 'epoch': 16.8})

# Save Model And Embedding

In [45]:
peft_model.save_pretrained("model-fine-tune")

In [46]:
tokenizer.save_pretrained("token-fine-tune")

('token-fine-tune/tokenizer_config.json',
 'token-fine-tune/special_tokens_map.json',
 'token-fine-tune/spiece.model',
 'token-fine-tune/added_tokens.json')

# Download And Evaluate The Model

In [47]:
model = peft_model.from_pretrained(original_model, "model-fine-tune")



In [48]:
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=4, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=4, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_

In [49]:
tokenizer = AutoTokenizer.from_pretrained("token-fine-tune")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [66]:
prompt = "new dress for new girl"

test_output = generate_and_compare(
    model=original_model,
    tokenizer=tokenizer,
    prompt=prompt,
    summary="",
)

test_output

----------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Refine this user search query.
new dress for new girl
----------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:


----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
i need a new dress for new girl
----------------------------------------------------------------------------------------------------


'i need a new dress for new girl'