# Install Required Packages

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m61.4/62.0 kB[0m [31m134.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m61.4/62.0 kB[0m [31m134.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m552.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m

# Import Packages

In [2]:
from datasets import load_dataset
import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
import os
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import random
import nltk
import evaluate
import re
import unicodedata

# Define Environment Variables

In [3]:
os.environ['WANDB_DISABLED']="true"

# Load Dataset

In [4]:
dataset = load_dataset("microsoft/ms_marco", "v2.1", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/101093 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/808731 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/101092 [00:00<?, ? examples/s]

In [5]:
dataset = dataset.select(range(100))

# Preprocess Queries String

In [6]:
def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    text = unicodedata.normalize('NFKC', text)

    text = re.sub(r'\d+', '', text)

    text = re.sub(r'http\S+|www\S+', '', text)

    return text

In [7]:
dataset = dataset.map(lambda example: {
    'query': preprocess_text(example['query']),
    'answers': [preprocess_text(ans) for ans in example['answers']]
}, remove_columns=dataset.column_names)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Ranking With Query Complexity

In [8]:
def compute_query_difficulty(example):
  query = example['query']

  word_count = len(query.split())
  punctuation_count = sum(1 for c in query if c in [',', '.', '?', '!', ':', ';'])

  length_score = len(query)

  difficulty = word_count + punctuation_count + (length_score / 50)

  example['difficulty'] = difficulty

  return example

In [9]:
dataset = dataset.map(compute_query_difficulty)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.sort("difficulty")

# Spliting Train And Eval Dataset

In [11]:
train_dataset = dataset.select(range(70))

In [12]:
eval_dataset = dataset.select(range(70, 100))

# Configure Quantization

In [13]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_quant_storage=torch.uint8,
        llm_int8_skip_modules = ["lm_head", "encoder.embed_tokens"]
    )

# Download Model From Hugging Face

In [14]:
model_name='google/flan-t5-base'

In [15]:
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_cache=False,
    attn_implementation="eager",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    revision="main",
    token=None,
    force_download=False,
    local_files_only=False
  )

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Define Embedding

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=False,
    truncation_side="right",
    pad_token="<pad>",
    eos_token="</s>",
    additional_special_tokens=["<user_query>", "<reformulated>"],
    max_length=128,
    padding="max_length",
    truncation=True,
    legacy=False,
    clean_up_tokenization_spaces=True,
    strip_accents=False,
    lowercase=False
  )

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

# Model Output Compare Function

In [17]:
def generate_and_compare(
    model, tokenizer, prompt, summary, length=100,
    prefix="Instruct: Refine this user search query.",
    temperature=0.8, top_p=0.95
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    formatted_prompt = f"{prefix}\n{prompt}"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs, max_length=length,
        do_sample=True, top_p=top_p, temperature=temperature
    )

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    dash_line = '-' * 100
    print(dash_line)
    print(f'INPUT PROMPT:\n{formatted_prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN ANSWER:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{decoded_output}')
    print(dash_line)

    return decoded_output

In [18]:
%%time

prompt = dataset[2]['query']
summary = dataset[2]['answers'][0]

test_output = generate_and_compare(
    model=original_model,
    tokenizer=tokenizer,
    prompt=prompt,
    summary=summary,
)

test_output

----------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Refine this user search query.
what dba mean
----------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
dba means doing business as is a term that is used in todays business industry in regards to the liability and formal actions that are taken by a particular company

----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
DBA (DBA) is a acronym of the American Association of Industrial Automation.
----------------------------------------------------------------------------------------------------
CPU times: user 1.03 s, sys: 205 ms, total: 1.24 s
Wall time: 1.89 s


'DBA (DBA) is a acronym of the American Association of Industrial Automation.'

# Prompt Engineering

In [19]:
def create_prompt_formats(sample):
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    prompt_templates = [
      "### Instruct: Refine this user search query:",
      "### Task: Improve the clarity of the following search query:",
      "### Instruction: Fix the grammar and phrasing of this e-commerce search input:",
      "### Command: Clean up this product search term:",
      "### Request: Make this user query more natural and readable:",
      "### Action: Rephrase this customer search for better understanding:",
    ]

    instruction = random.choice(prompt_templates) + f"\n{sample['query'].strip()}"
    target = sample['answers'][0].strip()

    return {
        "input": f"{INTRO_BLURB}\n\n{instruction}",
        "target": f"{target}"
    }

# Get Model Max Length

In [20]:
def get_max_query_length(dataset):
    """
    Calculate the 90th percentile of user query lengths in a dataset.

    Args:
        dataset: Hugging Face Dataset with "query" field

    Returns:
        length_90th_percentile: Length at the 90th percentile (characters)
    """
    lengths = []

    # Collect lengths of all queries
    for example in dataset:
        query = example["query"].strip()
        lengths.append(len(query))

    # Compute the 90th percentile
    length_90th_percentile = int(np.percentile(lengths, 90))
    return length_90th_percentile

In [21]:
max_length = get_max_query_length(dataset)
max_length

49

# Process Text To Embedding

In [22]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Preprocess batch for seq2seq training with enhanced configurations

    Args:
        batch: Dictionary with "input" and "target" texts
        tokenizer: HF tokenizer configured for the model
        max_length: Maximum sequence length for truncation/padding

    Returns:
        Dictionary containing model inputs and labels
    """

    model_inputs = tokenizer(
        batch["input"],
        max_length=max_length,
        truncation=True,
        truncation_strategy='longest_first',
        padding="max_length",
        return_tensors="pt",
        return_token_type_ids=False,
        add_special_tokens=True,
        verbose=False
    )

    labels = tokenizer(
        batch["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        add_special_tokens=False,
        return_attention_mask=False,
        return_tensors="pt"
    )

    labels["input_ids"][labels["input_ids"] == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Process Dataset For Model

In [23]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
    )

    dataset = dataset.remove_columns(['answers', 'query', 'difficulty', "input", "target"])

    return dataset

In [24]:
train_dataset = preprocess_dataset(tokenizer, max_length, train_dataset)

Preprocessing dataset...


Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [25]:
eval_dataset = preprocess_dataset(tokenizer, 128, eval_dataset)

Preprocessing dataset...


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

# Config Low Rank Adaptation

In [26]:
original_model = prepare_model_for_kbit_training(original_model)

In [27]:
config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q", "v", "k", "o"],
    bias="lora_only",
    lora_dropout=0.1,
    fan_in_fan_out=False,
    rank_pattern={"q": 8, "v": 4},
    alpha_pattern={"q": 32, "v": 16},
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False
)

In [28]:
peft_model = get_peft_model(original_model, config)

# Compare QLoRA Model Parameter

In [29]:
def print_number_of_trainable_model_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params}")
    print(f"All params: {all_params}")
    print(f"Trainable%: {100 * trainable_params / all_params:.2f}%")

In [30]:
print_number_of_trainable_model_parameters(peft_model)

Trainable params: 2433024
All params: 169794816
Trainable%: 1.43%


# Define Model Training Arguments

In [31]:
output_dir = f'./peft-flan-t5-training-{str(int(time.time()))}'

In [32]:
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    num_train_epochs=1,
    max_steps=50,

    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    fp16=False,

    dataloader_num_workers=4,
    dataloader_pin_memory=False,
    eval_accumulation_steps=2,

    logging_steps=10,
    save_steps=50,
    save_total_limit=2,

    remove_unused_columns=True,
    report_to="all",

    eval_strategy="steps",
    eval_steps=10,
    load_best_model_at_end=True,

    torch_compile=False,
    lr_scheduler_type="cosine"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Model Data Collection

In [33]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=original_model,
)

# Making Traing Callbacks

In [34]:
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.1,
    )

# Model Evaluation Metrics

In [35]:
def compute_metrics(eval_preds):
    rouge = evaluate.load('rouge')
    bleu = evaluate.load('bleu')
    meteor = evaluate.load('meteor')

    preds = eval_preds.predictions
    labels = eval_preds.label_ids

    if isinstance(preds, tuple):
      preds = preds[0]

    pred_ids = np.argmax(preds, axis=-1)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_results = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    rouge_results = {k: round(v * 100, 4) for k, v in rouge_results.items()}

    bleu_results = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )
    bleu_results = {'bleu': round(bleu_results['bleu'] * 100, 4)}

    meteor_results = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    meteor_results = {'meteor': round(meteor_results['meteor'] * 100, 4)}

    metrics = {
        **rouge_results,
        **bleu_results,
        **meteor_results
    }

    return metrics

# Define Model Trainer

In [36]:
peft_trainer = Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator = data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# Fitting The Model

In [44]:
peft_trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor
10,4.3875,,7.6584,2.4648,6.7747,6.7273,0.902,7.2041
20,4.688,,7.6584,2.4648,6.7747,6.7273,0.902,7.2041
30,4.5759,,7.6584,2.4648,6.7747,6.7273,0.902,7.2041


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_

KeyboardInterrupt: 

# Save Model And Embedding

In [38]:
peft_model.save_pretrained("model-fine-tune")

In [39]:
tokenizer.save_pretrained("token-fine-tune")

('token-fine-tune/tokenizer_config.json',
 'token-fine-tune/special_tokens_map.json',
 'token-fine-tune/spiece.model',
 'token-fine-tune/added_tokens.json')

# Download And Evaluate The Model

In [40]:
model = peft_model.from_pretrained(original_model, "model-fine-tune")



In [41]:
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
       

In [42]:
tokenizer = AutoTokenizer.from_pretrained("token-fine-tune")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [43]:
prompt = "new dress for new girl"

test_output = generate_and_compare(
    model=original_model,
    tokenizer=tokenizer,
    prompt=prompt,
    summary="",
)

test_output

----------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Refine this user search query.
new dress for new girl
----------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:


----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
new dress for new girl
----------------------------------------------------------------------------------------------------


'new dress for new girl'