### SFT finetune the `phi-2` model

- Fine-tune the `phi-2` model from Microsoft to better align with human preferences


In [1]:
# ! pip install -q datasets transformers bitsandbytes sentencepiece wandb

In [1]:
import os
import json
import gc
import torch

from dataclasses import dataclass

from datasets import load_dataset
import transformers
from transformers import pipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import AutoPeftModelForCausalLM, PeftConfig
from peft import LoraConfig, PeftModel, get_peft_model, PeftModelForCausalLM
from trl import SFTTrainer, setup_chat_format

# import bitsandbytes as bnb
import wandb

In [2]:
#### basic config
base_model_name = "microsoft/phi-2"
modified_model_name = "phi2-sft-alpaca"


num_epochs = 1
# max_steps = 1000

secrets_path = "./secrets/secrets.json"
output_dir = f".models/adapters/{modified_model_name}_alignment-handbook"


run_name = f"{modified_model_name}-{num_epochs}_alignment-handbook"

In [3]:
# load the secrets

with open(secrets_path, "r") as f:
    secrets = json.load(f)

HF_TOKEN = secrets["HF_TOKEN"]
WANDB_TOKEN = secrets["WANDB_TOKEN"]

wandb.login(key=WANDB_TOKEN)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ostrich/.netrc


True

In [5]:
# Supervised fine-tune the model on the distillaplaca dataset

#### Download the alpaca data using wget
# ! wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O ./data/alpaca_data.json

In [6]:
# Convert the downloaded dataset into hugging face
train_dataset = load_dataset("data", data_files="alpaca_data.json", split="train")

In [7]:
train_dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 52002
})

In [8]:
# one example
train_dataset[0]

{'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.'}

In [9]:
# prepare the chatml template since we will be utilizing the chatml format primarily


def convert_chatml(example, tokenizer: AutoTokenizer, add_eos: bool = False):

    # system prompt
    ## Prepare the chatml message

    system_prompt = (
        {"role": "system", "content": example["input"]}
        if len(example["input"]) > 0
        else {
            "role": "system",
            "content": "You are Phi a friendly chat assistant that follows user instructions.",
        }
    )
    inst_prompt = {"role": "user", "content": example["instruction"]}
    assistant_prompt = {"role": "assistant", "content": example["output"]}

    if system_prompt is not None:
        message = [system_prompt, inst_prompt, assistant_prompt]
    else:
        message = [inst_prompt, assistant_prompt]

    return {"text": tokenizer.apply_chat_template(message, tokenize=False)}

### Load the tokenizer


In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    # add_eos_token=True
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


##### Look at a tokenization example


In [8]:
ex = "I had a great dream<|im_end|>"

encoded_ex = tokenizer(
    ex,
    # padding="max_length",
    truncation=True,
    add_special_tokens=True,
    max_length=2,
    return_overflowing_tokens=True

)
print(encoded_ex)

{'input_ids': [[40, 550], [257, 1049], [4320, 50296]], 'attention_mask': [[1, 1], [1, 1], [1, 1]], 'overflow_to_sample_mapping': [0, 0, 0]}


### Load the model


In [12]:
# define the quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# model.save_pretrained(base_model_name, max_shard_size="2GB", safe_serialization=True)

In [14]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiFlashAttention2(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_

In [15]:
model.lm_head.out_features

51200

### Expand the vocabulary, to fit the chat tokens [Optional]


In [16]:
@dataclass
class SpecialTokens:
    conversation_start_token: str = "<|im_start|>"
    conversation_end_token: str = "<|im_end|>"
    pad_token: str = "<|pad|>"


chat_format_tokens = SpecialTokens

# Add special tokens to the tokenizer
tokenizer.add_special_tokens(
    {
        "additional_special_tokens": [
            chat_format_tokens.conversation_start_token,
            chat_format_tokens.conversation_end_token,
            chat_format_tokens.pad_token,
        ]
    }
)

# resize the model embedding layers ???? or do we ?? .. Its already expanded
# if model.lm_head.out_features < len(tokenizer):
model.resize_token_embeddings(len(tokenizer))

Embedding(50298, 2560)

### Define the peft model config


In [17]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        # "embed_tokens.weight",
        # "lm_head.weight",
    ],
    modules_to_save=["embed_tokens", "lm_head"],
    task_type="CAUSAL_LM",
)

In [18]:
chat_format_tokens.pad_token

'<|pad|>'

### Prepare the data and the model


In [19]:
"""
CHAT_TEMPLATE = 
    {% if not add_generation_prompt is defined %}
        {% set add_generation_prompt=false %}
    {% endif %}
    {% for message in messages %}
        {{'<|im_start|>' + message['role'] + '\n' + message['content'] +  + '<|im_end|>' + '\n'}}
    {% endfor %}
    {% if add_generation_prompt %}
        {{'<|im_start|>assistant' + '\n'}}
    {% elif not add_generation_prompt %}
        {{eos_token}}
    {% endif %}
"""

CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant' + '\n'}}{% elif not add_generation_prompt %}{{eos_token}}{% endif %}"

tokenizer.chat_template = (
    CHAT_TEMPLATE  # set a new chat template (slightly different chatml template)
)
tokenizer.pad_token = chat_format_tokens.pad_token  # set the new pad token
tokenizer.padding_side = (
    "right"  # should ideally set to left but it causes some overflow issue
)
# tokenizer.truncation_side = "left"  # to avoid truncating latest generation

train_dataset = train_dataset.map(
    convert_chatml,
    remove_columns=train_dataset.column_names,
    fn_kwargs={"tokenizer": tokenizer},
)

In [20]:
# CHAT_TEMPLATE

In [21]:
train_dataset[2]

{'text': '<|im_start|>system\nYou are Phi a friendly chat assistant that follows user instructions.<|im_end|>\n<|im_start|>user\nDescribe the structure of an atom.<|im_end|>\n<|im_start|>assistant\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.<|im_end|>\n'}

In [22]:
##### Decide on the max seq len to use
import numpy as np

text_max_len = int(
    np.percentile(
        [len(tokenizer(example["text"])["input_ids"]) for example in train_dataset],
        99,
    ),
)
text_max_len


### The length for most of the examples in the alpaca dataset are less than 512

306

### Trainer initialization


In [24]:
### Trainer Args

trainer_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # Avoids saving the whole graphs, computes the required activations.
    optim="paged_adamw_32bit",
    learning_rate=2e-5,
    max_grad_norm=0.3,  # Clips the gradnorm value to 0.3
    num_train_epochs=num_epochs,
    # max_steps=max_steps,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,  # Around 10% warmup Zephyr Recipie
    log_level="error",  # Avoid overthinking due to unnecessary logs.
    logging_strategy="steps",
    logging_steps=1,
    # evaluation_strategy="steps",
    # eval_steps=50,
    save_strategy="steps",
    save_steps=50,  # Save every evalutaion steps
    save_total_limit=3,
    tf32=True,  # TensorFloat32 dtype supported for GeForce 40 series
    bf16=True,
    report_to="wandb",
    run_name=run_name,
    disable_tqdm=True,
    # groupby_length=True,   # Groups examples of similar lengths together to save padding
)


### SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=trainer_args,
    train_dataset=train_dataset,
    # eval_dataset=train_dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    peft_config=lora_config,
    packing=True,
    max_seq_length=1024,  # max len of the phi2 model
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False
    }
)


print(trainer.model.print_trainable_parameters())

trainable params: 273,304,698 || all params: 3,048,369,396 || trainable%: 8.965602999381378
None


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [25]:
result = trainer.train()
result

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


{'loss': 2.895, 'grad_norm': 492.0, 'learning_rate': 6.25e-07, 'epoch': 0.0}
{'loss': 3.0245, 'grad_norm': 386.0, 'learning_rate': 1.25e-06, 'epoch': 0.01}
{'loss': 3.0733, 'grad_norm': 358.0, 'learning_rate': 1.8750000000000003e-06, 'epoch': 0.01}
{'loss': 3.128, 'grad_norm': 352.0, 'learning_rate': 2.5e-06, 'epoch': 0.01}
{'loss': 3.186, 'grad_norm': 292.0, 'learning_rate': 3.125e-06, 'epoch': 0.02}
{'loss': 3.1209, 'grad_norm': 406.0, 'learning_rate': 3.7500000000000005e-06, 'epoch': 0.02}
{'loss': 3.2977, 'grad_norm': 316.0, 'learning_rate': 4.3750000000000005e-06, 'epoch': 0.02}
{'loss': 3.1761, 'grad_norm': 282.0, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 3.1725, 'grad_norm': 248.0, 'learning_rate': 5.625e-06, 'epoch': 0.03}
{'loss': 3.1725, 'grad_norm': 368.0, 'learning_rate': 6.25e-06, 'epoch': 0.03}
{'loss': 2.8646, 'grad_norm': 310.0, 'learning_rate': 6.875e-06, 'epoch': 0.04}
{'loss': 2.945, 'grad_norm': 636.0, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.04}
{'l



{'loss': 2.1144, 'grad_norm': 144.0, 'learning_rate': 1.9775233980110524e-05, 'epoch': 0.16}
{'loss': 2.2385, 'grad_norm': 84.0, 'learning_rate': 1.9751053008725736e-05, 'epoch': 0.17}
{'loss': 2.1322, 'grad_norm': 66.0, 'learning_rate': 1.9725653232699962e-05, 'epoch': 0.17}
{'loss': 2.1009, 'grad_norm': 85.0, 'learning_rate': 1.969903782680467e-05, 'epoch': 0.17}
{'loss': 2.2009, 'grad_norm': 61.25, 'learning_rate': 1.967121011775546e-05, 'epoch': 0.18}
{'loss': 2.1498, 'grad_norm': 51.0, 'learning_rate': 1.9642173583796265e-05, 'epoch': 0.18}
{'loss': 2.2042, 'grad_norm': 58.0, 'learning_rate': 1.961193185426459e-05, 'epoch': 0.18}
{'loss': 2.1578, 'grad_norm': 53.75, 'learning_rate': 1.958048870913786e-05, 'epoch': 0.19}
{'loss': 2.2291, 'grad_norm': 61.0, 'learning_rate': 1.9547848078560975e-05, 'epoch': 0.19}
{'loss': 1.9899, 'grad_norm': 48.0, 'learning_rate': 1.9514014042355057e-05, 'epoch': 0.19}
{'loss': 2.0433, 'grad_norm': 51.5, 'learning_rate': 1.9478990829507507e-05, 'epo



{'loss': 1.7384, 'grad_norm': 51.25, 'learning_rate': 1.7169192679698837e-05, 'epoch': 0.32}
{'loss': 1.7036, 'grad_norm': 25.0, 'learning_rate': 1.7090803880386784e-05, 'epoch': 0.33}
{'loss': 1.7882, 'grad_norm': 42.0, 'learning_rate': 1.701152878657197e-05, 'epoch': 0.33}
{'loss': 1.7328, 'grad_norm': 25.625, 'learning_rate': 1.693137730701524e-05, 'epoch': 0.33}
{'loss': 1.749, 'grad_norm': 39.5, 'learning_rate': 1.6850359460018737e-05, 'epoch': 0.33}
{'loss': 1.7743, 'grad_norm': 41.5, 'learning_rate': 1.6768485372173696e-05, 'epoch': 0.34}
{'loss': 1.7444, 'grad_norm': 41.0, 'learning_rate': 1.6685765277094702e-05, 'epoch': 0.34}
{'loss': 1.6448, 'grad_norm': 26.875, 'learning_rate': 1.6602209514140552e-05, 'epoch': 0.34}
{'loss': 1.7149, 'grad_norm': 30.625, 'learning_rate': 1.6517828527121942e-05, 'epoch': 0.35}
{'loss': 1.6457, 'grad_norm': 41.75, 'learning_rate': 1.6432632862996056e-05, 'epoch': 0.35}
{'loss': 1.6901, 'grad_norm': 30.0, 'learning_rate': 1.634663317054829e-05,



{'loss': 1.5694, 'grad_norm': 28.625, 'learning_rate': 1.2380629455214392e-05, 'epoch': 0.48}
{'loss': 1.5206, 'grad_norm': 39.75, 'learning_rate': 1.2271896780409321e-05, 'epoch': 0.48}
{'loss': 1.5593, 'grad_norm': 25.5, 'learning_rate': 1.2162880136443447e-05, 'epoch': 0.49}
{'loss': 1.5353, 'grad_norm': 19.75, 'learning_rate': 1.2053593149536576e-05, 'epoch': 0.49}
{'loss': 1.4984, 'grad_norm': 25.375, 'learning_rate': 1.1944049479699244e-05, 'epoch': 0.49}
{'loss': 1.4339, 'grad_norm': 40.0, 'learning_rate': 1.1834262819025326e-05, 'epoch': 0.5}
{'loss': 1.516, 'grad_norm': 25.0, 'learning_rate': 1.1724246889980638e-05, 'epoch': 0.5}
{'loss': 1.5066, 'grad_norm': 30.0, 'learning_rate': 1.1614015443687723e-05, 'epoch': 0.5}
{'loss': 1.5195, 'grad_norm': 25.625, 'learning_rate': 1.150358225820709e-05, 'epoch': 0.51}
{'loss': 1.5166, 'grad_norm': 38.75, 'learning_rate': 1.1392961136815046e-05, 'epoch': 0.51}
{'loss': 1.4906, 'grad_norm': 30.125, 'learning_rate': 1.1282165906278402e-0



{'loss': 1.4696, 'grad_norm': 22.75, 'learning_rate': 6.8673296793171555e-06, 'epoch': 0.64}
{'loss': 1.501, 'grad_norm': 34.0, 'learning_rate': 6.761354686924895e-06, 'epoch': 0.64}
{'loss': 1.4627, 'grad_norm': 33.0, 'learning_rate': 6.655784499627491e-06, 'epoch': 0.65}
{'loss': 1.4393, 'grad_norm': 25.25, 'learning_rate': 6.550632312864869e-06, 'epoch': 0.65}
{'loss': 1.4318, 'grad_norm': 22.375, 'learning_rate': 6.445911269830189e-06, 'epoch': 0.65}
{'loss': 1.4532, 'grad_norm': 27.125, 'learning_rate': 6.341634459827053e-06, 'epoch': 0.66}
{'loss': 1.4541, 'grad_norm': 21.75, 'learning_rate': 6.237814916633444e-06, 'epoch': 0.66}
{'loss': 1.436, 'grad_norm': 27.75, 'learning_rate': 6.134465616872598e-06, 'epoch': 0.66}
{'loss': 1.4565, 'grad_norm': 27.5, 'learning_rate': 6.0315994783910345e-06, 'epoch': 0.67}
{'loss': 1.485, 'grad_norm': 52.25, 'learning_rate': 5.929229358643932e-06, 'epoch': 0.67}
{'loss': 1.4714, 'grad_norm': 22.75, 'learning_rate': 5.827368053088043e-06, 'epoc



{'loss': 1.4448, 'grad_norm': 35.0, 'learning_rate': 2.3077107376850005e-06, 'epoch': 0.8}
{'loss': 1.4652, 'grad_norm': 40.5, 'learning_rate': 2.2367554625863496e-06, 'epoch': 0.8}
{'loss': 1.501, 'grad_norm': 26.875, 'learning_rate': 2.1667705317636333e-06, 'epoch': 0.81}
{'loss': 1.4949, 'grad_norm': 49.0, 'learning_rate': 2.0977646927806682e-06, 'epoch': 0.81}
{'loss': 1.4546, 'grad_norm': 25.5, 'learning_rate': 2.029746570822524e-06, 'epoch': 0.81}
{'loss': 1.3918, 'grad_norm': 27.375, 'learning_rate': 1.9627246676174363e-06, 'epoch': 0.82}
{'loss': 1.4535, 'grad_norm': 31.125, 'learning_rate': 1.896707360374167e-06, 'epoch': 0.82}
{'loss': 1.4109, 'grad_norm': 33.75, 'learning_rate': 1.8317029007349086e-06, 'epoch': 0.82}
{'loss': 1.367, 'grad_norm': 32.5, 'learning_rate': 1.7677194137439036e-06, 'epoch': 0.83}
{'loss': 1.4498, 'grad_norm': 25.875, 'learning_rate': 1.7047648968318697e-06, 'epoch': 0.83}
{'loss': 1.4274, 'grad_norm': 24.625, 'learning_rate': 1.642847218816398e-06,



{'loss': 1.5046, 'grad_norm': 46.25, 'learning_rate': 8.986034815950173e-08, 'epoch': 0.96}
{'loss': 1.4379, 'grad_norm': 39.0, 'learning_rate': 7.55257502463469e-08, 'epoch': 0.96}
{'loss': 1.4748, 'grad_norm': 38.25, 'learning_rate': 6.243163326014268e-08, 'epoch': 0.97}
{'loss': 1.4439, 'grad_norm': 53.25, 'learning_rate': 5.057963386213116e-08, 'epoch': 0.97}
{'loss': 1.4482, 'grad_norm': 23.875, 'learning_rate': 3.9971233458665495e-08, 'epoch': 0.97}
{'loss': 1.406, 'grad_norm': 53.75, 'learning_rate': 3.0607758016043546e-08, 'epoch': 0.98}
{'loss': 1.4179, 'grad_norm': 20.0, 'learning_rate': 2.2490377894768266e-08, 'epoch': 0.98}
{'loss': 1.4311, 'grad_norm': 111.0, 'learning_rate': 1.562010770326916e-08, 'epoch': 0.98}
{'loss': 1.4804, 'grad_norm': 51.0, 'learning_rate': 9.99780617107815e-09, 'epoch': 0.99}
{'loss': 1.41, 'grad_norm': 24.625, 'learning_rate': 5.6241760414987856e-09, 'epoch': 0.99}
{'loss': 1.3936, 'grad_norm': 35.25, 'learning_rate': 2.4997639837687217e-09, 'epo

TrainOutput(global_step=313, training_loss=1.750066633803395, metrics={'train_runtime': 2721.7709, 'train_samples_per_second': 0.921, 'train_steps_per_second': 0.115, 'train_loss': 1.750066633803395, 'epoch': 1.0})

In [26]:
trainer.save_model(output_dir)



In [27]:
### delete the model state and the trainer
del trainer
del model
gc.collect()
torch.cuda.empty_cache()

### Merge the Peft Model


In [30]:
# redefine the output dir

## Load PeftModel on CPU

adapter_path = f"models/adapters/{modified_model_name}_alignment-handbook"

# base_model = AutoModelForCausalLM.from_pretrained(
#     'microsoft/phi-2', low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
# )
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_path, adapter_name="sft", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
)
## Merge Adapter and the base model

merged_model = model.merge_and_unload()
merged_model.save_pretrained(
    f"models/{modified_model_name}_alignment-handbook",
    safe_serialization=True,
    max_shard_size="2GB",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Vibe check the model


In [5]:
### Reload the model
ft_model = AutoModelForCausalLM.from_pretrained(
    f"models/{modified_model_name}_alignment-handbook",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
)


tokenizer = AutoTokenizer.from_pretrained(f"models/adapters/{modified_model_name}_alignment-handbook")
tokenizer.padding_side = "left"

# With or without adapter
# ft_model.disable_adapters()
# define the pipeline

pipe = pipeline("text-generation", model=ft_model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
prompts = [
    "A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?",
    "It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?",
    "How can i get rid of llamas in my backyard?",
]

In [7]:
for prompt in prompts:
    messages = pipe.tokenizer.apply_chat_template(
        [
            {
                "role": "system",
                "content": "Follow user instructions",
            },
            {"role": "user", "content": prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    # print(messages)
    outputs = pipe(
        messages,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.4,
        top_k=50,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    print(f"**PROMPT**: \n{prompt}\n")
    print(f"**GENERATED ANS**: \n{outputs[0]['generated_text']}")
    print("===" * 10)


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



**PROMPT**: 
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?

**GENERATED ANS**: 
<|im_start|>system
Follow user instructions<|im_end|>
<|im_start|>user
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?<|im_end|>
<|im_start|>assistant
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?



This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


**PROMPT**: 
It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?

**GENERATED ANS**: 
<|im_start|>system
Follow user instructions<|im_end|>
<|im_start|>user
It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?<|im_end|>
<|im_start|>assistant
It's aspirin, a pain reliever and anti-inflammatory agent. It's used to treat headaches, muscle aches, and fever. It's also used to prevent heart attacks and strokes. It's available in tablet, capsule, and liquid form. It's important to follow the instructions on the label and to talk to a doctor before taking aspirin if you have any medical conditions.
 Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azerbijan Azer

In [None]:
del ft_model
gc.collect()
torch.cuda.empty_cache()

#### NOTE: Vibe check the not finetuned phi


### Lets have a look at what the `packing=True` argument does..


In [None]:
from trl.trainer import ConstantLengthDataset

cdl = ConstantLengthDataset(
    tokenizer=tokenizer,
    dataset=train_dataset,
    dataset_text_field="text",
    seq_length=2048,
    shuffle=False,
)

In [None]:
from tqdm import tqdm

for data in tqdm(cdl):
    print(data)
    print(data["input_ids"].shape, data["labels"].shape)

    token_ids = data["input_ids"].tolist()
    print(tokenizer.decode(token_ids))
    break

In [None]:
##### Checking out the behaviour of the llama tokenizer

# llama_tokenizer = AutoTokenizer.from_pretrained(
#     "meta-llama/Llama-2-7b-hf", add_eos_token=True
# )
# llama_tokenizer.pad_token = llama_tokenizer.eos_token

# llama_tokenizer.padding_side = "left"

# llama_tokenizer(
#     ex, add_special_tokens=True, padding="max_length", truncation=True, max_length=25
# )

In [None]:
trained_embeddings = ft_model.get_input_embeddings()
# print(trained_embeddings.weight[tokenizer.pad_token_id])
trained_pad_token = trained_embeddings.modules_to_save.default.weight[
    tokenizer.eos_token_id
]

In [None]:
original_embeddings = model.get_input_embeddings()

original_pad_token = original_embeddings.weight[tokenizer.eos_token_id]

In [None]:
trained_embeddings.modules_to_save.default.weight[50295].equal(
    original_embeddings.weight[50295]
)

In [None]:
original_pad_token.equal(trained_pad_token)