<a href="https://colab.research.google.com/github/nk3843/Finetuning/blob/main/Yoda_speak.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -U datasets bitsandbytes trl

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinu

In [1]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [26]:
# Base checkpoint to fine-tune; the same id is reused later to load the tokenizer.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# Quantization recipe: 4-bit NF4 weights with double quantization enabled,
# while computations are carried out in float32.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the checkpoint with the quantization recipe applied, placed on the first GPU.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
# Report the quantized model's memory footprint divided by 1e6
# (i.e. megabytes, assuming get_memory_footprint() returns bytes — TODO confirm).
print(model.get_memory_footprint()/1e6)

2206.341312


In [28]:
# Display the module tree of the loaded model; in the saved output the attention
# and MLP projections appear as Linear4bit layers.
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLUActivation()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=

In [29]:
# Get the quantized model ready for training before any adapter is attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings.
config = LoraConfig(
    task_type="CAUSAL_LM",
    # Layers that receive an adapter. Newer architectures (Phi-3 at the time of
    # writing) may need these listed by hand.
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    # Adapter rank: smaller rank means fewer trainable parameters.
    r=8,
    # Scaling multiplier, conventionally twice the rank.
    lora_alpha=16,
    lora_dropout=0.05,
    # Leave biases untouched: training them would alter the base model's behavior.
    bias="none",
)

# Wrap the prepared model with the adapters and show the resulting module tree.
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [30]:
# Report the memory footprint again, now for the adapter-wrapped model, divided
# by 1e6 (megabytes, assuming get_memory_footprint() returns bytes — TODO confirm).
print(model.get_memory_footprint()/1e6)

2651.074752


In [31]:
# Count trainable vs. total parameters and report both (in millions) together
# with the trainable share as a percentage.
train_p, tot_p = model.get_nb_trainable_parameters()
print(
    f'Trainable parameters: {train_p/1e6:.2f}M',
    f'Total parameters: {tot_p/1e6:.2f}M',
    f'% of trainable parameters: {100*train_p/tot_p:.2f}%',
    sep='\n',
)

Trainable parameters: 12.58M
Total parameters: 3833.66M
% of trainable parameters: 0.33%


In [41]:
# Load the "train" split of the dvgodoy/yoda_sentences dataset and display its
# summary (column names and row count).
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [42]:
# Peek at the first record to see what each raw column holds.
dataset[0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [43]:
# Modelled on trl.extras.dataset_formatting.instructions_formatting_function.
# Reshapes prompt/completion data (a layout noted here as no longer supported)
# into the conversational "messages" layout.
def format_dataset(examples):
    """Convert prompt/completion data into user/assistant message lists.

    Args:
        examples: Mapping with "prompt" and "completion" entries. Each entry is
            either a single value (one example) or a list (a batch of examples).

    Returns:
        dict: ``{'messages': ...}`` where the value is one two-turn conversation
        for a single example, or a list of such conversations for a batch.
    """
    def as_conversation(user_text, assistant_text):
        # One exchange: the prompt as the user turn, the completion as the reply.
        return [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]

    # Single example: both fields are plain values.
    if not isinstance(examples["prompt"], list):
        return {'messages': as_conversation(examples["prompt"], examples["completion"])}

    # Batch: pair each prompt with the completion at the same position.
    return {
        'messages': [
            as_conversation(user_text, examples["completion"][idx])
            for idx, user_text in enumerate(examples["prompt"])
        ]
    }

In [44]:
# Rename the source columns to prompt/completion, build the conversational
# "messages" column from them, then drop every column that is no longer needed.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .map(format_dataset)
    .remove_columns(['prompt', 'completion', 'translation'])
)

# Keep the first conversation around as a sample and display it.
messages = dataset[0]['messages']
messages

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

[{'content': 'The birch canoe slid on the smooth planks.', 'role': 'user'},
 {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
  'role': 'assistant'}]

In [36]:
# Illustrative template only: the general shape of one conversational record
# (system directive, user prompt, assistant reply). The placeholders are not real
# data, and the value is neither assigned nor used by the visible cells.
# NOTE(review): the saved output under this cell does not match this expression
# (it looks like it came from an earlier run) — re-run or clear it.
{"messages":[
  {"role": "system", "content": "<general directives>"},
  {"role": "user", "content": "<prompt text>"},
  {"role": "assistant", "content": "<ideal generated text>"}
]}

[{'role': 'user', 'content': 'The birch canoe slid on the smooth planks.'},
 {'role': 'user',
  'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}]

In [45]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Reuse the unknown token as the padding token (both the token and its id).
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Display the chat template the tokenizer uses to turn message lists into text.
tokenizer.chat_template

"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [46]:
# Render the sample conversation through the chat template as plain text
# (tokenize=False) to see where the special tokens end up.
print(tokenizer.apply_chat_template(messages, tokenize=False))

<|user|>
The birch canoe slid on the smooth planks.<|end|>
<|assistant|>
On the smooth planks, the birch canoe slid. Yes, hrrrm.<|end|>
<|endoftext|>


In [47]:
sft_config = SFTConfig(
    # --- Outputs and logging ---
    output_dir='./phi3-mini-yoda-adapter',
    logging_dir='./logs',
    logging_steps=10,
    report_to='none',

    # --- Core training hyper-parameters ---
    num_train_epochs=10,
    learning_rate=3e-4,
    # Paged 8-bit AdamW; the gain is small when only LoRA weights are trained.
    optim='paged_adamw_8bit',

    # --- Dataset handling ---
    # Sequence length cap (this argument was renamed in v0.20).
    max_length=64,
    # Packing removes the need for padding; the 'wrapped' strategy was added
    # to approximate the original packing behavior.
    packing=True,
    packing_strategy='wrapped',

    # --- Memory usage: squeeze the most out of the GPU's RAM ---
    # Gradient checkpointing saves a LOT of memory; use_reentrant=False
    # avoids exceptions in newer versions of PyTorch.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # One micro-batch per optimizer update, so the effective batch size equals
    # the micro-batch size.
    gradient_accumulation_steps=1,
    # Starting micro-batch size; if it would cause OOM it is halved until it fits.
    per_device_train_batch_size=16,
    auto_find_batch_size=True,

    # --- Precision ---
    # bf16 is the new default; only enable it when the GPU really supports it.
    bf16=torch.cuda.is_bf16_supported(including_emulation=False),
)

In [48]:
# Assemble the trainer from the adapter-wrapped model, the tokenizer, the
# conversational dataset and the training configuration.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

In [49]:
# Pull a single batch from the trainer's own training dataloader so the data
# that is actually fed to the model can be inspected.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [50]:
# Show the input ids and the labels of the first sequence in the batch side by side.
batch["input_ids"][0], batch["labels"][0]

(tensor([ 3974, 29892,  4337,   278,   325,   271, 29892,   366,  1818, 29889,
         32007, 32000, 32010,   450,   289,   935,   310,   278,   282,   457,
          5447,   471,   528,  4901,   322,  6501, 29889, 32007, 32001, 26399,
          1758,  4317, 29889,  1383,  4901,   322,  6501, 29892,   278,   289,
           935,   310,   278,   282,   457,  5447,   471, 29889, 32007, 32000,
         32010,   951,  5989,  2507, 17354,   322, 13328,   297,   278,  6416,
         29889, 32007, 32001,   512], device='cuda:0'),
 tensor([ 3974, 29892,  4337,   278,   325,   271, 29892,   366,  1818, 29889,
         32007, 32000, 32010,   450,   289,   935,   310,   278,   282,   457,
          5447,   471,   528,  4901,   322,  6501, 29889, 32007, 32001, 26399,
          1758,  4317, 29889,  1383,  4901,   322,  6501, 29892,   278,   289,
           935,   310,   278,   282,   457,  5447,   471, 29889, 32007, 32000,
         32010,   951,  5989,  2507, 17354,   322, 13328,   297,   278,  64

In [51]:
# Run the fine-tuning loop; the returned training summary is displayed as the cell output.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss
10,2.8508
20,1.8263
30,1.6059
40,1.5232
50,1.3992
60,1.2963
70,1.1928
80,0.9938
90,0.9
100,0.6327


TrainOutput(global_step=220, training_loss=0.8357803572307934, metrics={'train_runtime': 2655.7181, 'train_samples_per_second': 1.284, 'train_steps_per_second': 0.083, 'total_flos': 4890970340720640.0, 'train_loss': 0.8357803572307934, 'epoch': 10.0})

In [52]:
def gen_prompt(tokenizer, sentence):
    """Build a generation-ready prompt from a single user sentence.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        sentence: Text placed in the user turn of the conversation.

    Returns:
        Whatever ``apply_chat_template`` returns when asked for untokenized text
        with the generation prompt appended.
    """
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": sentence}],
        add_generation_prompt=True,
        tokenize=False,
    )


In [53]:
# Build a prompt for a sample sentence and print it to check the chat markup.
sentence = 'The Force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(prompt)

<|user|>
The Force is strong in you!<|end|>
<|assistant|>



In [57]:
from contextlib import nullcontext
def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    """Generate a completion for an already-formatted prompt.

    Args:
        model: Model exposing ``device``, ``dtype``, ``eval()`` and ``generate()``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``batch_decode``.
        prompt: Prompt text; it is tokenized without adding special tokens.
        max_new_tokens: Upper bound on the number of generated tokens.
        skip_special_tokens: Forwarded to ``batch_decode``.

    Returns:
        The first decoded sequence returned by ``batch_decode``.
    """
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
    encoded = encoded.to(model.device)

    # Note: this leaves the model in evaluation mode.
    model.eval()

    # A half-precision model (float16 / bfloat16) generates under autocast;
    # any other dtype runs without an autocast context.
    half_precision = (torch.float16, torch.bfloat16)
    if model.dtype in half_precision:
        autocast_ctx = torch.autocast(device_type=model.device.type, dtype=model.dtype)
    else:
        autocast_ctx = nullcontext()

    with autocast_ctx:
        sequences = model.generate(
            **encoded,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
        )

    decoded = tokenizer.batch_decode(sequences, skip_special_tokens=skip_special_tokens)
    return decoded[0]

In [58]:
# Generate a completion for the sample prompt with the fine-tuned model and print
# it with special tokens left in (skip_special_tokens defaults to False).
print(generate(model, tokenizer, prompt))

<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the Force is! Yes, hrrrm.<|end|><|endoftext|>


In [59]:
# Save the trained model to a local directory.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the adapter
# weights rather than the full base model — confirm.
trainer.save_model('local-phi3-mini-yoda-adapter')

In [60]:
# Authenticate with the Hugging Face Hub. login() prompts interactively (a widget
# in the saved output), so no token is hardcoded in the notebook.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [62]:
# Upload the training artifacts to the Hub; this requires the login done beforehand.
trainer.push_to_hub()

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...a-adapter/tokenizer.model: 100%|##########|  500kB /  500kB            

  ...adapter_model.safetensors:   1%|1         |  560kB / 50.4MB            

  ...adapter/training_args.bin:   1%|1         |  68.0B / 6.16kB            

CommitInfo(commit_url='https://huggingface.co/nikhilviky/phi3-mini-yoda-adapter/commit/38a1ed8c934eaf8c4e7b9f77f6c86adcc3a5263b', commit_message='End of training', commit_description='', oid='38a1ed8c934eaf8c4e7b9f77f6c86adcc3a5263b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nikhilviky/phi3-mini-yoda-adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='nikhilviky/phi3-mini-yoda-adapter'), pr_revision=None, pr_num=None)