In [3]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32
)

repo_id = 'microsoft/Phi-3-mini-4k-instruct'

model = AutoModelForCausalLM.from_pretrained(repo_id, device_map = 'cuda:0', quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
print(model.get_memory_footprint()/1e6)

2206.341312


In [6]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r = 8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj']
)

model = get_peft_model(model, config)

In [7]:
train_p, tot_p = model.get_nb_trainable_parameters()
print(f'Trainable parameters:      {train_p/1e6:.2f}M')
print(f'Total parameters:          {tot_p/1e6:.2f}M')
print(f'% of trainable parameters: {100*train_p/tot_p:.2f}%')


Trainable parameters:      12.58M
Total parameters:          3833.66M
% of trainable parameters: 0.33%


In [8]:
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset


Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [9]:
dataset[0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [10]:
dataset = dataset.rename_column("sentence", "prompt")
dataset = dataset.rename_column("translation_extra", "completion")
dataset = dataset.remove_columns(["translation"])
dataset


Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [11]:
dataset[0]

{'prompt': 'The birch canoe slid on the smooth planks.',
 'completion': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [12]:
def format_dataset(examples):
    if isinstance(examples['prompt'], list):
        output_text = []
        for i in range(len(examples['prompt'])):
            converted_sample = [
                {'role':'user', 'content': examples['prompt'][i]},
                {'role':'assistant', 'content': examples['completion'][i]}
            ]
            output_text.append(converted_sample)
            return {'message': output_text}
        
    else:
        converted_sample = [
                {'role':'user', 'content': examples['prompt']},
                {'role':'assistant', 'content': examples['completion']}
            ]
        
        return {'message':converted_sample}

In [13]:
dataset = dataset.map(format_dataset).remove_columns(['prompt', 'completion'])
dataset[0]

{'message': [{'content': 'The birch canoe slid on the smooth planks.',
   'role': 'user'},
  {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
   'role': 'assistant'}]}

In [19]:
def convert_messages_to_text(example):
    messages = example["message"]
    # turn it into a string
    dialogue = ""
    for msg in messages:
        dialogue += f"<|{msg['role']}|>: {msg['content']}\n"
    return {"text": dialogue}

formatted_dataset = dataset.map(convert_messages_to_text)


Map:   0%|          | 0/720 [00:00<?, ? examples/s]

In [20]:
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template


"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [21]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id



In [22]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False}, 
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,  
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16, 
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,

    ## GROUP 2: Dataset-related
    max_seq_length=64,
    # Dataset
    # packing a dataset means no padding is needed
    packing=True,

    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',       
    
    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./phi3-mini-yoda-adapter',
    report_to='none'
)


In [24]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=formatted_dataset,
)


Converting train dataset to ChatML:   0%|          | 0/720 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.9994
20,1.7953
30,1.5282
40,1.45
50,1.3895
60,1.2455
70,1.2112
80,0.984
90,0.9365
100,0.6949


TrainOutput(global_step=230, training_loss=0.8132173569306084, metrics={'train_runtime': 2890.7728, 'train_samples_per_second': 1.259, 'train_steps_per_second': 0.08, 'total_flos': 5220859835842560.0, 'train_loss': 0.8132173569306084})

In [26]:
def gen_prompt(tokenizer, sentence):
    converted_sample = [{"role": "user", "content": sentence}]
    prompt = tokenizer.apply_chat_template(
        converted_sample, tokenize=False, add_generation_prompt=True
    )
    return prompt


In [27]:
sentence = 'The Force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(prompt)


<|user|>
The Force is strong in you!<|end|>
<|assistant|>



In [28]:
def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    tokenized_input = tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    model.eval()
    gen_output = model.generate(**tokenized_input,
                                eos_token_id=tokenizer.eos_token_id,
                                max_new_tokens=max_new_tokens)
    
    output = tokenizer.batch_decode(gen_output, skip_special_tokens=skip_special_tokens)
    return output[0]


In [29]:
print(generate(model, tokenizer, prompt))


<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the force is.
<|endoftext|>


In [30]:
trainer.save_model('local-phi3-mini-yoda-adapter')
