In [1]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
import torch
from transformers import TrainingArguments
from datasets import DatasetDict
from rich import print

In [2]:
max_seq_length = 4096  # Supports RoPE Scaling interally, so choose any!
# get instruction dataset from ../data/instructions
dataset = DatasetDict.load_from_disk('../data/instructions')
dataset_train = dataset["train"]
dataset_test = dataset["test"]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="vilm/vinallama-7b",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing=True,
    random_state=3407,
    max_seq_length=max_seq_length,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=False,  # And LoftQ
)



==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3070 Ti. Max memory: 8.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth: vilm/vinallama-7b has no tokenizer.model file.
Just informing you about this - this is not a critical error.


vilm/vinallama-7b does not have a padding token! Will use pad_token = <unk>.
Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
def format_prompt_func(example):
    if isinstance(example['instruction'], str):
        return f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
    
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Instruction:\n{example['instruction'][i]}\n\n### Response:\n{example['response'][i]}"
        output_texts.append(text)
    return output_texts


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    formatting_func=format_prompt_func,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=TrainingArguments(
        num_train_epochs=2,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=0.1,
        eval_steps=0.1,
        save_steps=0.1,
        output_dir="../models/vinallama-7b",
        optim="adamw_8bit",
        seed=3407,
    ),
)
trainer.train()

# Go to https://github.com/unslothai/unsloth/wiki for advanced tips like
# (1) Saving to GGUF / merging to 16bit for vLLM
# (2) Continued training from a saved LoRA adapter
# (3) Adding an evaluation loop / OOMs
# (4) Cutomized chat templates

Map:   0%|          | 0/1643 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,571 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 3,284
 "-____-"     Number of trainable parameters = 39,976,960


Step,Training Loss
329,1.0498
658,0.756
987,0.6327
1316,0.5362
1645,0.4967
1974,0.3863
2303,0.3465
2632,0.3317
2961,0.3113




TrainOutput(global_step=3284, training_loss=0.5143165611610924, metrics={'train_runtime': 4747.1961, 'train_samples_per_second': 2.768, 'train_steps_per_second': 0.692, 'total_flos': 4.89315585663959e+16, 'train_loss': 0.5143165611610924, 'epoch': 1.9990868969715416})