# Fine-tuning TinyLlama 1.1B with the Unsloth library on an RTX 3060 GPU

This notebook implements optimized training of TinyLlama 1.1B on hardware with limited VRAM (a 6 GB RTX 3060), using the Unsloth library together with the Hugging Face Accelerate library.

In [None]:
import torch
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel
import os
import psutil
import gc

# Turn off parallelism inside the tokenizers library via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Start from a clean slate: run the garbage collector, then release any
# cached CUDA memory when a GPU is present.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Hardware report: print what accelerator, CPU, and RAM this session can use.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # All per-device queries below target device index 0.
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device properties: {torch.cuda.get_device_properties(0)}")
    # Bytes are divided by 1024**3 before being printed with a "GB" label.
    print(f"Current GPU memory usage: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

# Host-side resources: logical CPU count and currently available system RAM.
print(f"CPU count: {os.cpu_count()}")
print(f"Available memory: {psutil.virtual_memory().available / (1024 * 1024 * 1024):.2f} GB")

In [None]:
# --- Model -------------------------------------------------------------
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # checkpoint to fine-tune
MAX_SEQ_LENGTH = 512  # token limit used for truncation and model loading

# --- Data subset -------------------------------------------------------
MAX_TRAIN_SAMPLES = 10000  # upper bound on training examples
MAX_VAL_SAMPLES = 500  # upper bound on validation examples

# --- Batching ----------------------------------------------------------
# Per-device batch size times accumulation steps: 2 x 8 = 16 sequences
# contribute to each optimizer step on a single device.
BATCH_SIZE = 2
GRADIENT_ACCUMULATION = 8

In [None]:
# Load the dataset eagerly (streaming is explicitly off): the subset
# selection further down relies on len() and .select() on the train split.
print("Loading dataset...")
ds = load_dataset(path="KodCode/KodCode-V1", streaming=False)
print(f"Dataset: {ds}")

def format_instruction(example):
    """Convert one dataset record into instruction/input/output form.

    Args:
        example: Mapping that provides the "question" and "solution" fields.

    Returns:
        A dict with "instruction" (the question), "input" (always an empty
        string) and "output" (the solution), in that key order.
    """
    record = dict(
        instruction=example["question"],
        input="",
        output=example["solution"],
    )
    return record

# Shuffle and carve out the train/validation subsets *before* formatting, so
# format_instruction only runs on the rows that are actually kept rather than
# on the entire train split. The shuffle uses the same seed on the same
# number of rows, so the selected examples are unchanged.
shuffled_ds = ds["train"].shuffle(seed=42)

# Clamp both boundaries to the dataset size; the validation slice starts
# right after the last training row so the two subsets never overlap.
n_train = min(MAX_TRAIN_SAMPLES, len(shuffled_ds))
n_val_end = min(n_train + MAX_VAL_SAMPLES, len(shuffled_ds))

train_ds = shuffled_ds.select(range(n_train)).map(format_instruction)
val_ds = shuffled_ds.select(range(n_train, n_val_end)).map(format_instruction)

print(f"Training examples: {len(train_ds)}")
print(f"Validation examples: {len(val_ds)}")

In [None]:
# Tokenizer for the base checkpoint; padding goes on the right and reuses
# the end-of-sequence token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Batch tokenizer: padding is deliberately left out here and deferred to
# collation time.
def tokenize_function(examples):
    """Render a batch of examples into prompt text and tokenize it.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            sequences.

    Returns:
        The tokenizer output for the batch, truncated to MAX_SEQ_LENGTH and
        unpadded, with an extra "labels" entry that mirrors "input_ids"
        (prompt tokens are not masked out).
    """
    # NOTE(review): the template spells out <s> and </s> while the tokenizer
    # is called with its default special-token handling -- confirm this does
    # not produce a duplicated BOS token.
    prompts = []
    for instruction, answer in zip(examples["instruction"], examples["output"]):
        prompts.append(f"<s>[INST] {instruction} [/INST] {answer}</s>")

    encoded = tokenizer(
        prompts,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding=False,
        return_tensors=None,
    )

    # Shallow copy of the per-example id lists, exactly as many entries as
    # there are prompts.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

In [None]:
# Tokenize both subsets 100 examples at a time. The original columns are
# dropped so only the tokenizer output (plus labels) remains.
print("Tokenizing training dataset...")
tokenized_train_ds = train_ds.map(
    tokenize_function,
    remove_columns=train_ds.column_names,
    batch_size=100,
    batched=True,
)

print("Tokenizing validation dataset...")
tokenized_val_ds = val_ds.map(
    tokenize_function,
    remove_columns=val_ds.column_names,
    batch_size=100,
    batched=True,
)

In [None]:
# Collator that pads at batch-assembly time (the tokenization step emits
# unpadded sequences) and hands back PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    return_tensors="pt",
    padding=True,
)

In [None]:
# Load the base checkpoint through Unsloth in 4-bit with fp16 compute.
# NOTE(review): the second return value (presumably Unsloth's tokenizer) is
# discarded; the notebook uses the separately created AutoTokenizer instead.
# NOTE(review): attn_implementation="flash_attention_2" presumably needs the
# flash-attn package to be installed -- confirm on the target machine.
print("Loading model...")
model, _ = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Attach rank-8 LoRA adapters to the attention projections (q/k/v/o) and the
# MLP projections (gate/up/down); no bias terms are trained.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Report how many parameters will actually receive gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {total_params:,}")
print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%")

In [None]:
# Configure training with optimized settings
import inspect

# Newer transformers releases name this option `eval_strategy`; the older
# `evaluation_strategy` spelling was deprecated and is rejected by recent
# versions. Pick whichever keyword the installed TrainingArguments accepts so
# the cell runs on both old and new releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./kodcode_llama_model",
    num_train_epochs=1,
    # Batching: BATCH_SIZE sequences per device, accumulated over
    # GRADIENT_ACCUMULATION micro-batches per optimizer step.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    # Evaluate and checkpoint on the same 500-step cadence; keep at most two
    # checkpoints on disk.
    eval_steps=500,
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # Optimizer and schedule.
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    optim="adamw_torch",
    max_grad_norm=1.0,
    # Memory / precision: gradient checkpointing with fp16 mixed precision
    # (bf16 explicitly off, matching the float16 dtype the model is loaded in).
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    torch_compile=False,
    # Data loading.
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    group_by_length=True,
    remove_unused_columns=True,
    # Misc.
    report_to="tensorboard",
    ddp_find_unused_parameters=False,
    disable_tqdm=False,
    **{_eval_strategy_key: "steps"},
)

In [None]:
# Wire the model, arguments, tokenized datasets, and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
)

# Baseline GPU memory snapshot (device 0) taken right before training starts.
if torch.cuda.is_available():
    print(f"GPU memory allocated before training: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"GPU memory reserved before training: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

# Run the fine-tuning loop.
print("Starting training...")
trainer.train()

# Persist the result to its own directory, separate from the checkpoint
# directory configured in the training arguments.
model_save_path = "./kodcode_llama_model_final"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
# Final memory usage report
# Prints allocated and reserved memory for CUDA device 0, converted from bytes
# by dividing by 1024**3; skipped entirely when no CUDA device is available.
if torch.cuda.is_available():
    print(f"Final GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"Final GPU memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

print("Training complete!")

## Model Testing

After training, you can test the model with this code:

In [None]:
# Load the trained model for inference
# This cell reuses `torch` and `tokenizer` from the cells above, so it must be
# run in the same session as the training cells.
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

peft_model_path = "./kodcode_llama_model_final"
config = PeftConfig.from_pretrained(peft_model_path)

# Load the base model in 8-bit. Quantization is requested through a
# BitsAndBytesConfig object; passing `load_in_8bit=True` directly to
# from_pretrained is deprecated.
# NOTE(review): the base model name is read from the saved adapter config --
# confirm it points at a checkpoint that can be loaded in 8-bit.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    torch_dtype=torch.float16,
)

# Load the LoRA adapter on top of the base model and switch to eval mode so
# dropout layers are inactive during generation.
model = PeftModel.from_pretrained(base_model, peft_model_path)
model.eval()

# Test the model. The prompt follows the same "<s>[INST] ... [/INST]" layout
# that the training examples were rendered with.
prompt = "<s>[INST] Write a Python function to calculate the factorial of a number [/INST]"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
)

# Special tokens are kept in the decoded text so the prompt markers and the
# end-of-sequence token stay visible.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))