In [None]:
import torch
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer
from unsloth import FastLanguageModel
import os
import psutil

# Report the hardware visible to this kernel before any heavy work starts.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Only device 0 is described; the device count is printed for reference.
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device properties: {torch.cuda.get_device_properties(0)}")

print(f"CPU count: {os.cpu_count()}")

# psutil reports bytes; convert to GiB for display.
free_bytes = psutil.virtual_memory().available
print(f"Available memory: {free_bytes / (1024 * 1024 * 1024):.2f} GB")

# Checkpoint identifier handed to both the tokenizer and the model loader.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Sequence length used for tokenizer padding/truncation and for model loading.
MAX_SEQ_LENGTH = 1024
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 2

In [None]:
# Load the dataset eagerly (no streaming). The following cells call
# `train_test_split` and `len()` on the "train" split, which require a
# map-style `Dataset`; a streamed split is an iterable dataset that supports
# neither, so the notebook would fail on a fresh Run All.
ds = load_dataset("KodCode/KodCode-V1")
print(f"Dataset: {ds}")

In [None]:
def format_instruction(example):
    """Map one raw record onto the instruction/input/output schema.

    Args:
        example: Mapping that provides the "question" and "solution" keys.

    Returns:
        A dict with "instruction" (the question), an empty "input", and
        "output" (the solution).

    Raises:
        KeyError: If "question" or "solution" is missing from ``example``.
    """
    return dict(
        instruction=example["question"],
        input="",
        output=example["solution"],
    )

In [None]:
# Reshape every training record into the instruction schema, then hold out
# 5% of it for validation (fixed seed so the split is reproducible).
formatted_ds = ds["train"].map(format_instruction)
split_ds = formatted_ds.train_test_split(test_size=0.05, seed=42)
train_ds, val_ds = split_ds["train"], split_ds["test"]

for split_label, split_data in (("Training", train_ds), ("Validation", val_ds)):
    print(f"{split_label} examples: {len(split_data)}")

In [None]:
# Tokenizer matching BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Pad on the right so real tokens stay at the start of each sequence.
tokenizer.padding_side = "right"
# Reuse EOS as the padding token (presumably the checkpoint ships without a
# dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of instruction/output pairs for causal-LM fine-tuning.

    Each pair is rendered into a single ``[INST] ... [/INST] ...`` prompt,
    padded/truncated to ``MAX_SEQ_LENGTH`` and returned together with labels.

    Padding positions are excluded from the loss: labels are set to -100
    wherever ``attention_mask`` is 0. This matters because the pad token is
    the EOS token here, so masking by token id would also hide the genuine
    end-of-sequence token; masking by attention mask keeps it.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            sequences (as produced by a batched ``map``).

    Returns:
        The tokenizer output with an added "labels" tensor of the same shape
        as "input_ids".
    """
    # NOTE(review): the template hard-codes "<s>"; if the tokenizer also
    # prepends its own BOS token, sequences start with two of them — confirm,
    # and pass add_special_tokens=False if so.
    formatted_texts = [
        f"<s>[INST] {inst} [/INST] {out}</s>"
        for inst, out in zip(examples["instruction"], examples["output"])
    ]

    tokenized_inputs = tokenizer(
        formatted_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_SEQ_LENGTH,
        return_tensors="pt",
    )

    labels = tokenized_inputs["input_ids"].clone()
    # -100 is the ignore index used by the loss, so padded positions do not
    # contribute to training or evaluation loss.
    labels[tokenized_inputs["attention_mask"] == 0] = -100
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits with one shared configuration (train first, then
# validation), dropping the original columns so only tokenizer outputs remain.
tokenized_train_ds, tokenized_val_ds = (
    split.map(
        tokenize_function,
        batched=True,
        batch_size=100,
        remove_columns=split.column_names,
    )
    for split in (train_ds, val_ds)
)

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Load the base checkpoint in 4-bit; the second return value is discarded
# because the tokenizer is built separately earlier in the notebook.
model, _ = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Wrap the loaded model with LoRA adapters (rank 8, alpha 16, dropout 0.05).
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0.05,
)

In [None]:
# Tally parameter counts in a single pass to verify the LoRA configuration.
trainable_params = 0
total_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {total_params:,}")
print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%")

In [None]:
# Configure training with optimized settings.
#
# Two keyword names differ between transformers releases: the evaluation
# schedule (`eval_strategy`, formerly `evaluation_strategy`) and the CPU switch
# (`use_cpu`, formerly `no_cuda`). The name is chosen from the fields the
# installed `TrainingArguments` dataclass actually declares, so the cell works
# with either spelling instead of failing on an unknown keyword.
cuda_available = torch.cuda.is_available()
_ta_fields = TrainingArguments.__dataclass_fields__
version_dependent_args = {
    ("eval_strategy" if "eval_strategy" in _ta_fields else "evaluation_strategy"): "steps",
    ("use_cpu" if "use_cpu" in _ta_fields else "no_cuda"): not cuda_available,
}

training_args = TrainingArguments(
    output_dir="./kodcode_llama_model",
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    eval_steps=500,
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    optim="adamw_torch",
    gradient_checkpointing=True,
    # fp16 mixed precision is only requested when CUDA is present, keeping it
    # consistent with the CPU switch above.
    fp16=cuda_available,
    bf16=False,
    report_to="tensorboard",
    # Additional settings for efficiency
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    torch_compile=False,
    **version_dependent_args,
)

In [None]:
# Bring the LoRA model, training arguments and tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
)

# Snapshot GPU memory on device 0 before training, reported in GiB.
if torch.cuda.is_available():
    for mem_label, mem_probe in (
        ("allocated", torch.cuda.memory_allocated),
        ("reserved", torch.cuda.memory_reserved),
    ):
        print(f"GPU memory {mem_label} before training: {mem_probe(0) / 1024**3:.2f} GB")

print("Starting training...")
trainer.train()

In [None]:
# Persist the trained model to its own directory, separate from the
# checkpoint output directory used during training.
model_save_path = "./kodcode_llama_model_final"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

# Snapshot GPU memory on device 0 after training, reported in GiB.
if torch.cuda.is_available():
    for mem_label, mem_probe in (
        ("allocated", torch.cuda.memory_allocated),
        ("reserved", torch.cuda.memory_reserved),
    ):
        print(f"Final GPU memory {mem_label}: {mem_probe(0) / 1024**3:.2f} GB")

print("Training complete!")