## Fine-tuning Qwen 2.5 0.5B Instruct on KonCode Dataset for improving Coding Reasoning

In [1]:
# Fine-tuning Qwen2.5-0.5B-Instruct on KodCode Dataset for Coding Reasoning
# This cell sets up the environment for fine-tuning the Qwen2.5-0.5B-Instruct model on the KodCode dataset
!pip install -q datasets transformers evaluate accelerate torch bitsandbytes peft

# pytorch - a deep learning framework that provides tensor computations with GPU acceleration
# datasets - Offers efficient data loading, processing, and caching, Provides tools for working with ML datasets
# bitsandbytes - Enables memory-efficient training of large models & Reduces memory usage by quantizing model weights
# peft (Parameter-Efficient Fine-Tuning) - Implements efficient fine-tuning methods like LoRA
# transformers - Contains pre-trained language models and tools to work with them 

# Set up the environment and check CUDA availability
import torch
import os
import random 
from datasets import load_dataset
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_from_disk

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCUDA available: True
CUDA device: Tesla P100-PCIE-16GB
CUDA version: 12.1
GPU memory: 17.06 GB


This cell prepares the environment for fine-tuning Qwen2.5-0.5B-Instruct on the KodCode dataset. It installs required libraries, imports essential modules, and checks GPU availability. The setup enables QLoRA fine-tuning with 4-bit quantization, allowing the model to learn coding skills while minimizing memory usage on Colab's T4 GPU.

In [2]:
def load_and_examine_dataset(dataset_name, split="train"):
    """
    Load a dataset from Hugging Face and examine its format

    Args:
        dataset_name: Name of the dataset on Hugging Face Hub
        split: Dataset split to load (default: "train")
        
    Returns:
        The loaded dataset
    """
    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)
    
    # Print dataset info
    print(f"Dataset: {dataset_name}")
    print(f"Number of examples: {len(dataset)}")
    print(f"Dataset features: {dataset.features}")
    
    # Print a sample example
    print("\nSample example:")
    sample = dataset[0]
    for key, value in sample.items():
        if isinstance(value, str) and len(value) > 100:
            print(f"{key}: {value[:100]}... (truncated)")
        else:
            print(f"{key}: {value}")
    
    return dataset

dataset = load_and_examine_dataset("KodCode/KodCode-V1")

README.md:   0%|          | 0.00/8.67k [00:00<?, ?B/s]

train-00000-of-00015.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00001-of-00015.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

train-00002-of-00015.parquet:   0%|          | 0.00/154M [00:00<?, ?B/s]

train-00003-of-00015.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

train-00004-of-00015.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00005-of-00015.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

train-00006-of-00015.parquet:   0%|          | 0.00/88.7M [00:00<?, ?B/s]

train-00007-of-00015.parquet:   0%|          | 0.00/138M [00:00<?, ?B/s]

train-00008-of-00015.parquet:   0%|          | 0.00/181M [00:00<?, ?B/s]

train-00009-of-00015.parquet:   0%|          | 0.00/146M [00:00<?, ?B/s]

train-00010-of-00015.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

train-00011-of-00015.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

train-00012-of-00015.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00013-of-00015.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00014-of-00015.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

use_with_caution-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/484097 [00:00<?, ? examples/s]

Generating use_with_caution split:   0%|          | 0/3335 [00:00<?, ? examples/s]

Dataset: KodCode/KodCode-V1
Number of examples: 484097
Dataset features: {'version': Value(dtype='string', id=None), 'style': Value(dtype='string', id=None), 'subset': Value(dtype='string', id=None), 'question_id': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'solution': Value(dtype='string', id=None), 'test': Value(dtype='string', id=None), 'test_info': [{'docstring': Value(dtype='string', id=None), 'function_declaration': Value(dtype='string', id=None), 'function_name': Value(dtype='string', id=None), 'parameter_list': Value(dtype='string', id=None)}], 'gpt_pass_sequence': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'gpt_pass_trial_num': Value(dtype='int64', id=None), 'gpt_difficulty': Value(dtype='string', id=None), 'gpt_pass_percentage': Value(dtype='float64', id=None), 'trials': {'trial_gpt4o_0': {'file_source': Value(dtype='string', id=None), 'solution_code': Value(dtype='string', id=None), 'test_code': Value(dtype='string',

In [3]:
# Load the Qwen tokenizer & Load the dataset
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset("KodCode/KodCode-V1", split="train")

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

This code preprocesses the KodCode dataset for fine-tuning the Qwen2.5-0.5B-Instruct model. It formats each example as an instruction-response pair using the chat template format required by Qwen models. The process includes extracting coding problems and solutions, formatting them with special tokens, tokenizing the text, and splitting the dataset into training and validation sets. The code also saves samples of the processed data for inspection and stores the tokenized datasets to disk for efficient training.

In [4]:
# Function to format examples for instruction fine-tuning
def format_instruction(example):
    # Extract the question and solution
    question = example["question"]
    solution = example["solution"]
    
    # Format as instruction-response pair
    instruction = f"Write a function to solve the following coding problem:\n\n{question}"
    response = solution
    
    # Create the formatted prompt
    formatted_text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
    
    return {"text": formatted_text}

# Apply the formatting function to the dataset
formatted_dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)
# Split the dataset into train and validation sets (95% train, 5% validation)
formatted_dataset = formatted_dataset.train_test_split(test_size=0.05, seed=42)

# Function to tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
        padding="max_length",
        return_tensors="pt"
    )

# Tokenize the dataset
tokenized_dataset = {}
tokenized_dataset["train"] = formatted_dataset["train"].map(
    tokenize_function, 
    batched=True,
    remove_columns=["text"]
)
tokenized_dataset["validation"] = formatted_dataset["test"].map(
    tokenize_function, 
    batched=True,
    remove_columns=["text"]
)

# Save a sample of the processed dataset for inspection
sample_size = min(5, len(tokenized_dataset["train"]))
sample_indices = random.sample(range(len(tokenized_dataset["train"])), sample_size)
sample_data = [tokenized_dataset["train"][i] for i in sample_indices]

print(f"Processed {len(tokenized_dataset['train'])} training examples")
print(f"Processed {len(tokenized_dataset['validation'])} validation examples")
print(f"Sample of processed data (decoded):")
for i, sample in enumerate(sample_data):
    print(f"\nSample {i+1}:")
    decoded_text = tokenizer.decode(sample["input_ids"])
    print(decoded_text[:500] + "..." if len(decoded_text) > 500 else decoded_text)

# Save the processed dataset
tokenized_dataset["train"].save_to_disk("./processed_dataset_train")
tokenized_dataset["validation"].save_to_disk("./processed_dataset_validation")

Map:   0%|          | 0/484097 [00:00<?, ? examples/s]

Map:   0%|          | 0/459892 [00:00<?, ? examples/s]

Map:   0%|          | 0/24205 [00:00<?, ? examples/s]

Processed 459892 training examples
Processed 24205 validation examples
Sample of processed data (decoded):

Sample 1:
<|im_start|>user
Write a function to solve the following coding problem:

### Context
Bob is building a system that requires maintaining and retrieving user session data. He has a list of session intervals with start and end timestamps for various users and needs to determine the number of active sessions at any given time.

### Task
Write a function `find_maximum_active_sessions` that takes a list of session intervals and returns the maximum number of active sessions at any point in time. Each ...

Sample 2:
<|im_start|>user
Write a function to solve the following coding problem:

Given a list of `n` integers representing the heights of vertical lines on a histogram, where the width of each bar is 1, return _the area of the largest rectangle that can be formed within the histogram._ The rectangle must be formed using consecutive bars in the histogram.

### Format: Gene

Saving the dataset (0/10 shards):   0%|          | 0/459892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24205 [00:00<?, ? examples/s]

This code sets up the Qwen2.5-0.5B-Instruct model with 4-bit quantization and LoRA for efficient fine-tuning. It configures the quantization parameters, loads the model and tokenizer, applies LoRA with optimized parameters, and prepares the dataset for training. The code also reduces the dataset size to speed up the training process.

In [5]:
# Set up quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the model with quantization
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration - reduced rank for faster training
lora_config = LoraConfig(
    r=8,                      # Reduced rank for faster training
    lora_alpha=16,            # Adjusted alpha
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Reduced target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {model.print_trainable_parameters()}")

# Load the processed dataset
train_dataset = load_from_disk("./processed_dataset_train")
eval_dataset = load_from_disk("./processed_dataset_validation")

# Use a smaller subset for faster training
train_size = min(5000, len(train_dataset))
eval_size = min(500, len(eval_dataset))
train_dataset = train_dataset.select(range(train_size))
eval_dataset = eval_dataset.select(range(eval_size))

print(f"Training on {train_size} examples, evaluating on {eval_size} examples")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184
Trainable parameters: None
Training on 5000 examples, evaluating on 500 examples


This code sets up the training arguments and executes the fine-tuning process for the Qwen2.5-0.5B-Instruct model. It configures training parameters optimized for memory efficiency, trains the model using the Trainer API, and saves the fine-tuned model to disk.

In [6]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen_qlora_output",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    warmup_steps=100,
    optim="adamw_torch",
    report_to="tensorboard",
    gradient_checkpointing=True,
    remove_unused_columns=False,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
model.save_pretrained("./qwen_qlora_final")
tokenizer.save_pretrained("./qwen_qlora_final")

print("Training completed and model saved!")

  trainer = Trainer(


Step,Training Loss,Validation Loss


KeyboardInterrupt: 