In [2]:
!pip install torch numpy pandas datasets transformers peft tqdm evaluate matplotlib scikit-learn bitsandbytes



In [3]:
# DeepSeek-Coder 1.3B (base) fine-tuning on the KodCode-V1-SFT-R1 dataset
# Setup and Dependencies

import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from tqdm import tqdm
import evaluate
import gc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb

In [4]:
# Check if GPU is available
# Prints whether CUDA is usable and, when it is, the name of device 0 together
# with its current allocated/reserved memory counters.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Drop cached-but-unused allocator blocks before reading the counters.
    torch.cuda.empty_cache()
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # Byte counts are divided by 1e9, i.e. shown in decimal gigabytes.
    print(f"Memory allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
    print(f"Memory reserved: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")

CUDA available: True
GPU: Tesla P100-PCIE-16GB
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB


In [5]:
# Load the dataset
# No `split` is requested, so every split of the repository is fetched and
# `dataset` is indexed by split name; only the 'train' split is read by the
# rest of this notebook.
print("Loading KodCode dataset...")
dataset = load_dataset("KodCode/KodCode-V1-SFT-R1")
print(f"Dataset loaded with {len(dataset['train'])} samples")

Loading KodCode dataset...


README.md:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

train-00000-of-00011.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

train-00001-of-00011.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00002-of-00011.parquet:   0%|          | 0.00/297M [00:00<?, ?B/s]

train-00003-of-00011.parquet:   0%|          | 0.00/138M [00:00<?, ?B/s]

train-00004-of-00011.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

train-00005-of-00011.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00006-of-00011.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

train-00007-of-00011.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

train-00008-of-00011.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00009-of-00011.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00010-of-00011.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

incorrect-00000-of-00011.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

incorrect-00001-of-00011.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

incorrect-00002-of-00011.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

incorrect-00003-of-00011.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

incorrect-00004-of-00011.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

incorrect-00005-of-00011.parquet:   0%|          | 0.00/182M [00:00<?, ?B/s]

incorrect-00006-of-00011.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

incorrect-00007-of-00011.parquet:   0%|          | 0.00/122M [00:00<?, ?B/s]

incorrect-00008-of-00011.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

incorrect-00009-of-00011.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

incorrect-00010-of-00011.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

use_with_caution-00000-of-00001.parquet:   0%|          | 0.00/39.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/268211 [00:00<?, ? examples/s]

Generating incorrect split:   0%|          | 0/210787 [00:00<?, ? examples/s]

Generating use_with_caution split:   0%|          | 0/4439 [00:00<?, ? examples/s]

Dataset loaded with 268211 samples


In [6]:
# Data Exploration
# Display a sample question
sample_idx = 0
print(f"Sample question: {dataset['train'][sample_idx]['question']}")
print(f"Sample solution: {dataset['train'][sample_idx]['solution']}")

# Check dataset structure
print("\nDataset structure:")
print(dataset['train'].column_names)

# Display distribution of question types
# Only the 'subset' column is materialised in pandas: converting the whole split
# (hundreds of thousands of rows, including long solution/conversation columns)
# just to count one categorical column wastes several GB of RAM.
subset_counts = (
    dataset['train']
    .select_columns(['subset'])
    .to_pandas()['subset']
    .value_counts()
)
print("\nDistribution of question types:")
print(subset_counts)

Sample question: Given a list of integers `nums`, find the maximum product of any two distinct elements in the list. Return the maximum product. For example, given `nums = [5, 3, -1, 9, -7]`, the maximum product would be `5 * 9 = 45`.
Sample solution: def max_product(nums):
    """
    Returns the maximum product of any two distinct elements in the list.
    """
    # Sort the list
    nums.sort()
    
    # The maximum product could be from the two highest values or two lowest values (in case they are negative)
    return max(nums[-1] * nums[-2], nums[0] * nums[1])

Dataset structure:
['version', 'style', 'subset', 'question_id', 'question', 'solution', 'test', 'test_info', 'gpt_pass_sequence', 'gpt_pass_trial_num', 'gpt_difficulty', 'gpt_pass_percentage', 'r1_pass_sequence', 'r1_pass_trial_num', 'r1_correctness', 'r1_solution', 'metadata', 'conversations']

Distribution of question types:
subset
Taco              50917
Prefill           33435
Filter            29359
Leetcode         

In [7]:
# Display difficulty distribution
# Project down to the single column being counted before converting to pandas,
# so the full split is never copied into a DataFrame.
difficulty_counts = (
    dataset['train']
    .select_columns(['gpt_difficulty'])
    .to_pandas()['gpt_difficulty']
    .value_counts()
)
print("\nDistribution of difficulty levels:")
print(difficulty_counts)


Distribution of difficulty levels:
gpt_difficulty
easy      159623
medium     61856
hard       46732
Name: count, dtype: int64


In [8]:
# Data Preprocessing
def format_data_for_training(example):
    """Add a ``formatted_text`` field holding the full training string.

    The string is the question wrapped in a ``### Question:`` / ``### Solution:``
    template, immediately followed by the reference solution.

    Args:
        example: One dataset record; its ``'question'`` and ``'solution'``
            entries are read.

    Returns:
        The same record object, updated in place with ``'formatted_text'``.
    """
    question_text = example['question']
    solution_text = example['solution']

    # Prompt header and completion are stored as one string; no separator is
    # inserted between the header's trailing newline and the solution.
    example['formatted_text'] = (
        f"### Question:\n{question_text}\n\n### Solution:\n" + solution_text
    )
    return example

# Apply the prompt template to every record of the 'train' split; the result
# carries an extra 'formatted_text' column.
print("Formatting data for training...")
formatted_dataset = dataset['train'].map(format_data_for_training)

# Split the dataset into training and validation sets
# 10% of the records are held out; the fixed seed makes the split repeatable.
train_val_data = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_data = train_val_data['train']
val_data = train_val_data['test']  # the held-out part is returned under the key 'test'

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

Formatting data for training...


Map:   0%|          | 0/268211 [00:00<?, ? examples/s]

Training set size: 241389
Validation set size: 26822


In [9]:
!pip install -U bitsandbytes



In [10]:
# Model Setup
# Load base model and tokenizer
# NOTE: the checkpoint is DeepSeek-Coder 1.3B (base), not a DeepSeek-R1 model.
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

print(f"Loading model and tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# Setup 4-bit quantization for memory efficiency
print("Setting up 4-bit quantization...")
import bitsandbytes as bnb
# The quantization settings are passed as a plain dict: 4-bit NF4 weights,
# double quantization, fp16 compute.
# NOTE(review): transformers documents `BitsAndBytesConfig` for this argument —
# confirm the installed version accepts the dict form.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
)

# Configure LoRA adaptation
# Rank-8 adapters (alpha 32, dropout 0.1) on the query/key/value projections only.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"]
)

# Prepare model for training
# Order matters: the quantized base model is prepared first, then wrapped with
# the LoRA adapters; `model` is rebound at each step.
# NOTE(review): the "use_cache=True is incompatible with gradient checkpointing"
# warning in the training output suggests the preparation step turns gradient
# checkpointing on — confirm.
print("Preparing model for training...")
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

Loading model and tokenizer: deepseek-ai/deepseek-coder-1.3b-base


tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Setting up 4-bit quantization...


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Preparing model for training...


In [11]:
# Tokenize the dataset
def tokenize_function(examples, max_length=2048):
    """Tokenize a batch of formatted training strings.

    Sequences are truncated to ``max_length`` tokens but deliberately NOT padded
    here. Padding every example to the maximum length stores and processes
    ``max_length`` tokens per record no matter how short the record is; padding
    is instead left to the data collator used by the trainer, which pads each
    batch only up to its own longest sequence.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``; only the
            ``"formatted_text"`` column is read.
        max_length: Maximum number of tokens kept per example. Defaults to 2048,
            the value that was previously hard-coded.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(
        examples["formatted_text"],
        truncation=True,
        max_length=max_length,
    )

# Tokenize both splits in batches. All original columns are dropped so that
# only the tokenizer's outputs remain in the resulting datasets.
print("Tokenizing datasets...")
tokenized_train = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=train_data.column_names
)

tokenized_val = val_data.map(
    tokenize_function,
    batched=True,
    remove_columns=val_data.column_names
)

Tokenizing datasets...


Map:   0%|          | 0/241389 [00:00<?, ? examples/s]

Map:   0%|          | 0/26822 [00:00<?, ? examples/s]

In [None]:
# Training Configuration
# Effective batch per device is 2 sequences x 8 accumulation steps = 16 per
# optimizer step.
# NOTE(review): the validation split has 26,822 rows; running a full evaluation
# every 100 steps at batch size 2 may dominate wall-clock time — consider
# evaluating on a subset or less often.
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,  # same cadence as eval_steps, so every evaluated step has a checkpoint
    warmup_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    fp16=True,
    report_to="none",  # no external experiment tracker
)

# Data collator
# mlm=False selects the causal (next-token) language-modelling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Trainer
# Early stopping uses a patience of 3.
# NOTE(review): presumably counted in evaluation rounds on eval_loss — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Training
print("Starting training...")
trainer.train()



Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


In [None]:
# Save the model
# Writes the trainer's model and the tokenizer into the same directory.
# NOTE(review): the model is LoRA-wrapped, so this presumably stores only the
# adapter weights rather than a merged checkpoint — confirm before loading it
# elsewhere. The directory name says "r1" although the base checkpoint is
# deepseek-coder-1.3b-base.
print("Saving model...")
trainer.model.save_pretrained("./deepseek-r1-kodcode-finetuned")
tokenizer.save_pretrained("./deepseek-r1-kodcode-finetuned")

In [None]:
# Evaluation
# Define evaluation function
def evaluate_coding_task(model, tokenizer, question, max_length=2048):
    """Generate a solution for one coding question and return only the new text.

    The question is wrapped in the same ``### Question:`` / ``### Solution:``
    template used for training, the model continues the prompt, and only the
    tokens produced *after* the prompt are decoded.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            decode the continuation.
        question: Problem statement to insert into the prompt.
        max_length: Limit passed to ``generate``; it counts prompt tokens plus
            generated tokens.

    Returns:
        The generated solution with surrounding whitespace stripped. If the
        model goes on to emit another ``### Solution:`` header, the text is cut
        just before it.
    """
    solution_marker = "### Solution:\n"
    prompt = f"### Question:\n{question}\n\n### Solution:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Generate in eval mode so dropout (including LoRA dropout) is disabled,
    # then put the model back into whatever mode the caller had it in.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        if was_training:
            model.train()

    # Decode only the continuation. Splitting the full decoded text on the
    # marker and taking element [1] breaks when the question itself contains
    # the marker, and raises IndexError if the marker is not reproduced exactly.
    completion_ids = outputs[0][prompt_len:]
    generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
    solution = generated_text.split(solution_marker)[0].strip()

    return solution

# Evaluate the model on a sample of held-out questions.
# The questions are taken from the validation split rather than from
# dataset['train']: most of dataset['train'] was used for gradient updates, so
# comparing models on it would measure memorisation, not generalisation.
print("Evaluating model on sample test questions...")
test_questions = val_data.select(range(5))['question']  # Just using 5 samples for demonstration

In [None]:
# Load the original model for comparison
# The baseline is loaded with exactly the same 4-bit settings as the fine-tuned
# model (NF4, double quantization, fp16 compute) so that differences in the
# generated solutions come from fine-tuning and not from a different
# quantization scheme. This also avoids the bare `load_in_4bit=True` keyword.
print("Loading original model for comparison...")
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
)

# Compare solutions
# For each question, print the baseline answer followed by the fine-tuned answer.
print("Comparing solutions between original and fine-tuned models...")
for i, question in enumerate(test_questions):
    print(f"\nQuestion {i+1}:")
    print(question)

    print("\nOriginal model solution:")
    original_solution = evaluate_coding_task(original_model, tokenizer, question)
    print(original_solution)

    print("\nFine-tuned model solution:")
    finetuned_solution = evaluate_coding_task(model, tokenizer, question)
    print(finetuned_solution)

In [None]:
# Cleanup
# Drop the notebook-level names for both models, run the garbage collector and
# release cached GPU memory.
# NOTE(review): `trainer` was constructed with `model=model` and still exists at
# this point, so the fine-tuned model is presumably kept alive by that
# reference — confirm, and delete `trainer` too if the memory must be freed.
del original_model
del model
gc.collect()
torch.cuda.empty_cache()

print("Evaluation complete!")