# Qwen2.5-Coder Fine-tuning on Kaggle P100 GPU

Complete end-to-end fine-tuning pipeline for Qwen2.5-Coder-3B to generate Python unit tests.

**Dataset**: https://www.kaggle.com/datasets/ujwalsr/finetuning
**GPU**: Kaggle P100 (16GB VRAM)
**Method**: LoRA fine-tuning with 4-bit quantization

---

## 1. Import Required Libraries

Installing and importing all necessary dependencies for training.

In [3]:
# Install required packages
!pip install -q transformers>=4.36.0
!pip install -q peft>=0.7.0
!pip install -q bitsandbytes>=0.41.0
!pip install -q accelerate>=0.24.0
!pip install -q datasets
!pip install -q tqdm

print("✅ Packages installed successfully!")

✅ Packages installed successfully!


In [1]:
# Import essential libraries
import os
import sys
import pickle
import torch
import torch.nn as nn
import gc
import time
import json
from pathlib import Path
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import numpy as np
import pandas as pd

# Transformers and PEFT
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    TrainingArguments, Trainer,
    BitsAndBytesConfig, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Check GPU availability
print(f"🖥️  CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name()}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
    
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

2025-08-21 18:16:36.061967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755800196.084473     201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755800196.091188     201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🖥️  CUDA available: True
   GPU: Tesla P100-PCIE-16GB
   GPU Memory: 15.9GB


## 2. Load and Prepare Dataset

Loading the three pickle files from the Kaggle dataset and examining the data structure.

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Dataset paths (Kaggle input directory)
dataset_dir = "/kaggle/input/finetuning"

train_pkl_path = f"{dataset_dir}/train_split.pkl"
val_pkl_path = f"{dataset_dir}/val_split.pkl"
test_pkl_path = f"{dataset_dir}/test_split.pkl"

# Verify files exist
print(f"\n🔍 Checking for dataset files in: {dataset_dir}")
for path in [train_pkl_path, val_pkl_path, test_pkl_path]:
    if os.path.exists(path):
        file_size = os.path.getsize(path) / 1024 / 1024  # Size in MB
        print(f"✅ Found: {path} ({file_size:.1f} MB)")
    else:
        print(f"❌ Missing: {path}")
        
# List all files in dataset directory
if os.path.exists(dataset_dir):
    print(f"\n📁 Dataset directory contents:")
    for file in os.listdir(dataset_dir):
        file_path = os.path.join(dataset_dir, file)
        if os.path.isfile(file_path):
            file_size = os.path.getsize(file_path) / 1024 / 1024
            print(f"   - {file} ({file_size:.1f} MB)")
        else:
            print(f"   - {file}/ (directory)")
else:
    print(f"❌ Dataset directory not found: {dataset_dir}")
    print("📋 Available input directories:")
    for item in os.listdir("/kaggle/input"):
        print(f"   - /kaggle/input/{item}")

/kaggle/input/finetuning/train_split.pkl
/kaggle/input/finetuning/val_split.pkl
/kaggle/input/finetuning/test_split.pkl

🔍 Checking for dataset files in: /kaggle/input/finetuning
✅ Found: /kaggle/input/finetuning/train_split.pkl (2198.0 MB)
✅ Found: /kaggle/input/finetuning/val_split.pkl (274.8 MB)
✅ Found: /kaggle/input/finetuning/test_split.pkl (273.7 MB)

📁 Dataset directory contents:
   - train_split.pkl (2198.0 MB)
   - val_split.pkl (274.8 MB)
   - test_split.pkl (273.7 MB)


In [3]:
# Load pickle datasets
print("📂 Loading pickle datasets...")

with open(train_pkl_path, 'rb') as f:
    train_data = pickle.load(f)
    
with open(val_pkl_path, 'rb') as f:
    val_data = pickle.load(f)
    
with open(test_pkl_path, 'rb') as f:
    test_data = pickle.load(f)

print(f"✅ Datasets loaded successfully:")
print(f"   - Training: {len(train_data):,} samples")
print(f"   - Validation: {len(val_data):,} samples")
print(f"   - Test: {len(test_data):,} samples")
print(f"   - Total: {len(train_data) + len(val_data) + len(test_data):,} samples")

📂 Loading pickle datasets...
✅ Datasets loaded successfully:
   - Training: 14,049 samples
   - Validation: 1,756 samples
   - Test: 1,757 samples
   - Total: 17,562 samples


In [4]:
# Examine data structure
print("🔍 Examining data structure...")

if len(train_data) > 0:
    sample = train_data[0]
    print(f"\n📋 Sample keys: {list(sample.keys())}")
    
    # Check if data is pre-tokenized or raw text
    if 'input_ids' in sample:
        print("✅ Data is pre-tokenized")
        print(f"   - Input IDs length: {len(sample['input_ids'])}")
        print(f"   - Has attention mask: {'attention_mask' in sample}")
        print(f"   - Has labels: {'labels' in sample}")
    else:
        print("📝 Data contains raw text")
        for key in sample.keys():
            if isinstance(sample[key], str):
                print(f"   - {key}: {len(sample[key])} characters")
                print(f"     Preview: {sample[key][:100]}...")
            else:
                print(f"   - {key}: {type(sample[key])}")

🔍 Examining data structure...

📋 Sample keys: ['task_id', 'question', 'code_ground_truth', 'code_generate', 'unit_tests']
📝 Data contains raw text
   - task_id: <class 'int'>
   - question: 1355 characters
     Preview: As AtCoder Beginner Contest 100 is taking place, the office of AtCoder, Inc. is decorated with a seq...
   - code_ground_truth: 147 characters
     Preview: def max_operations_on_sequence(N, a):
    ans = 0
    for i in a:
        while i % 2 == 0:
        ...
   - code_generate: 10335 characters
     Preview: [{"sol_id": 0, "code": "def max_operations_on_sequence(N, a):\n    \"\"\"\n    Calculate the maximum...
   - unit_tests: 133288 characters
     Preview: [{"ut_id": 0, "code": "import unittest\n\nclass TestMaxOperationsOnSequence(unittest.TestCase):\n\n ...


## 3. Data Preprocessing and Feature Engineering

Setting up the PyTorch Dataset class and data preprocessing pipeline.

In [55]:
# Training configuration for Kaggle P100
@dataclass
class KaggleTrainingConfig:
    """Optimized configuration for Kaggle P100 GPU (16GB VRAM)"""
    
    # Model configuration
    model_name: str = "Qwen/Qwen2.5-Coder-3B-Instruct"
    max_length: int = 512  # P100 can handle longer sequences
    
    # Training parameters - P100 optimized
    train_batch_size: int = 1
    eval_batch_size: int = 2
    gradient_accumulation_steps: int = 16  # Effective batch size: 16
    num_epochs: int = 3
    learning_rate: float = 2e-4
    weight_decay: float = 0.001
    warmup_ratio: float = 0.03
    
    # LoRA configuration
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    target_modules: List[str] = None
    
    # Output configuration
    output_dir: str = "/kaggle/working/qwen-coder-finetune"
    run_name: str = "qwen-coder-unittest-kaggle"
    logging_steps: int = 10
    save_steps: int = 500
    eval_steps: int = 500
    
    # Hardware optimization for P100
    use_cuda: bool = True
    mixed_precision: bool = True
    gradient_checkpointing: bool = True  # Enable for P100
    
    def __post_init__(self):
        if self.target_modules is None:
            self.target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ]

# Initialize configuration
config = KaggleTrainingConfig()
print(f"📋 Training Configuration:")
print(f"   - Model: {config.model_name}")
print(f"   - Epochs: {config.num_epochs}")
print(f"   - Batch size: {config.train_batch_size}")
print(f"   - Effective batch size: {config.train_batch_size * config.gradient_accumulation_steps}")
print(f"   - Learning rate: {config.learning_rate}")
print(f"   - Max length: {config.max_length}")

📋 Training Configuration:
   - Model: Qwen/Qwen2.5-Coder-3B-Instruct
   - Epochs: 3
   - Batch size: 1
   - Effective batch size: 16
   - Learning rate: 0.0002
   - Max length: 512


In [56]:
# PyTorch Dataset class for unit test generation
class UnitTestDataset(Dataset):
    """PyTorch Dataset for unit test generation training"""
    
    def __init__(self, data: List[Dict], tokenizer, max_length: int = 1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        print(f"Dataset initialized with {len(data):,} samples")
        if len(data) > 0:
            print(f"Sample keys: {list(data[0].keys())}")
    
    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        item = self.data[idx]
        
        # Handle pre-tokenized data
        if 'input_ids' in item and 'labels' in item:
            return {
                'input_ids': torch.tensor(item['input_ids'][:self.max_length], dtype=torch.long),
                'attention_mask': torch.tensor(item['attention_mask'][:self.max_length], dtype=torch.long),
                'labels': torch.tensor(item['labels'][:self.max_length], dtype=torch.long)
            }
        
        # Handle raw text data
        else:
            code = item.get('code', '')
            unit_test = item.get('unit_test', '')
            
            # Create training prompt
            prompt = f"# Generate a unit test for the following Python function:\n{code}\n\n# Unit test:\n{unit_test}"
            
            # Tokenize
            encoded = self.tokenizer(
                prompt,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )
            
            return {
                'input_ids': encoded['input_ids'].squeeze(),
                'attention_mask': encoded['attention_mask'].squeeze(),
                'labels': encoded['input_ids'].squeeze().clone()
            }

print("✅ UnitTestDataset class defined")

✅ UnitTestDataset class defined


## 4. Model Architecture Setup

Loading the Qwen2.5-Coder model with quantization and setting up LoRA fine-tuning.

In [57]:
# Utility functions
def clear_memory():
    """Clear GPU and system memory"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    gc.collect()

def check_gpu_memory():
    """Check and print GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"🖥️  GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
    else:
        print("❌ CUDA not available")

def create_quantization_config() -> BitsAndBytesConfig:
    """Create 4-bit quantization configuration"""
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

print("✅ Utility functions defined")
clear_memory()
check_gpu_memory()

✅ Utility functions defined
🖥️  GPU Memory - Allocated: 2.04GB, Reserved: 8.45GB


In [8]:
# Load tokenizer
print("🤖 Loading tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    trust_remote_code=True,
    padding_side="left"
)

# Add pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("✅ Added pad token")

print(f"✅ Tokenizer loaded: {config.model_name}")
print(f"   - Vocab size: {len(tokenizer)}")
print(f"   - Special tokens: pad={tokenizer.pad_token}, eos={tokenizer.eos_token}")

🤖 Loading tokenizer...
✅ Tokenizer loaded: Qwen/Qwen2.5-Coder-3B-Instruct
   - Vocab size: 151665
   - Special tokens: pad=<|endoftext|>, eos=<|im_end|>


In [None]:
# Fix bitsandbytes installation for Kaggle
!pip install -U bitsandbytes>=0.41.0
!pip install -U accelerate>=0.24.0

# Restart kernel after installation
import os
os._exit(00)

In [58]:
# Load model with quantization
print("🤖 Loading model with quantization...")

quantization_config = create_quantization_config()

model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager"  # Avoid flash attention compatibility issues
)

# Resize token embeddings if needed
if len(tokenizer) != model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
    print(f"   - Resized token embeddings to {len(tokenizer)}")

print(f"✅ Model loaded: {config.model_name}")
print(f"   - Parameters: {model.num_parameters():,}")
print(f"   - Vocab size: {model.config.vocab_size}")
check_gpu_memory()

🤖 Loading model with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   - Resized token embeddings to 151665
✅ Model loaded: Qwen/Qwen2.5-Coder-3B-Instruct
   - Parameters: 3,085,383,680
   - Vocab size: 151665
🖥️  GPU Memory - Allocated: 3.96GB, Reserved: 8.45GB


In [59]:
# Setup LoRA fine-tuning
print("🔧 Setting up LoRA configuration...")

lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    target_modules=config.target_modules,
    lora_dropout=config.lora_dropout,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"✅ LoRA setup complete:")
print(f"   - Trainable params: {trainable_params:,}")
print(f"   - Total params: {total_params:,}")
print(f"   - Trainable percentage: {100 * trainable_params / total_params:.2f}%")

check_gpu_memory()

🔧 Setting up LoRA configuration...
✅ LoRA setup complete:
   - Trainable params: 14,966,784
   - Total params: 1,713,084,416
   - Trainable percentage: 0.87%
🖥️  GPU Memory - Allocated: 4.02GB, Reserved: 8.45GB


## 5. Training Configuration and Hyperparameters

Setting up training arguments and creating PyTorch datasets.

In [60]:
# Create PyTorch datasets
print("📊 Creating PyTorch datasets...")

train_dataset = UnitTestDataset(train_data, tokenizer, config.max_length)
val_dataset = UnitTestDataset(val_data, tokenizer, config.max_length)
test_dataset = UnitTestDataset(test_data, tokenizer, config.max_length)

print(f"✅ Datasets created:")
print(f"   - Training: {len(train_dataset):,} samples")
print(f"   - Validation: {len(val_dataset):,} samples")
print(f"   - Test: {len(test_dataset):,} samples")

📊 Creating PyTorch datasets...
Dataset initialized with 14,049 samples
Sample keys: ['task_id', 'question', 'code_ground_truth', 'code_generate', 'unit_tests']
Dataset initialized with 1,756 samples
Sample keys: ['task_id', 'question', 'code_ground_truth', 'code_generate', 'unit_tests']
Dataset initialized with 1,757 samples
Sample keys: ['task_id', 'question', 'code_ground_truth', 'code_generate', 'unit_tests']
✅ Datasets created:
   - Training: 14,049 samples
   - Validation: 1,756 samples
   - Test: 1,757 samples


In [61]:
# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

print("✅ Data collator created")

✅ Data collator created


In [62]:
# Create training arguments
training_args = TrainingArguments(
    output_dir=config.output_dir,
    run_name=config.run_name,
    
    # Training parameters
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    
    # Optimization
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    warmup_ratio=config.warmup_ratio,
    
    # Hardware optimization for P100
    fp16=True,
    bf16=False,
    gradient_checkpointing=False,
    dataloader_pin_memory=False,
    
    # Logging and saving
    logging_steps=config.logging_steps,
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    eval_strategy="steps",
    save_strategy="steps",
    
    # Model selection
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Disable external logging
    report_to=[]
)

print("✅ Training arguments created")
print(f"   - Total training steps: {len(train_dataset) // (config.train_batch_size * config.gradient_accumulation_steps) * config.num_epochs}")
print(f"   - Steps per epoch: {len(train_dataset) // (config.train_batch_size * config.gradient_accumulation_steps)}")

✅ Training arguments created
   - Total training steps: 2634
   - Steps per epoch: 878


In [63]:
# CRITICAL FIX: Enable gradients for LoRA parameters
print("🔧 Fixing LoRA parameter gradients...")

# Method 1: Enable gradients for all LoRA parameters
for name, param in model.named_parameters():
    if any(keyword in name for keyword in ['lora_A', 'lora_B', 'lora_embedding']):
        param.requires_grad = True
        print(f"✅ Enabled: {name}")

# Method 2: Alternative - enable all trainable parameters
model.train()
for param in model.parameters():
    if param.requires_grad:
        param.requires_grad = True

# Verify fix
trainable_count = sum(1 for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"📊 Verification:")
print(f"   - Trainable parameter tensors: {trainable_count}")
print(f"   - Trainable parameters: {total_params:,}")

if trainable_count == 0:
    print("❌ STILL NO TRAINABLE PARAMETERS!")
    # Force enable LoRA
    model.enable_adapters()
    print("🔄 Forced LoRA adapter activation")
else:
    print("✅ Ready for training!")

🔧 Fixing LoRA parameter gradients...
✅ Enabled: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
✅ Enabled: base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight
✅ Enabled: base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight
✅ Enabled: base_model.model.model.layers.0.mlp.up_proj.lora_B.default.we

In [52]:
# COMPLETE LoRA RESET
print("🔧 Setting up LoRA (FIXED VERSION)...")

from peft import get_peft_model, LoraConfig, TaskType

# Remove any existing adapters
if hasattr(model, 'peft_config'):
    model = model.merge_and_unload()

# Create fresh LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Force training mode
model.train()
model.enable_input_require_grads()
for param in model.parameters():
    if param.requires_grad:
        param.requires_grad = True

print("✅ LoRA setup complete - ready for training!")

🔧 Setting up LoRA (FIXED VERSION)...




trainable params: 29,933,568 || all params: 3,115,317,248 || trainable%: 0.9609
✅ LoRA setup complete - ready for training!


## 6. Model Training Loop

Running the complete training pipeline with progress monitoring.

In [64]:
# Create trainer
print("🏋️ Creating trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("✅ Trainer created successfully")
check_gpu_memory()
clear_memory()
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🏋️ Creating trainer...
✅ Trainer created successfully
🖥️  GPU Memory - Allocated: 4.02GB, Reserved: 8.45GB


In [65]:
# Start training
print("\n🔥 Starting training...")
print("=" * 60)
print(f"📊 Training Configuration Summary:")
print(f"   - Model: {config.model_name}")
print(f"   - Training samples: {len(train_dataset):,}")
print(f"   - Validation samples: {len(val_dataset):,}")
print(f"   - Epochs: {config.num_epochs}")
print(f"   - Batch size: {config.train_batch_size} (effective: {config.train_batch_size * config.gradient_accumulation_steps})")
print(f"   - Learning rate: {config.learning_rate}")
print(f"   - Max sequence length: {config.max_length}")
print(f"   - Steps per epoch: {len(train_dataset) // (config.train_batch_size * config.gradient_accumulation_steps)}")
print("=" * 60)

start_time = time.time()

try:
    # Train the model
    trainer.train()
    
    training_time = time.time() - start_time
    print(f"\n✅ Training completed successfully!")
    print(f"⏱️  Total training time: {training_time / 3600:.2f} hours ({training_time / 60:.1f} minutes)")
    
except Exception as e:
    print(f"❌ Training failed: {str(e)}")
    import traceback
    traceback.print_exc()
    
finally:
    clear_memory()
    check_gpu_memory()


🔥 Starting training...
📊 Training Configuration Summary:
   - Model: Qwen/Qwen2.5-Coder-3B-Instruct
   - Training samples: 14,049
   - Validation samples: 1,756
   - Epochs: 3
   - Batch size: 1 (effective: 16)
   - Learning rate: 0.0002
   - Max sequence length: 512
   - Steps per epoch: 878


Step,Training Loss,Validation Loss


🖥️  GPU Memory - Allocated: 2.16GB, Reserved: 8.46GB


KeyboardInterrupt: 

## 7. Model Evaluation and Testing

Evaluating model performance and generating sample unit tests.

In [None]:
# Final evaluation on test set
print("📊 Running final evaluation on test set...")

eval_results = trainer.evaluate(eval_dataset=test_dataset)

print(f"\n📈 Final Evaluation Results:")
print(f"   - Test Loss: {eval_results['eval_loss']:.4f}")
print(f"   - Test Perplexity: {np.exp(eval_results['eval_loss']):.2f}")

# Save evaluation results
with open(f"{config.output_dir}/eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)
    
print(f"✅ Evaluation results saved to {config.output_dir}/eval_results.json")

In [None]:
# Test model with sample generations
def test_model_generation(model, tokenizer, test_data: List[Dict], num_samples: int = 5):
    """Test the trained model with sample unit test generations"""
    print(f"🧪 Testing model with {num_samples} sample generations...\n")
    
    model.eval()
    
    # Select random test samples
    test_samples = np.random.choice(test_data, min(num_samples, len(test_data)), replace=False)
    
    for i, sample in enumerate(test_samples):
        print(f"{'='*80}")
        print(f"Test Sample {i+1}/{num_samples}")
        print(f"{'='*80}")
        
        # Extract code from sample
        if 'code' in sample:
            code = sample['code']
        else:
            # Try to extract from tokenized data
            input_ids = sample.get('input_ids', [])
            if input_ids:
                decoded = tokenizer.decode(input_ids[:200], skip_special_tokens=True)
                # Extract code portion (simplified)
                code = decoded.split('# Unit test:')[0].replace('# Generate a unit test for the following Python function:', '').strip()
            else:
                code = "Sample code not available"
        
        print(f"📝 Original Code:")
        print(f"{code[:300]}{'...' if len(code) > 300 else ''}\n")
        
        # Create generation prompt
        prompt = f"# Generate a unit test for the following Python function:\n{code}\n\n# Unit test:\n"
        
        # Tokenize prompt
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        # Generate unit test
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        # Decode generated response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_test = full_response[len(prompt):].strip()
        
        print(f"🤖 Generated Unit Test:")
        print(f"{generated_test}\n")
        
        # Show original unit test if available
        if 'unit_test' in sample:
            print(f"✅ Original Unit Test:")
            print(f"{sample['unit_test'][:300]}{'...' if len(sample['unit_test']) > 300 else ''}\n")
        
        print("\n")

# Run sample generations
test_model_generation(model, tokenizer, test_data, num_samples=3)

## 8. Save Trained Model

Saving the fine-tuned model and creating downloadable archives.

In [None]:
# Save the trained model
print("💾 Saving trained model...")

# Save model and tokenizer
trainer.save_model(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

print(f"✅ Model saved to: {config.output_dir}")

# Save training configuration
config_dict = {
    'model_name': config.model_name,
    'max_length': config.max_length,
    'train_batch_size': config.train_batch_size,
    'gradient_accumulation_steps': config.gradient_accumulation_steps,
    'num_epochs': config.num_epochs,
    'learning_rate': config.learning_rate,
    'lora_r': config.lora_r,
    'lora_alpha': config.lora_alpha,
    'lora_dropout': config.lora_dropout,
    'target_modules': config.target_modules,
    'training_samples': len(train_dataset),
    'validation_samples': len(val_dataset),
    'test_samples': len(test_dataset)
}

with open(f"{config.output_dir}/training_config.json", "w") as f:
    json.dump(config_dict, f, indent=2)

print(f"✅ Training configuration saved")

In [None]:
# List saved files
print("📁 Saved model files:")
for file in os.listdir(config.output_dir):
    file_path = os.path.join(config.output_dir, file)
    if os.path.isfile(file_path):
        size_mb = os.path.getsize(file_path) / 1024 / 1024
        print(f"   - {file}: {size_mb:.1f} MB")
    else:
        print(f"   - {file}/ (directory)")

In [None]:
# Create downloadable archive
print("📦 Creating downloadable model archive...")

archive_name = "qwen-coder-unittest-model"
!cd /kaggle/working && tar -czf {archive_name}.tar.gz qwen-coder-finetune/

# Check archive size
archive_path = f"/kaggle/working/{archive_name}.tar.gz"
if os.path.exists(archive_path):
    archive_size = os.path.getsize(archive_path) / 1024 / 1024
    print(f"✅ Model archive created: {archive_name}.tar.gz ({archive_size:.1f} MB)")
    print(f"📥 Download from: /kaggle/working/{archive_name}.tar.gz")
else:
    print(f"❌ Failed to create archive")

# Final memory cleanup
clear_memory()
print("\n🎉 Training pipeline completed successfully!")
print(f"📁 Model saved in: {config.output_dir}")
print(f"📦 Download archive: {archive_name}.tar.gz")

In [None]:
# Training summary
print("\n" + "="*80)
print("🎯 TRAINING SUMMARY")
print("="*80)
print(f"📊 Dataset: {len(train_data) + len(val_data) + len(test_data):,} total samples")
print(f"🤖 Model: {config.model_name}")
print(f"🔧 Method: LoRA fine-tuning with 4-bit quantization")
print(f"⚡ Hardware: Kaggle P100 GPU")
print(f"📈 Training: {config.num_epochs} epochs, {len(train_dataset):,} samples")
print(f"💾 Output: {config.output_dir}")
print(f"📦 Archive: {archive_name}.tar.gz")
print("\n✅ Ready for deployment and inference!")
print("="*80)