In [1]:
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import GRPOConfig, GRPOTrainer
from datasets import load_dataset
import json
import re

# Properly check vLLM
try:
    import vllm
    vllm_available = True
    print(f"‚úÖ vLLM version: {vllm.__version__}")
except ImportError:
    vllm_available = False
    print("‚ùå vLLM not found - will disable vLLM mode")

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"vLLM available: {vllm_available}")


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-17 16:08:47 [__init__.py:216] Automatically detected platform cuda.
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ vLLM version: 0.10.2
PyTorch version: 2.8.0+cu128
CUDA available: True
vLLM available: True


In [2]:
# Model settings
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True

# Training settings (optimized for vLLM)
BATCH_SIZE = 2  # Match num_generations
GRADIENT_ACCUMULATION = 8  # Larger accumulation for stability
LEARNING_RATE = 1e-6
NUM_EPOCHS = 1
NUM_GENERATIONS = 2  # Reduced for speed with vLLM
SAVE_STEPS = 25

# Dataset
DATASET_FILE = "evol_code_grpo_2.5k.jsonl"
NUM_EXAMPLES = 2500  # Full dataset

# Output
OUTPUT_DIR = "./llama-3.2-3b-code-stage-a-vllm"

print("‚úÖ Configuration set")
print(f"Using vLLM for 2-3x faster generation")
print(f"Full dataset: {NUM_EXAMPLES} examples")
print(f"Generations per prompt: {NUM_GENERATIONS}")


‚úÖ Configuration set
Using vLLM for 2-3x faster generation
Full dataset: 2500 examples
Generations per prompt: 2


In [3]:
# Load your prepared dataset
dataset = load_dataset("json", data_files=DATASET_FILE, split="train")

print(f"‚úÖ Loaded {len(dataset)} training examples")
print(f"\nFirst example preview:")
print(f"Prompt: {dataset[0]['prompt'][:200]}...")
print(f"Reference length: {len(dataset[0]['reference_solution'])} chars")


‚úÖ Loaded 2500 training examples

First example preview:
Prompt: Create a code to display all unique pairs from the following list in ascending order.
[1,2,3,4,5,6]

#Additional Requirements#
1. The code should not use any built-in functions or libraries that direc...
Reference length: 483 chars


In [4]:
# Cell 5: Load Model with LoRA (FIXED - add fast_inference)

# Load base model with Unsloth's vLLM fix
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
    fast_inference=True,  # ‚ö†Ô∏è CRITICAL: Enables vLLM with LoRA
)

print("‚úÖ Base model loaded with fast_inference (vLLM + LoRA compatible)")

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("‚úÖ LoRA adapters added (vLLM compatible)")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


INFO 10-17 16:10:00 [vllm_utils.py:689] Unsloth: Patching vLLM v1 graph capture
INFO 10-17 16:10:00 [vllm_utils.py:717] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-bnb-4bit with actual GPU utilization = 47.29%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 11.63 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.94 GB. Also swap space = 4 GB.
Unsloth: Not an error,

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-17 16:10:12 [__init__.py:1815] Using max model len 2048
INFO 10-17 16:10:14 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=2048.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'llm_int8_skip_modules': ['lm_head', 'multi_modal_projector', 'merger', 'modality_projection'], 'llm_int8_threshold': 6.0}
INFO 10-17 16:10:16 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='unsloth/llama-3.2-3b-instruct-bnb-4bit', speculative_config=None, tokenizer='unsloth/llama-3.2-3b-instruct-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=bitsandb



INFO 10-17 16:10:17 [cuda.py:362] Using Flash Attention backend on V1 engine.
INFO 10-17 16:10:17 [bitsandbytes_loader.py:758] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 10-17 16:10:18 [weight_utils.py:348] Using model weights format ['*.safetensors']
INFO 10-17 16:10:19 [weight_utils.py:406] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 10-17 16:10:21 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 10-17 16:10:22 [gpu_model_runner.py:2392] Model loading took 2.3518 GiB and 4.746420 seconds
INFO 10-17 16:10:28 [backends.py:539] Using cache directory: /home/aurduinonucleo/.cache/vllm/torch_compile_cache/fe32bec880/rank_0_0/backbone for vLLM's torch.compile
INFO 10-17 16:10:28 [backends.py:550] Dynamo bytecode transform time: 5.73 s
INFO 10-17 16:10:31 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.159 s
INFO 10-17 16:10:37 [monitor.py:34] torch.compile takes 5.73 s in total
INFO 10-17 16:10:39 [gpu_worker.py:298] Available KV cache memory: 2.38 GiB
INFO 10-17 16:10:39 [kv_cache_utils.py:864] GPU KV cache size: 22,256 tokens
INFO 10-17 16:10:39 [kv_cache_utils.py:868] Maximum concurrency for 2,048 tokens per request: 10.87x
INFO 10-17 16:10:39 [vllm_utils.py:694] Unsloth: Running patched vLLM v1 `capture_model`.


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:08<00:00,  5.30it/s]
Capturing CUDA graphs (decode, FULL): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [00:05<00:00,  4.54it/s]

INFO 10-17 16:10:52 [gpu_model_runner.py:3118] Graph capturing finished in 13 secs, took 0.98 GiB
INFO 10-17 16:10:52 [vllm_utils.py:701] Unsloth: Patched vLLM v1 graph capture finished in 13 secs.





INFO 10-17 16:10:53 [gpu_worker.py:391] Free memory on device (10.98/11.63 GiB) on startup. Desired GPU memory utilization is (0.4729143157066002, 5.5 GiB). Actual usage is 2.35 GiB for weight, 0.75 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.98 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=1345787904` to fit into requested memory, or `--kv-cache-memory=7233935360` to fully utilize gpu memory. Current kv cache memory in use is 2553747456 bytes.
INFO 10-17 16:10:53 [core.py:218] init engine (profile, create kv cache, warmup model) took 31.14 seconds
INFO 10-17 16:10:55 [llm.py:295] Supported_tasks: ('generate',)
INFO 10-17 16:10:55 [__init__.py:36] No IOProcessor plugins requested by the model
Unsloth: Just some info: will skip parsing ['k_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'ffn_norm', 'layer_norm1', 'post_layernorm', 'post_feedforward_layernorm', 'q_norm', 'norm2', 'input_layernorm', 'attention_norm', 'post_att

Unsloth 2025.9.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


‚úÖ LoRA adapters added (vLLM compatible)
Trainable parameters: 24,313,856


In [5]:
import re
import ast
import signal
from contextlib import contextmanager
import sys
from io import StringIO
import radon.complexity as radon_cc
from radon.metrics import mi_visit


@contextmanager
def timeout(seconds):
    """Timeout context manager to prevent infinite loops"""

    def signal_handler(signum, frame):
        raise TimeoutError("Code execution timed out")

    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)


def execution_reward(prompts, completions, **kwargs):
    """
    Reward 1: Code Execution Correctness (40% weight)
    """
    rewards = []
    for completion in completions:
        try:
            code = extract_code(completion)

            # Check for input() calls
            if "input(" in code:
                rewards.append(0.0)
                continue

            # Check syntax first
            try:
                ast.parse(code)
                syntax_valid = True
            except SyntaxError:
                rewards.append(0.0)
                continue

            # Mock input() and stdin
            exec_globals = {
                'input': lambda *args: "",
                '__builtins__': __builtins__,
            }

            old_stdin = sys.stdin
            sys.stdin = StringIO("")

            try:
                with timeout(5):
                    exec(code, exec_globals)
                rewards.append(1.0)
            finally:
                sys.stdin = old_stdin

        except TimeoutError:
            rewards.append(0.0)
        except Exception:
            rewards.append(0.5 if syntax_valid else 0.0)

    return rewards


def code_quality_reward(prompts, completions, **kwargs):
    """
    Reward 2: Code Quality & Structure (30% weight)
    """
    rewards = []
    for completion in completions:
        reward = 0.0
        try:
            code = extract_code(completion)
            tree = ast.parse(code)

            functions = [n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)]
            if functions:
                reward += 0.2

                has_type_hints = any(
                    f.returns is not None or
                    any(arg.annotation is not None for arg in f.args.args)
                    for f in functions
                )
                if has_type_hints:
                    reward += 0.2

                has_docstrings = any(
                    ast.get_docstring(f) is not None for f in functions
                )
                if has_docstrings:
                    reward += 0.3

            if "#" in code:
                num_comments = code.count("#")
                lines_of_code = len([l for l in code.split("\n") if l.strip()])
                comment_ratio = num_comments / max(lines_of_code, 1)
                if comment_ratio >= 0.1:
                    reward += 0.15

            names = [n.id for n in ast.walk(tree) if isinstance(n, ast.Name)]
            bad_names = ["x", "y", "z", "a", "b", "c", "tmp", "temp"]
            good_name_ratio = 1 - (len([n for n in names if n in bad_names]) / max(len(names), 1))
            reward += 0.15 * good_name_ratio

        except Exception:
            reward = 0.0

        rewards.append(min(reward, 1.0))

    return rewards


def complexity_reward(prompts, completions, **kwargs):
    """
    Reward 3: Code Complexity (15% weight)
    """
    rewards = []
    for completion in completions:
        reward = 1.0
        try:
            code = extract_code(completion)
            complexity_results = radon_cc.cc_visit(code)

            if complexity_results:
                avg_complexity = sum(r.complexity for r in complexity_results) / len(complexity_results)

                if avg_complexity <= 5:
                    reward = 1.0
                elif avg_complexity <= 10:
                    reward = 0.7
                elif avg_complexity <= 20:
                    reward = 0.4
                else:
                    reward = 0.2

        except Exception:
            reward = 0.5

        rewards.append(reward)

    return rewards


def maintainability_reward(prompts, completions, **kwargs):
    """
    Reward 4: Maintainability Index (15% weight)
    """
    rewards = []
    for completion in completions:
        reward = 0.5
        try:
            code = extract_code(completion)
            mi_score = mi_visit(code, multi=True)

            if mi_score:
                avg_mi = sum(mi_score) / len(mi_score)

                if avg_mi >= 85:
                    reward = 1.0
                elif avg_mi >= 65:
                    reward = 0.8
                elif avg_mi >= 20:
                    reward = 0.5
                else:
                    reward = 0.2

        except Exception:
            reward = 0.5

        rewards.append(reward)

    return rewards


def extract_code(text: str) -> str:
    """Extract Python code from markdown or plain text - FIXED REGEX"""
    # Try to match `````` or ``````
    # FIXED: Added capture group (parentheses around .*?)
    m = re.search(r"``````", text, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()

    # Fallback: find first code-like line
    lines = text.splitlines()
    for i, line in enumerate(lines):
        if line.lstrip().startswith(("def ", "class ", "import ", "from ")):
            return "\n".join(lines[i:]).strip()

    # Last resort: return whole text
    return text.strip()


print("‚úÖ Advanced Reward Functions Defined (FIXED)")
print("  1. execution_reward (40%): Execution correctness + timeout protection")
print("  2. code_quality_reward (30%): Docstrings, type hints, comments, naming")
print("  3. complexity_reward (15%): Cyclomatic complexity penalty")
print("  4. maintainability_reward (15%): Maintainability index score")
print("\nüìä Total: 4 reward functions promoting production-quality code")


‚úÖ Advanced Reward Functions Defined (FIXED)
  1. execution_reward (40%): Execution correctness + timeout protection
  2. code_quality_reward (30%): Docstrings, type hints, comments, naming
  3. complexity_reward (15%): Cyclomatic complexity penalty
  4. maintainability_reward (15%): Maintainability index score

üìä Total: 4 reward functions promoting production-quality code


In [6]:
def format_for_grpo(example):
    """Format dataset entry for GRPO training."""
    return {
        "prompt": example["prompt"],
        "reference": example["reference_solution"]
    }

formatted_dataset = dataset.map(format_for_grpo, remove_columns=dataset.column_names)

print(f"‚úÖ Dataset formatted for GRPO with vLLM")
print(f"Total examples: {len(formatted_dataset)}")


‚úÖ Dataset formatted for GRPO with vLLM
Total examples: 2500


In [7]:
# Cell 9: Initialize GRPO Trainer with Advanced Rewards

# GRPO config with vLLM acceleration
training_args = GRPOConfig(
    output_dir=OUTPUT_DIR,

    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,  # Must match num_generations
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,

    # GRPO specific
    num_generations=NUM_GENERATIONS,  # Only 2 for speed
    max_completion_length=512,
    temperature=0.7,

    # vLLM acceleration (KEY FOR SPEED)
    use_vllm=True,
    vllm_gpu_memory_utilization=0.8,  # Use 80% GPU memory for vLLM
    vllm_tensor_parallel_size=1,  # Single GPU

    # Optimization
    optim="adamw_8bit",
    weight_decay=0.01,
    warmup_steps=50,
    max_grad_norm=1.0,

    # Precision
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),

    # Logging and saving
    logging_steps=10,
    save_steps=SAVE_STEPS,
    save_total_limit=3,

    # Memory optimization
    gradient_checkpointing=True,
)

# Create GRPO trainer with 4 advanced reward functions
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        execution_reward,        # 40% weight - Execution correctness + timeout
        code_quality_reward,     # 30% weight - Docstrings, type hints, comments
        complexity_reward,       # 15% weight - Cyclomatic complexity control
        maintainability_reward   # 15% weight - Maintainability index score
    ],
    args=training_args,
    train_dataset=formatted_dataset,
)

print("‚úÖ GRPO Trainer initialized with advanced rewards")
print(f"\nüìä Reward System:")
print(f"  1. Execution (40%): Code must run without errors/timeouts")
print(f"  2. Quality (30%): Docstrings, type hints, proper naming")
print(f"  3. Complexity (15%): Penalizes overly complex code")
print(f"  4. Maintainability (15%): Long-term code maintainability")
print(f"\nBatch size: {BATCH_SIZE} (matches num_generations)")
print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION}")
print(f"Generations per prompt: {NUM_GENERATIONS}")
print(f"vLLM GPU memory: 80%")
print(f"Total training steps: {len(formatted_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION)}")
print(f"\n‚è±Ô∏è Estimated time: ~3-4 hours (with vLLM + num_generations=2)")


‚úÖ GRPO Trainer initialized with advanced rewards

üìä Reward System:
  1. Execution (40%): Code must run without errors/timeouts
  2. Quality (30%): Docstrings, type hints, proper naming
  3. Complexity (15%): Penalizes overly complex code
  4. Maintainability (15%): Long-term code maintainability

Batch size: 2 (matches num_generations)
Effective batch size: 16
Generations per prompt: 2
vLLM GPU memory: 80%
Total training steps: 156

‚è±Ô∏è Estimated time: ~3-4 hours (with vLLM + num_generations=2)


In [8]:
# Cell 10: Start Training with Proper Resume

import os

print("\n" + "="*60)
print("üöÄ STARTING GRPO TRAINING WITH vLLM - STAGE A")
print("="*60)

# Check for existing checkpoints - FIXED
checkpoint_dir = OUTPUT_DIR
resume_from = None

if os.path.exists(checkpoint_dir):
    # Look for checkpoint folders
    all_items = os.listdir(checkpoint_dir)
    checkpoints = [d for d in all_items if d.startswith("checkpoint-") and
                   os.path.isdir(os.path.join(checkpoint_dir, d))]

    if checkpoints:
        # Sort by checkpoint number and get latest
        checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
        latest_checkpoint = checkpoints[-1]
        resume_from = os.path.join(checkpoint_dir, latest_checkpoint)

        step_num = latest_checkpoint.split("-")[1]
        print(f"üìÇ Found {len(checkpoints)} checkpoint(s)")
        print(f"‚úÖ Resuming from: {latest_checkpoint} (step {step_num})")
    else:
        print("No checkpoints found - starting from scratch...")
else:
    print("Starting training from scratch...")

print(f"Dataset: {len(formatted_dataset)} code problems")
print(f"Method: GRPO with advanced reward functions")
print("="*60 + "\n")

# Train with resume capability
try:
    trainer.train(resume_from_checkpoint=resume_from)
    print("\n‚úÖ Training complete!")
except KeyboardInterrupt:
    print("\n‚ö†Ô∏è Training interrupted by user")
    print(f"Latest checkpoint saved in: {OUTPUT_DIR}")
    print("You can resume by rerunning this cell")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128004}.



üöÄ STARTING GRPO TRAINING WITH vLLM - STAGE A
üìÇ Found 3 checkpoint(s)
‚úÖ Resuming from: checkpoint-125 (step 125)
Dataset: 2500 code problems
Method: GRPO with advanced reward functions



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,500 | Num Epochs = 1 | Total steps = 312
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / execution_reward / mean,rewards / execution_reward / std,rewards / code_quality_reward / mean,rewards / code_quality_reward / std,rewards / complexity_reward / mean,rewards / complexity_reward / std,rewards / maintainability_reward / mean,rewards / maintainability_reward / std
130,0.0,1.0,0.0,471.0,273.8,512.0,0.7,381.263336,273.8,473.6,0,0,0,0,0,0.000552,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0
140,0.0,1.084147,0.119002,463.86875,185.4,512.0,0.74375,324.702267,185.4,417.5,No Log,No Log,No Log,No Log,No Log,0.000556,0.040625,0.136352,0.022272,0.076329,0.52125,0.072728,0.5,0.0
150,0.0,1.060672,0.085804,464.45,240.7,512.0,0.71875,344.734534,240.7,453.7,No Log,No Log,No Log,No Log,No Log,0.001358,0.028125,0.094274,0.015672,0.052349,0.516875,0.055228,0.5,0.0
160,0.0,1.062607,0.065661,475.10625,267.7,512.0,0.75,332.1681,216.5,413.4,No Log,No Log,No Log,No Log,No Log,0.000495,0.028125,0.088735,0.015732,0.05076,0.51875,0.059157,0.5,0.0
170,0.0,1.031687,0.044813,470.825,258.1,512.0,0.725,368.676672,258.1,460.1,No Log,No Log,No Log,No Log,No Log,0.000521,0.015625,0.0625,0.006688,0.02675,0.509375,0.0375,0.5,0.0
180,0.0,1.122792,0.087917,462.85,196.5,512.0,0.7375,321.508813,196.5,413.0,No Log,No Log,No Log,No Log,No Log,0.000476,0.0625,0.194548,0.030292,0.09647,0.53,0.100456,0.5,0.0
190,0.0,1.067891,0.071264,469.48125,260.2,512.0,0.64375,396.740839,260.2,483.1,No Log,No Log,No Log,No Log,No Log,0.000527,0.0375,0.105007,0.023516,0.064006,0.506875,0.029578,0.5,0.0
200,0.0,1.041528,0.058729,466.2625,216.8,512.0,0.68125,351.668896,216.8,440.3,No Log,No Log,No Log,No Log,No Log,0.0005,0.01875,0.075,0.010278,0.041111,0.5125,0.05,0.5,0.0
210,0.0,1.140313,0.157773,474.8875,268.5,512.0,0.73125,380.335245,268.5,476.3,No Log,No Log,No Log,No Log,No Log,0.000469,0.065625,0.18582,0.042813,0.126557,0.531875,0.093533,0.5,0.0
220,0.0,1.031875,0.045078,471.13125,247.5,512.0,0.74375,356.502502,247.5,465.3,No Log,No Log,No Log,No Log,No Log,0.000541,0.015625,0.0625,0.006875,0.0275,0.509375,0.0375,0.5,0.0


False
False
False
False
False
False
14
21
Heyllo World
[2 3 4 5 6 7]
0    2
1    3
2    4
3    5
4    6
5    7
dtype: int64
app3e
hell4
wo2ld
(True, {'l': 1, 'i': 1, 's': 1, 't': 1, 'e': 1, 'n': 1})


[nltk_data] Downloading package punkt to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[]
[1, 3]
[]
4
Red
2020
0
50
30
False


[nltk_data] Downloading package punkt to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aurduinonucleo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1, 3, 6, 10, 15]
[1, 3, 6, 10, 15, 21]
[]
[1]
[1, 3]
[5, 1, 3, 2, 1]
[1, 1, 1, 1, 1]
[5, 1, 3, 2, 1]
[9, 1, 7, 6, 5, 4, 3, 2, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 2, 4]
[2, 3, 4]
[]
[1, 1, 1, 1, 1]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
Square root of the squared first positive integer element: 1.0
Rounded average of the squared element and the sum of all the elements in the list: 6
<h1> This is a heading </h1> <strong> This is a bold text </b> <em> This is an italic text </i> <ins> This is an underlined text </u> <strong> <em> This is both bold and italic text </i> </b>
{'age': 18}
m
nameisJohn
The larger number in the list is 34
The larger number in the list is 50
The larger number in the list is 50
The larger number in the list is 50
The larger number in the list is -10
The larger number in the list is 50
401334414047321514714644132
positive
negative
negative
45
True
True
False
False
False
False
False
[11, 12, 13, 14, 15, 18, 19, 20, 22, 25, 28, 29, 30, 31, 32, 33, 35, 36, 38, 39, 40, 45, 49

In [20]:
# FINAL WORKING TEST CELL

import torch
import re

SYSTEM = """Return ONLY Python code for the function body. Do NOT include the 'def' line."""

def extract_body_only(text, func_name):
    """Extract only body after any def line"""
    lines = text.strip().splitlines()
    result = []
    found_def = False

    for line in lines:
        # Skip def lines
        if re.match(r'^\s*def\s+' + re.escape(func_name), line):
            found_def = True
            continue
        # After finding def, collect body
        if found_def and line.strip():
            result.append(line)

    # If no def found, use all lines
    if not result:
        result = [l for l in lines if l.strip()]

    return "\n".join(result)

def make_function(sig, body_text, func_name):
    """Build valid function from signature and body"""
    if not sig.strip().endswith(":"):
        sig = sig.strip() + ":"

    # Clean body
    body = extract_body_only(body_text, func_name)

    # Indent all lines
    lines = []
    for line in body.splitlines():
        if line.strip():
            # Ensure 4-space indent minimum
            spaces = len(line) - len(line.lstrip())
            if spaces < 4:
                lines.append("    " + line.lstrip())
            else:
                lines.append(line)
        else:
            lines.append("")

    if not lines or not any(l.strip() for l in lines):
        lines = ["    pass"]

    return sig + "\n" + "\n".join(lines)

# Tests
tests = [
    ("def fibonacci(n: int) -> list", "fibonacci", lambda f: f(10) == [0,1,1,2,3,5,8,13,21,34]),
    ("def is_palindrome(s: str) -> bool", "is_palindrome", lambda f: f("racecar") and not f("hello")),
    ("def add(a: int, b: int) -> int", "add", lambda f: f(2, 3) == 5),
]

FastLanguageModel.for_inference(model)

print("="*70)
print("üß™ FINAL TEST")
print("="*70)

passed = 0
for i, (sig, fname, check) in enumerate(tests, 1):
    print(f"\nTest {i}: {fname}")

    prompt = f"{SYSTEM}\n\n{sig}:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=150, temperature=0.2)

    raw = tokenizer.decode(out[0], skip_special_tokens=True)[len(prompt):]
    code = make_function(sig, raw, fname)

    print(f"Code:\n{code[:300]}")

    try:
        ns = {}
        exec(code, ns)
        if fname in ns and check(ns[fname]):
            print("‚úÖ PASS")
            passed += 1
        else:
            print("‚ùå Wrong output")
    except Exception as e:
        print(f"‚ùå {type(e).__name__}: {str(e)[:80]}")

print(f"\n{'='*70}\nüìä {passed}/{len(tests)} passed ({passed/len(tests)*100:.0f}%)\n{'='*70}")


üß™ FINAL TEST

Test 1: fibonacci
Code:
def fibonacci(n: int) -> list:
    if n <= 1:
        return [0, 1]
    else:
        fib_list = [0, 1]
        while n > 1:
            fib_list.append(fib_list[-1] + fib_list[-2])
            n -= 1
        return fib_list
    print(fibonacci(10))  # Output: [0, 1, 1, 2, 3, 5, 8, 13, 21, 34] 
    
‚ùå Wrong output

Test 2: is_palindrome
Code:
def is_palindrome(s: str) -> bool:
    s = s.lower()  # Convert the string to lowercase
    return s == s[::-1]  # Check if the string is equal to its reverse
    # Alternative implementation using slicing
    # return s == s[::-1]  # Check if the string is equal to its reverse
    # Alternative imp
‚úÖ PASS

Test 3: add
Code:
def add(a: int, b: int) -> int:
    return a + b 
    def multiply(a: int, b: int) -> int: 
    return a * b 
    def divide(a: int, b: int) -> int: 
    if b == 0:
        raise ValueError("Cannot divide by zero")
    return a // b 
    def power(a: int, b: int) -> int: 
    return

In [8]:
# HARDER ALGORITHMIC BENCHMARK - Matching Training Data

print("="*70)
print("üß™ HARDER ALGORITHMIC TESTS (Closer to Training Data)")
print("="*70)

hard_tests = [
    {
        "name": "Fibonacci (first n)",
        "sig": "def fibonacci(n: int) -> list",
        "test": lambda f: f(8) == [0,1,1,2,3,5,8,13],
    },
    {
        "name": "Remove Duplicates",
        "sig": "def remove_duplicates(lst: list) -> list",
        "test": lambda f: f([1,2,2,3,3,3,4]) == [1,2,3,4],
    },
    {
        "name": "Find GCD",
        "sig": "def gcd(a: int, b: int) -> int",
        "test": lambda f: f(48, 18) == 6 and f(100, 50) == 50,
    },
    {
        "name": "Is Prime",
        "sig": "def is_prime(n: int) -> bool",
        "test": lambda f: f(7) is True and f(10) is False and f(2) is True,
    },
    {
        "name": "Factorial",
        "sig": "def factorial(n: int) -> int",
        "test": lambda f: f(5) == 120 and f(0) == 1,
    },
    {
        "name": "Reverse Words",
        "sig": "def reverse_words(s: str) -> str",
        "test": lambda f: f("hello world") == "world hello",
    },
    {
        "name": "Count Chars",
        "sig": "def count_char(s: str, c: str) -> int",
        "test": lambda f: f("hello", "l") == 2 and f("test", "x") == 0,
    },
    {
        "name": "Merge Lists",
        "sig": "def merge_lists(a: list, b: list) -> list",
        "test": lambda f: set(f([1,2], [3,4])) == {1,2,3,4},
    },
]

results_hard = {'base': 0, 'ft': 0}

for i, test in enumerate(hard_tests, 1):
    print(f"\n{i}/8: {test['name']}")

    # Base
    gen, t = generate_code(base_model, base_tokenizer, test['sig'])
    status = test_generated_code(test['sig'], gen, test['test'])
    print(f"  üì¶ BASE:      {status:6s} ({t:.1f}s)")
    if status == "PASS":
        results_hard['base'] += 1

    # Fine-tuned
    gen, t = generate_code(finetuned_model, finetuned_tokenizer, test['sig'])
    status = test_generated_code(test['sig'], gen, test['test'])
    print(f"  üéØ FINE-TUNED: {status:6s} ({t:.1f}s)")
    if status == "PASS":
        results_hard['ft'] += 1

# Results
n = len(hard_tests)
base_pct = results_hard['base'] / n * 100
ft_pct = results_hard['ft'] / n * 100

print("\n" + "="*70)
print("üìä HARDER TESTS RESULTS")
print("="*70)
print(f"\nBase:       {results_hard['base']}/{n} ({base_pct:.1f}%)")
print(f"Fine-Tuned: {results_hard['ft']}/{n} ({ft_pct:.1f}%)")
print(f"\nImprovement: {ft_pct - base_pct:+.1f}%")

if ft_pct > base_pct:
    print(f"\nüéØ Fine-tuned is {ft_pct - base_pct:.1f}% better on algorithmic tasks!")
elif ft_pct == base_pct:
    print("\n‚û°Ô∏è Equal performance")
else:
    print(f"\nüìâ Base is {base_pct - ft_pct:.1f}% better")
print("="*70)


üß™ HARDER ALGORITHMIC TESTS (Closer to Training Data)

1/8: Fibonacci (first n)
  üì¶ BASE:      FAIL   (3.7s)
  üéØ FINE-TUNED: FAIL   (4.4s)

2/8: Remove Duplicates
  üì¶ BASE:      FAIL   (3.1s)
  üéØ FINE-TUNED: FAIL   (4.3s)

3/8: Find GCD
  üì¶ BASE:      FAIL   (3.1s)
  üéØ FINE-TUNED: ERROR  (4.3s)

4/8: Is Prime
  üì¶ BASE:      FAIL   (3.1s)
  üéØ FINE-TUNED: ERROR  (4.3s)

5/8: Factorial
  üì¶ BASE:      FAIL   (2.7s)
  üéØ FINE-TUNED: FAIL   (4.3s)

6/8: Reverse Words
  üì¶ BASE:      ERROR  (3.1s)
  üéØ FINE-TUNED: ERROR  (3.0s)

7/8: Count Chars
  üì¶ BASE:      PASS   (3.1s)
  üéØ FINE-TUNED: PASS   (4.4s)

8/8: Merge Lists
  üì¶ BASE:      ERROR  (3.2s)
  üéØ FINE-TUNED: PASS   (4.3s)

üìä HARDER TESTS RESULTS

Base:       1/8 (12.5%)
Fine-Tuned: 2/8 (25.0%)

Improvement: +12.5%

üéØ Fine-tuned is 12.5% better on algorithmic tasks!
