# octo-llm GRPO Fine-Tune

**Qwen3-1.7B** → QLoRA + GRPO (compiler-as-reward-model) → Merge → Q5_K_M GGUF

The LLM doesn't need to know how to code — it needs to understand what the user wants.
OctoFlow's compiler handles correctness. GRPO trains the model to generate code the compiler accepts.

**How it works:**
1. Sample N completions per prompt from Qwen3-1.7B
2. Score each with a Python OctoFlow syntax checker — a heuristic stand-in for the compiler that serves as the reward function
3. Group-rank: better completions get higher advantage
4. Policy gradient update — model learns what makes correct OctoFlow

**No labeled data needed — the compiler IS the teacher.**

Attach dataset `octoflow-training-data`, enable **GPU T4**, Internet ON, then Run All.

In [None]:
# ── Install + GPU Check ──
!pip install -q "transformers>=4.45" "peft>=0.13" "trl>=0.15" "bitsandbytes>=0.44" datasets accelerate sentencepiece

import torch, os, glob, json, random, re, gc

assert torch.cuda.is_available(), "Enable GPU in Kaggle settings!"
gpu_name = torch.cuda.get_device_name(0)
gpu_free = torch.cuda.mem_get_info()[0] / 1024**3
print(f"GPU: {gpu_name} ({gpu_free:.1f} GB free)")

In [None]:
# ── Configuration ──
MODEL_NAME = "Qwen/Qwen3-1.7B"

# Paths
OUTPUT_DIR   = "/kaggle/working/octo-llm-grpo"
ADAPTER_DIR  = "/kaggle/working/octo-llm-grpo-adapter"
MERGED_DIR   = "/kaggle/working/octo-llm-merged"
GGUF_DIR     = "/kaggle/working/octo-llm-gguf"

# QLoRA
LORA_R       = 16
LORA_ALPHA   = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# GRPO
NUM_PROMPTS       = 3000   # prompts to train on (subsampled from 52K bank)
NUM_GENERATIONS   = 4      # completions per prompt
MAX_COMPLETION_LEN = 384   # max tokens per completion
TEMPERATURE       = 0.8    # sampling temperature for diversity
GRPO_LR           = 5e-6   # learning rate (conservative for RL)
GRPO_EPOCHS       = 2
# With trl >= 0.15 the train batch size counts *completions*, not prompts:
# every prompt is repeated NUM_GENERATIONS times by the trainer's sampler.
# NOTE(review): trl 0.15/0.16 additionally require (per-device batch × number
# of processes) to be divisible by NUM_GENERATIONS — confirm against the trl
# version that actually gets installed.
GRPO_BATCH        = 2      # completions per device per forward/backward pass
GRPO_GRAD_ACCUM   = 4      # effective batch = 8 completions = 2 prompt groups

# Reward weights (nominal total = 5.0)
# NOTE(review): score_octoflow only ever *subtracts* the W_NO_MISTAKES term,
# so a flawless completion tops out at 4.5 — confirm whether 5.0 is intended.
W_STRUCTURE   = 1.0   # has OctoFlow keywords/structure
W_BLOCKS      = 1.0   # blocks properly matched (fn..end, for..end)
W_NO_FOREIGN  = 1.0   # no Python/JS/C++ syntax
W_IDIOMS      = 1.0   # correct OctoFlow idioms
W_CONCISE     = 0.5   # conciseness bonus
W_NO_MISTAKES = 0.5   # avoids known LLM mistakes
MAX_REWARD    = 5.0

# One optimizer step consumes GRPO_BATCH × GRPO_GRAD_ACCUM completions, and an
# epoch contains NUM_PROMPTS × NUM_GENERATIONS completions in total.
completions_per_update = GRPO_BATCH * GRPO_GRAD_ACCUM
steps_per_epoch = (NUM_PROMPTS * NUM_GENERATIONS) // completions_per_update
total_completions = NUM_PROMPTS * NUM_GENERATIONS * GRPO_EPOCHS
print(f"Config: {MODEL_NAME}")
print(f"GRPO: {NUM_PROMPTS} prompts × {NUM_GENERATIONS} gens = {NUM_PROMPTS * NUM_GENERATIONS} completions/epoch")
print(f"Steps/epoch: {steps_per_epoch}, Total: {steps_per_epoch * GRPO_EPOCHS}")
# Rough throughput guess (~45 s per 32 sampled-and-scored completions),
# expressed per completion so it does not depend on how steps are batched.
print(f"Est time on T4: ~{total_completions * 45 / 32 / 3600:.1f}h (at ~45s per 32 completions)")

In [None]:
# ── L0 Core Knowledge + System Prompt ──
# This is the core OctoFlow syntax reference, always included in the prompt.
# The context engine provides more detail at inference time, but L0 is the foundation.

# The trailing backslash after the opening quotes keeps the string from
# starting with an empty line. Everything inside the literal is prompt text
# sent to the model verbatim.
L0_CORE = """\
# OctoFlow Core

## Syntax
let x = 42.0              // immutable
let mut y = 0.0            // mutable
fn add(a, b)
    return a + b
end
if x > 0
    print("positive")
elif x == 0
    print("zero")
else
    print("negative")
end
for i in range(0, 10)
    print("{i}")
end
while y < 100
    y = y + 1
end
use "math"                 // import module

## Types
float (f32): 42.0 | int (i64): 42 | string: "hello {name}" | array: [] | map: map() | none
Booleans: 1.0=true, 0.0=false. int+float→float. Operators: + - * / % == != < > <= >= && || !

## Core Builtins
print("text {var}") len(x) type_of(x) int(x) float(x) str(x) is_none(x)
abs(x) sqrt(x) pow(x,n) round(x) floor(x) ceil(x) random()
push(arr, val) pop(arr) slice(arr, s, e) sort_array(arr)
filter(arr, fn(x) cond end) map_each(arr, fn(x) expr end)
let mut m = map() | m["key"] = val | map_get(m, "key") | map_keys(m)
let r = try(expr)  // r.ok r.value r.error

## Rules
- All blocks end with `end`. No braces. No semicolons.
- print() ONLY takes strings: print("{x}") never print(x)
- Functions not methods: push(arr, x) not arr.push(x)
- Modules: use "data/csv" use "stats/descriptive" use "web/http"
"""

# Role/behaviour instructions followed by the full L0 reference. The same
# string is used as the system message for training rows and for evaluation.
SYSTEM_PROMPT = (
    "You are octo-llm, an AI assistant for OctoFlow. "
    "Users describe what they want, and you generate correct OctoFlow code. "
    "Respond ONLY with OctoFlow code — no explanations, no markdown fences.\n\n"
    + L0_CORE
)

# The token figure is a chars/4 rule of thumb, not a real tokenizer count.
print(f"System prompt: {len(SYSTEM_PROMPT)} chars (~{len(SYSTEM_PROMPT)//4} tokens)")

In [None]:
# ── Load Prompt Bank + Chain Discovery Prompts ──
import zipfile

# Locate a training file by name under the known Kaggle mount points
def find_data_file(name):
    """Return the first path whose file name is *name*, or None if absent.

    The attached-dataset directory is searched before the directory that
    zip archives are unpacked into; both are searched recursively.
    """
    search_roots = ("/kaggle/input", "/kaggle/working/training-data")
    for root in search_roots:
        hits = glob.glob(f"{root}/**/{name}", recursive=True)
        if hits:
            return hits[0]
    return None

# Try to find data; if it is not mounted as loose files, unpack zip archives
# from the input directory until batch_001.jsonl turns up.
batch1 = find_data_file("batch_001.jsonl")
if not batch1:
    extract_to = "/kaggle/working/training-data"
    # Always (re)extract: an existing but empty or partial directory from an
    # earlier run must not prevent the archives from being unpacked.
    os.makedirs(extract_to, exist_ok=True)
    # sorted() makes the extraction order reproducible between runs.
    for zf in sorted(glob.glob("/kaggle/input/**/*.zip", recursive=True)):
        print(f"Extracting {zf}...")
        try:
            with zipfile.ZipFile(zf, "r") as z:
                z.extractall(extract_to)
        except zipfile.BadZipFile:
            print(f"  Skipping {zf}: not a valid zip archive")
            continue
        # Stop as soon as the expected file is available.
        batch1 = find_data_file("batch_001.jsonl")
        if batch1:
            break

assert batch1, "Training data not found! Attach octoflow-training-data dataset."

def extract_prompts(files, min_len=10, max_len=500):
    """Extract unique prompts from JSONL files.

    Args:
        files: iterable of file paths; ``None`` entries and paths that do not
            exist are skipped.
        min_len: a prompt is kept only if it is strictly longer than this.
        max_len: a prompt is kept only if it is strictly shorter than this
            (drops the very long error-fix prompts).

    Returns:
        List of prompt strings in first-seen order, without duplicates.

    Lines that are not valid JSON, records that are not JSON objects, and
    records whose ``prompt`` is not a string are skipped and counted instead
    of aborting the whole load.
    """
    prompts = []
    seen = set()
    malformed = 0
    for fpath in files:
        if not fpath or not os.path.exists(fpath):
            continue
        with open(fpath, "r", encoding="utf-8") as f:
            for raw in f:
                line = raw.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    malformed += 1
                    continue
                if not isinstance(obj, dict):
                    malformed += 1
                    continue
                p = obj.get("prompt", "")
                if not isinstance(p, str):
                    # e.g. a chat-style list of messages: unhashable, and not
                    # usable as a plain user prompt.
                    malformed += 1
                    continue
                if min_len < len(p) < max_len and p not in seen:
                    prompts.append(p)
                    seen.add(p)
    if malformed:
        print(f"Skipped {malformed} malformed record(s) while loading prompts")
    return prompts

# Resolve every known training file; a file that is not present resolves to
# None, which extract_prompts skips.
data_files = list(map(find_data_file, (
    "batch_001.jsonl",
    "batch_002_medium.jsonl",
    "batch_003_hard.jsonl",
    "batch_004_post3b.jsonl",
    "dpo_001.jsonl",
)))
all_prompts = extract_prompts(data_files)
print(f"Loaded {len(all_prompts)} unique prompts from training data")

# ── Chain Discovery Prompts ──
# Compositional tasks that require chaining multiple OctoFlow modules.
# The model learns to compose primitives: csv+stats, image+transform, http+json, etc.
# Hand-written user requests, grouped by domain; they are appended to the
# prompts loaded from the JSONL training files.
CHAIN_PROMPTS = [
    # Data pipelines
    "Read a CSV file, filter rows where price > 100, compute the average, and print it",
    "Load CSV data, sort by date column, compute running total of amounts, print final total",
    "Read a JSON config file, extract the database host and port, print connection string",
    "Parse a CSV with headers, group rows by category, count each group, print summary table",
    "Read two CSV files, merge them by matching id column, export combined result",
    "Load a dataset, remove rows with missing values, compute mean of each column, print report",
    "Read a log file line by line, parse timestamps, count entries per hour, print histogram",
    "Import CSV data, compute min max mean stddev for price column, print statistics summary",
    "Read a JSON array file, filter objects where status equals active, count and print result",
    "Load CSV sales data, compute total revenue per product, sort descending, print top 5",
    "Read a text file, split into words, count unique words, print the 10 most frequent",
    "Parse CSV stock data, compute daily returns from close prices, print mean return",
    # Web + data
    "Create an HTTP server that reads config from a JSON file and responds with formatted data",
    "Build a REST API with GET /items that reads from a CSV and returns JSON",
    "Create an HTTP health check endpoint that returns server uptime as JSON",
    "Build a webhook receiver that logs incoming POST data to a CSV file",
    "Create an HTTP server with two endpoints: GET /status and POST /data that saves to file",
    # ML/Stats chains
    "Load stock prices from CSV, compute 20-day SMA and 20-day EMA, print both values",
    "Read a dataset, split 80/20 into train and test sets, train KNN with k=5, print accuracy",
    "Generate 1000 random numbers, compute mean median and standard deviation, print results",
    "Load two price arrays, compute their Pearson correlation coefficient, print it",
    "Read sensor data from CSV, compute z-scores, flag readings where abs z-score > 3",
    "Compute RSI for stock closing prices with period 14, print the latest RSI value",
    "Load a dataset with x and y columns, run linear regression, print slope and intercept",
    "Generate random data points, run k-means clustering with k=3, print cluster centers",
    "Read exam scores from CSV, compute percentile ranks for each student, print results",
    "Load temperature data, compute 7-day moving average, print smoothed values",
    # Image/media chains
    "Load a BMP image, convert each pixel to grayscale by averaging RGB channels, save result",
    "Create a 256x256 gradient image from black to white, save as BMP file",
    "Read a BMP image, flip it horizontally by reversing each row, save the mirrored image",
    "Generate a sine wave audio signal at 440 Hz for 2 seconds, save as WAV file",
    "Load a BMP image, increase brightness by adding 30 to each channel, clamp to 255, save",
    # GPU chains
    "Generate two random vectors with 10000 elements on GPU, compute their dot product, print it",
    "Create two 100x100 matrices on GPU, multiply them, print the sum of the result",
    "Generate 1 million random numbers on GPU, compute their mean and max, print both",
    "Compute softmax of a 1000-element vector on GPU, verify values sum to approximately 1.0",
    "Upload an array to GPU, scale every element by 3.5, download result, print first 10 values",
    # Crypto + encoding
    "Read a file as string, compute its SHA-256 hash, print the hex digest",
    "Generate a random UUID, base64-encode it, print both original and encoded form",
    "Read a password string, hash it with SHA-256, compare with a stored hash, print match result",
    "Encode a string as base64, decode it back, verify round-trip produces original string",
    # Multi-domain compositions
    "Create a function to validate email addresses using string operations, test with 5 examples",
    "Implement binary search on a sorted array of 1000 integers, print whether target was found",
    "Build a priority queue using an array with push and pop-min operations, test with 10 items",
    "Read environment variables for HOST and PORT, validate they exist, print server address",
    "Create a timer that measures execution time of sorting 10000 numbers, print elapsed seconds",
    "Build a simple key-value store with set get and delete functions, demonstrate usage",
    "Generate multiplication table for 1 through 12, format as aligned columns, print it",
    "Implement Caesar cipher with encrypt and decrypt functions, test with a sample message",
    "Read a CSV file, pivot data by category column, compute sum per category, export as JSON",
    "Generate Fibonacci sequence up to 50 terms, filter for even numbers, print them",
    "Load a text file, count sentences by splitting on periods, compute average sentence length",
    "Create a function to check if a string is a palindrome, test with 5 example strings",
    "Read command-line args, parse named flags, print each flag name and value",
    "Build a simple calculator that evaluates addition subtraction multiplication division from input",
    "Generate 100 random integers, sort them, use binary search to find specific values",
]

# Merge the chain prompts into the bank without duplicating anything, so that
# re-running this cell does not keep growing all_prompts.
chain_unique = list(dict.fromkeys(CHAIN_PROMPTS))
chain_set = set(chain_unique)
bank_prompts = [p for p in all_prompts if p not in chain_set]
all_prompts = bank_prompts + chain_unique
print(f"Added {len(chain_unique)} chain discovery prompts → {len(all_prompts)} total")

# Subsample for GRPO training.
# The chain prompts are a tiny fraction of the bank, so shuffling everything
# together and truncating would drop almost all of them. Reserve a slot for
# every chain prompt first and fill the remainder from the shuffled bank.
random.seed(42)
random.shuffle(bank_prompts)
n_chain = min(len(chain_unique), NUM_PROMPTS)
train_prompts = chain_unique[:n_chain] + bank_prompts[:NUM_PROMPTS - n_chain]
random.shuffle(train_prompts)  # interleave chain and bank prompts
assert train_prompts, "No prompts available for GRPO training"
print(f"Using {len(train_prompts)} prompts for GRPO training")

# Build dataset with chat format: each row is a system + user message pair
# stored under the "prompt" column.
from datasets import Dataset
dataset_rows = [{"prompt": [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": p},
]} for p in train_prompts]

train_dataset = Dataset.from_list(dataset_rows)
print(f"Dataset ready: {len(train_dataset)} rows")
print(f"Sample: {train_dataset[0]['prompt'][1]['content'][:80]}...")

In [None]:
# ── OctoFlow Reward Function ──
# The compiler-as-reward-model: scores generated OctoFlow code quality.
# No human labeling needed — syntax rules are the teacher.

def extract_flow_code(text):
    """Extract OctoFlow code from an LLM response.

    Args:
        text: raw decoded completion (may be empty or None).

    Returns:
        The code as a stripped string, or "" when nothing code-like is found.

    Resolution order:
      1. Discard any ``<think>…</think>`` reasoning so it is never scored as
         code; an unterminated ``<think>`` means the answer was cut off
         before it started, so everything from that tag onwards is dropped.
      2. A fenced block tagged ``flow`` or ``octoflow``.
      3. The first fenced block with any (or no) language tag.
      4. The whole text, if it contains an OctoFlow-looking keyword.
    """
    if not text or not text.strip():
        return ""

    # Keep only what follows the reasoning block. rsplit also covers chat
    # templates that open <think> in the prompt, leaving a bare </think>.
    if '</think>' in text:
        text = text.rsplit('</think>', 1)[1]
    elif '<think>' in text:
        text = text.split('<think>', 1)[0]
    if not text.strip():
        return ""

    # Try ```flow ... ``` or ```octoflow ... ```
    m = re.search(r'```(?:flow|octoflow)\s*\n(.*?)```', text, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Try a fence with any info string (```python, ```text, or a bare ```), so
    # the fence markers themselves never end up in the scored code.
    m = re.search(r'```[^\n`]*\n(.*?)```', text, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Raw code — if it looks like OctoFlow, use as-is
    if any(kw in text for kw in ['let ', 'fn ', 'print(', 'for ', 'use "']):
        return text.strip()
    return ""

# Matches a double-quoted string literal (backslash escapes allowed) or a
# // line comment. Strings come first so "http://x" is not read as a comment.
_STRING_OR_COMMENT_RE = re.compile(r'"(?:\\.|[^"\\\n])*"|//[^\n]*')

# (pattern, penalty) pairs for syntax borrowed from Python/JS/Java/C++.
# Keywords are anchored with \b so identifiers that merely end in one
# (e.g. `myvar `, `subclass `) are not mistaken for foreign syntax.
_FOREIGN_SYNTAX = (
    (re.compile(r'\bdef '), 0.4),
    (re.compile(r'\bclass '), 0.4),
    (re.compile(r'\bimport '), 0.3),
    (re.compile(r'\bfrom '), 0.2),
    (re.compile(r'\bself\.'), 0.3),
    (re.compile(r'\bconsole\.log'), 0.4),
    (re.compile(r'\bSystem\.out'), 0.4),
    (re.compile(r'\bconst '), 0.3),
    (re.compile(r'\bvar '), 0.3),
    (re.compile(r'#include'), 0.4),
    (re.compile(r'\bstd::'), 0.4),
    (re.compile(r'\bpublic '), 0.3),
)


def _strip_strings_and_comments(code):
    """Empty out string literals and drop // comments, keeping the code."""
    return _STRING_OR_COMMENT_RE.sub(
        lambda m: '""' if m.group(0).startswith('"') else '', code)


def score_octoflow(code):
    """Score OctoFlow code quality with syntax heuristics.

    Args:
        code: candidate OctoFlow source (already extracted from the response).

    Returns:
        A float clamped to [0.0, MAX_REWARD]; 0.0 for empty input. The
        known-mistakes term is only ever subtracted, so the highest reachable
        score is the sum of the other five weights.

    Components: keyword structure, opener/`end` balance, absence of foreign
    syntax, OctoFlow idioms, conciseness, minus known LLM mistakes.
    """
    if not code or not code.strip():
        return 0.0

    # Non-blank lines, stripped once up front (every check works on the
    # stripped form).
    lines = [l.strip() for l in code.split('\n') if l.strip()]

    score = 0.0

    # ── 1. Structure (+1.0): Has OctoFlow keywords ──
    octo_keywords = ('let ', 'fn ', 'print(', 'println(',
                     'for ', 'if ', 'while ', 'use "', 'match ', 'struct ')
    if any(s.startswith(octo_keywords) for s in lines):
        score += W_STRUCTURE
    elif len(lines) >= 2:
        score += W_STRUCTURE * 0.3  # has content, just no obvious keywords

    # ── 2. Block matching (+1.0): fn/for/if/while..end ──
    openers = 0
    closers = 0
    for s in lines:
        if s.startswith(('fn ', 'for ', 'while ', 'match ')):
            openers += 1
        # NOTE(review): `if r.` lines are deliberately not counted as block
        # openers; the reason is not evident from this file — confirm.
        elif s.startswith('if ') and not s.startswith('if r.'):
            openers += 1
        if s == 'end':
            closers += 1
    if openers == closers:
        score += W_BLOCKS
    elif openers > 0 and abs(openers - closers) <= 1:
        score += W_BLOCKS * 0.5

    # ── 3. No foreign syntax (+1.0) ──
    # Scan with string contents and comments removed so prose such as
    # print("rows from file") is not penalised as a Python `from` import.
    bare = _strip_strings_and_comments(code)
    penalty = 0.0
    for pattern, w in _FOREIGN_SYNTAX:
        if pattern.search(bare):
            penalty += w
    # Curly-brace blocks (not string interpolation "{var}"); charged once.
    if any(s in ('{', '}') or s.endswith('{') or s.startswith('}') for s in lines):
        penalty += 0.3
    # Semicolons at end of statements; charged per offending line.
    for s in lines:
        if s.endswith(';') and not s.startswith('//'):
            penalty += 0.1
    score += max(0.0, W_NO_FOREIGN - penalty)

    # ── 4. OctoFlow idioms (+1.0) ──
    idiom = 0.0
    if re.search(r'print\s*\(\s*".*\{.*\}.*"\s*\)', code):
        idiom += 0.3   # string interpolation in print
    elif 'print(' in code:
        idiom += 0.1   # at least uses print
    if 'let ' in code:
        idiom += 0.2
    if any(s.startswith('use "') for s in lines):
        idiom += 0.2   # module import
    if any(s.startswith('fn ') for s in lines):
        idiom += 0.15  # function definition
    if any(kw in code for kw in ['push(', 'pop(', 'len(', 'sort_array(', 'map_each(']):
        idiom += 0.15  # function-style calls (not methods)
    score += min(idiom, W_IDIOMS)

    # ── 5. Conciseness (+0.5) ──
    n = len(lines)
    if 1 <= n <= 25:
        score += W_CONCISE
    elif n <= 50:
        score += W_CONCISE * 0.5

    # ── 6. Known mistakes penalty (-0.5 max) ──
    mistakes = 0.0
    if re.search(r'print\s*\(\s*[a-zA-Z_]\w*\s*\)', code):
        mistakes += 0.15   # print(var) instead of print("{var}")
    if re.search(r'\.\s*push\s*\(', code):
        mistakes += 0.15   # arr.push() instead of push(arr, x)
    if re.search(r'\.\s*length\b', code):
        mistakes += 0.1    # arr.length instead of len(arr)
    if re.search(r'\belif\s*:', code):
        mistakes += 0.1    # elif: with colon
    if re.search(r'\bTrue\b', code):
        mistakes += 0.1    # True instead of 1.0
    if re.search(r'\bFalse\b', code):
        mistakes += 0.1    # False instead of 0.0
    score -= min(mistakes, W_NO_MISTAKES)

    return max(0.0, min(score, MAX_REWARD))


def _completion_text(completion):
    """Return the generated text of one completion as a plain string.

    Handles a bare string, a single chat message dict, and a list of chat
    message dicts (the form used when the dataset is conversational).
    """
    if isinstance(completion, str):
        return completion
    if isinstance(completion, dict):
        content = completion.get("content") or ""
        return content if isinstance(content, str) else str(content)
    if isinstance(completion, (list, tuple)):
        return "\n".join(_completion_text(msg) for msg in completion)
    return str(completion)


def octoflow_reward(completions, **kwargs):
    """GRPO reward function. Scores each completion.

    Args:
        completions: one entry per sampled completion. Because the training
            dataset stores prompts as chat messages, each entry arrives as a
            list of message dicts rather than a string; the message content
            is what gets scored, never the Python repr of the container.
        **kwargs: extra columns passed by the trainer; unused.

    Returns:
        List of float rewards, one per completion, in the same order.
    """
    return [
        float(score_octoflow(extract_flow_code(_completion_text(c))))
        for c in completions
    ]


# ── Sanity check the reward function ──
# Each case is (label, code snippet, lowest score considered acceptable).
# A score under its floor is reported as LOW; nothing is raised.
tests = [
    ("Good OctoFlow",
     'let x = 42\nlet y = x * 2\nprint("{y}")', 2.5),
    ("With function",
     'fn factorial(n)\n    if n <= 1\n        return 1\n    end\n    return n * factorial(n - 1)\nend\nlet r = factorial(5)\nprint("{r}")', 3.5),
    ("With module",
     'use "data/csv"\nlet data = read_csv("input.csv")\nlet n = len(data)\nprint("{n} rows loaded")', 3.5),
    ("Python code",
     'def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)\nprint(factorial(5))', 0.5),
    ("JavaScript",
     'const x = 42;\nconsole.log(x);', 0.0),
    ("Empty", '', 0.0),
]

print("Reward function sanity check:")
for label, snippet, floor in tests:
    got = score_octoflow(snippet)
    verdict = "LOW" if got < floor else "OK"
    print(f"  [{verdict}] {label}: {got:.1f} (expected >= {floor})")

In [None]:
# ── Load Model with QLoRA ──
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Batched generation needs a pad token; fall back to EOS if none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# bfloat16 is only native on compute capability 8.x (Ampere) and newer. The
# T4 this notebook targets is 7.5, so fall back to float16 there.
compute_dtype = (
    torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
)

# 4-bit NF4 weights with double quantization; matmuls run in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config=bnb_config,
    device_map="auto", trust_remote_code=True,
)
# Prepares the quantized model for training and turns on gradient
# checkpointing before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# LoRA adapters on all attention and MLP projections.
lora_config = LoraConfig(
    r=LORA_R, lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none", task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print(f"\nModel loaded: {MODEL_NAME}")
print(f"GPU memory: {torch.cuda.memory_allocated()/1024**3:.1f} GB / "
      f"{torch.cuda.mem_get_info()[1]/1024**3:.1f} GB")

In [None]:
# ── GRPO Training ──
# Group Relative Policy Optimization: no separate reward model or value function.
# Advantages are computed within each group of completions for the same prompt.
# Completion that scores highest in its group gets positive advantage → reinforced.

from trl import GRPOTrainer, GRPOConfig

# Mixed precision must match the hardware: bf16 only on compute capability
# 8.x+ (Ampere or newer); otherwise (e.g. T4, 7.5) train in fp16.
use_bf16 = torch.cuda.get_device_capability(0)[0] >= 8

grpo_config = GRPOConfig(
    output_dir=OUTPUT_DIR,

    # GRPO generation
    num_generations=NUM_GENERATIONS,
    max_completion_length=MAX_COMPLETION_LEN,
    # Without this the configured TEMPERATURE is never applied and sampling
    # silently uses the library default.
    temperature=TEMPERATURE,
    # NOTE(review): the system prompt alone is several hundred tokens; check
    # it is not truncated by the trainer's default prompt-length limit.

    # Training
    num_train_epochs=GRPO_EPOCHS,
    per_device_train_batch_size=GRPO_BATCH,
    gradient_accumulation_steps=GRPO_GRAD_ACCUM,
    learning_rate=GRPO_LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.10,
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Memory optimization
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="adamw_8bit",

    # Logging
    logging_steps=5,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    report_to="none",
    seed=42,
    # Keep the chat-format "prompt" column intact for the trainer.
    remove_unused_columns=False,
    dataloader_num_workers=2,
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[octoflow_reward],
    args=grpo_config,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)

# With trl >= 0.15 the train batch size counts completions: one optimizer
# update consumes GRPO_BATCH × GRPO_GRAD_ACCUM completions, i.e. that many
# divided by NUM_GENERATIONS prompt groups.
completions_per_update = GRPO_BATCH * GRPO_GRAD_ACCUM
updates_per_epoch = len(train_dataset) * NUM_GENERATIONS // completions_per_update
total_steps = updates_per_epoch * GRPO_EPOCHS
print(f"Starting GRPO training:")
print(f"  {GRPO_EPOCHS} epochs × {updates_per_epoch} steps = {total_steps} total steps")
print(f"  {NUM_GENERATIONS} completions/prompt, {MAX_COMPLETION_LEN} max tokens")
print(f"  Effective batch: {completions_per_update} completions = "
      f"{completions_per_update / NUM_GENERATIONS:g} prompt groups/update")
print(f"  Precision: {'bf16' if use_bf16 else 'fp16'}, temperature {TEMPERATURE}")

trainer.train()

# Save adapter (LoRA weights only) together with the tokenizer files.
os.makedirs(ADAPTER_DIR, exist_ok=True)
trainer.save_model(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print(f"\nGRPO adapter saved to {ADAPTER_DIR}")

del trainer
torch.cuda.empty_cache()

In [None]:
# ── Post-Training Evaluation ──
from peft import PeftModel

# Reload the 4-bit base model and attach the adapter that was just saved, so
# the evaluation exercises exactly what was written to ADAPTER_DIR.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config=bnb_config,
    device_map="auto", trust_remote_code=True,
)
grpo_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
grpo_model.eval()

eval_prompts = [
    "Print hello world",
    "Read a CSV file and print the number of rows",
    "Create a function that returns the factorial of a number",
    "Sort an array of numbers in ascending order and print the result",
    "Create a simple HTTP server that responds with JSON",
    "Read lines from a file, filter lines containing error, and count them",
    "Compute the SHA-256 hash of a string and print the hex digest",
    "Generate random vectors on GPU, compute their dot product, print result",
    "Load CSV data, filter rows where value > 50, print how many remain",
    "Read stock prices, compute 20-day moving average, print the latest value",
]

# Score thresholds for the per-prompt label and the overall verdict.
PASS_REWARD = 3.0
WEAK_REWARD = 1.5
VERDICT_AVG = 2.5

total_reward = 0.0
results = []
print("Post-GRPO Evaluation")
print("=" * 60)

for prompt in eval_prompts:
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(grpo_model.device)

    with torch.no_grad():
        # Greedy decoding. No temperature is passed: it is a sampling-only
        # setting and conflicts with do_sample=False. The token budget matches
        # training so blocks are not cut off before their closing `end`.
        out = grpo_model.generate(
            **inputs, max_new_tokens=MAX_COMPLETION_LEN, do_sample=False,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    code = extract_flow_code(gen)
    reward = score_octoflow(code)
    total_reward += reward
    results.append((prompt, reward, gen[:200]))

    status = "PASS" if reward >= PASS_REWARD else "WEAK" if reward >= WEAK_REWARD else "FAIL"
    print(f"\n[{status} {reward:.1f}] >>> {prompt}")
    print(gen[:300])
    print("─" * 40)

avg_reward = total_reward / len(eval_prompts)
passing = sum(1 for _, r, _ in results if r >= PASS_REWARD)
print(f"\n{'=' * 60}")
print(f"Average reward: {avg_reward:.2f} / {MAX_REWARD}")
print(f"Passing (>= 3.0): {passing}/{len(eval_prompts)} ({passing/len(eval_prompts)*100:.0f}%)")
print(f"Verdict: {'PASS' if avg_reward >= VERDICT_AVG else 'NEEDS MORE TRAINING'}")

del grpo_model, base_model
torch.cuda.empty_cache()

In [None]:
# ── Merge LoRA + Convert to GGUF ──
# Pipeline: load the fp16 base on CPU → merge the adapter → save HF weights →
# convert to an f16 GGUF → build llama-quantize → write Q5_K_M and Q4_K_M.

# Merge on CPU to save GPU memory
print("Merging LoRA adapter into base model (CPU)...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, device_map="cpu", trust_remote_code=True,
)
merged = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
# Fold the LoRA deltas into the base weights and drop the adapter wrappers.
merged = merged.merge_and_unload()

os.makedirs(MERGED_DIR, exist_ok=True)
merged.save_pretrained(MERGED_DIR)
# The converter reads the tokenizer from the same directory as the weights.
tokenizer.save_pretrained(MERGED_DIR)
print(f"Merged model saved to {MERGED_DIR}")

# Release the in-memory copies before the conversion subprocesses run.
del merged, base_model
torch.cuda.empty_cache()
gc.collect()

# ── Convert to GGUF ──
os.makedirs(GGUF_DIR, exist_ok=True)
LLAMA_CPP = "/kaggle/working/llama.cpp"

# Shallow-clone llama.cpp once; a re-run reuses the existing checkout.
if not os.path.exists(LLAMA_CPP):
    !git clone --depth 1 https://github.com/ggml-org/llama.cpp {LLAMA_CPP}

# Python packages needed by the conversion script.
!pip install -q gguf numpy sentencepiece

fp16_gguf = os.path.join(GGUF_DIR, "octo-llm-f16.gguf")
print("\nConverting to FP16 GGUF...")
!python {LLAMA_CPP}/convert_hf_to_gguf.py {MERGED_DIR} --outfile {fp16_gguf} --outtype f16
# Shell magics do not raise on failure, so check for the output file instead.
assert os.path.exists(fp16_gguf), "FP16 conversion failed"
print(f"FP16 GGUF: {os.path.getsize(fp16_gguf)/1024**3:.2f} GB")

# Build quantize tool
# Only the tail of the cmake output is shown to keep the cell output short.
print("\nBuilding llama-quantize...")
!cd {LLAMA_CPP} && cmake -B build -DCMAKE_BUILD_TYPE=Release 2>&1 | tail -3
!cd {LLAMA_CPP} && cmake --build build --target llama-quantize -j$(nproc) 2>&1 | tail -5

# Search the build tree for the binary rather than hard-coding its location;
# the first match reported by `find` is used.
import subprocess
result = subprocess.run(
    ["find", LLAMA_CPP + "/build", "-name", "llama-quantize", "-type", "f"],
    capture_output=True, text=True,
)
quantize_bin = result.stdout.strip().split("\n")[0]
assert quantize_bin and os.path.exists(quantize_bin), f"llama-quantize not found"
print(f"quantize binary: {quantize_bin}")

# Q5_K_M — matches base model quality (recommended for OctoFlow chat)
q5_gguf = os.path.join(GGUF_DIR, "octo-llm-v0.2-Q5_K_M.gguf")
print(f"\nQuantizing to Q5_K_M...")
!{quantize_bin} {fp16_gguf} {q5_gguf} Q5_K_M
assert os.path.exists(q5_gguf), "Q5_K_M quantization failed"

# Q4_K_M — smaller alternative
q4_gguf = os.path.join(GGUF_DIR, "octo-llm-v0.2-Q4_K_M.gguf")
print(f"Quantizing to Q4_K_M...")
!{quantize_bin} {fp16_gguf} {q4_gguf} Q4_K_M
assert os.path.exists(q4_gguf), "Q4_K_M quantization failed"

# Report each artifact's size in GiB and the quantized sizes relative to f16.
fp16_sz = os.path.getsize(fp16_gguf) / 1024**3
q5_sz = os.path.getsize(q5_gguf) / 1024**3
q4_sz = os.path.getsize(q4_gguf) / 1024**3
print(f"\nFP16:    {fp16_sz:.2f} GB")
print(f"Q5_K_M:  {q5_sz:.2f} GB ({q5_sz/fp16_sz*100:.0f}%)")
print(f"Q4_K_M:  {q4_sz:.2f} GB ({q4_sz/fp16_sz*100:.0f}%)")

In [None]:
# ── Done ──
print("=" * 60)
print("octo-llm GRPO Training Complete!")
print("=" * 60)
print(f"\nMethod: GRPO (compiler-as-reward-model)")
print(f"Base model: {MODEL_NAME}")
# Report the prompt mix that was actually trained on. Deriving it from the
# constants would overstate the chain share (the subsample may have dropped
# chain prompts) and the total (the bank may hold fewer than NUM_PROMPTS).
chain_set = set(CHAIN_PROMPTS)
n_chain_used = sum(1 for p in train_prompts if p in chain_set)
n_bank_used = len(train_prompts) - n_chain_used
print(f"Prompts: {len(train_prompts)} ({n_bank_used} from bank + {n_chain_used} chain)")
print(f"Generations: {NUM_GENERATIONS} per prompt")
print(f"Reward: Python OctoFlow syntax checker (max {MAX_REWARD})")
print()

# Artifact inventory: directories are summed recursively, files sized
# directly, and anything absent is flagged as MISSING.
artifacts = {
    "GRPO Adapter": ADAPTER_DIR,
    "Merged Model": MERGED_DIR,
    "FP16 GGUF": os.path.join(GGUF_DIR, "octo-llm-f16.gguf"),
    "Q5_K_M GGUF": os.path.join(GGUF_DIR, "octo-llm-v0.2-Q5_K_M.gguf"),
    "Q4_K_M GGUF": os.path.join(GGUF_DIR, "octo-llm-v0.2-Q4_K_M.gguf"),
}
for name, path in artifacts.items():
    if not os.path.exists(path):
        print(f"  {name}: MISSING")
        continue
    if os.path.isdir(path):
        sz = sum(os.path.getsize(os.path.join(d, f))
                 for d, _, fns in os.walk(path) for f in fns)
    else:
        sz = os.path.getsize(path)
    print(f"  {name}: {sz/1024**2:.1f} MB")

print(f"\nNext steps:")
print(f"  1. Download octo-llm-v0.2-Q5_K_M.gguf")
print(f"  2. Copy to C:\\OctoFlow\\models\\octo-llm-v0.2.gguf")
print(f"  3. Test: octoflow chat --model models/octo-llm-v0.2.gguf")
print(f"  4. Run M9 validation: OctoFlowBench Pass@1 (base vs GRPO-trained)")