In [None]:
import os
# Auto-generated setup for portability
if 'google.colab' in str(get_ipython()):
    # Assume data is mounted or downloaded to current dir in Colab
    BASE_DIR = os.getcwd()
else:
    # Local execution
    BASE_DIR = os.getcwd()


In [None]:
# extract_deepseek_5k_code_weighted.py — Stop exactly at 5k, prioritize CODE examples

import json
from pathlib import Path
from unsloth import FastLanguageModel
import statistics
from tqdm import tqdm
import random
import re

random.seed(42)

# Paths
DATASET_FILE = Path("Section-B/am_0.9M.jsonl")
ELITE_MODEL_DIR = Path("Section-A")
OUTPUT_FILE = Path("Section-B/deepseek_thinking_5k_code_512.jsonl")

# -----------------------------
# Tokenizer
# -----------------------------
def load_tokenizer():
    """Load tokenizer from Elite Math model (kept as-is)"""
    print("🔧 Loading Elite Math tokenizer...")
    _, tokenizer = FastLanguageModel.from_pretrained(
        model_name=str(ELITE_MODEL_DIR),
        max_seq_length=2048,
        dtype="bfloat16",
        load_in_4bit=True,
        device_map={"": 0},
        trust_remote_code=True,
    )
    print("✅ Tokenizer loaded")
    return tokenizer

# -----------------------------
# Code heuristics
# -----------------------------
CODE_KEYWORDS_USER = [
    "implement", "write a function", "write code", "algorithm", "optimize",
    "time complexity", "space complexity", "python", "javascript", "java",
    "pseudocode", "bug", "debug", "fix the code", "refactor", "unit test",
    "input:", "output:", "constraints", "edge case", "api", "class", "function",
    "return", "list", "array", "string", "tree", "graph", "dynamic programming",
    "greedy", "binary search", "dfs", "bfs", "two pointers", "sliding window",
]

CODE_PATTERNS_ASSISTANT = [
    r"''''"
    r"\bdef\s+\w+\s*$$",  # python def
    r"\bclass\s+\w+\s*:", # python class
    r"\breturn\b",        # return statements
    r"\bfor\s+\w+\s+in\b",
    r"\bwhile\s+$$",
    r"\bif\s+\w+",
    r"\bimport\s+\w+",
    r"\bfrom\s+\w+\s+import\b",
    r";\s*$",             # line endings (C/JS style) – soft signal
]

MATH_EXCLUDES = [
    "derivative", "integral", "theorem", "lemma", "proof", "algebra",
    "trigonometry", "geometry", "matrix", "polynomial", "equation",
]

def is_code_like(user_msg: str, assistant_msg: str) -> int:
    """
    Returns an integer code_score:
      - +2 if user has code keywords
      - +3 if assistant shows code patterns
      - -2 if user looks like pure math (to avoid math-only)
    Higher score => more likely code example.
    """
    score = 0
    u = user_msg.lower()
    a = assistant_msg

    # Penalize math-heavy user queries so we don't fill with math
    if any(mx in u for mx in MATH_EXCLUDES):
        score -= 2

    # User keywords
    if any(kw in u for kw in CODE_KEYWORDS_USER):
        score += 2

    # Assistant code patterns
    for pat in CODE_PATTERNS_ASSISTANT:
        if re.search(pat, assistant_msg):
            score += 3
            break  # one strong code pattern is enough

    return score

# -----------------------------
# Original thinking quality + code bias
# -----------------------------
def check_example_quality(example):
    """Quality check with added code bias"""
    try:
        if "messages" in example and len(example["messages"]) >= 2:
            user_msg = ""
            assistant_msg = ""
            for msg in example["messages"]:
                role = msg.get("role")
                content = msg.get("content", "")
                if role == "user":
                    user_msg = content
                elif role == "assistant":
                    assistant_msg = content

            if len(assistant_msg) < 50:
                return False, 0

            # Base thinking score
            quality_score = 1
            thinking_indicators = [
                "<|begin_of_thought|>", "let me think", "first,", "then,",
                "therefore", "because", "since", "step by step", "reasoning",
            ]
            al = assistant_msg.lower()
            for indicator in thinking_indicators:
                if indicator in al:
                    quality_score += 1

            # Add code bias
            code_score = is_code_like(user_msg, assistant_msg)
            quality_score += max(0, code_score)  # only add positive code signal

            # Hard filter: if clearly math-only and no code patterns, drop
            if code_score <= -2:
                return False, 0

            return True, quality_score
        return False, 0
    except:
        return False, 0

# -----------------------------
# Formatting + token counting
# -----------------------------
def format_and_tokenize(example, tokenizer):
    """Format DeepSeek messages and get token count"""
    try:
        if "messages" not in example:
            return None, 0

        messages = example["messages"]
        if len(messages) < 2:
            return None, 0

        user_msg = ""
        assistant_msg = ""
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
            if role == "user":
                user_msg = content
            elif role == "assistant":
                assistant_msg = content

        if not user_msg or not assistant_msg:
            return None, 0

        formatted_example = {
            "system": "You are a helpful assistant for coding tasks.",
            "conversations": [
                {"role": "user", "value": user_msg},
                {"role": "assistant", "value": assistant_msg},
            ],
        }

        training_messages = [
            {"role": "system", "content": "You are a helpful assistant for coding tasks."},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ]

        if hasattr(tokenizer, "chat_template"):
            formatted_text = tokenizer.apply_chat_template(
                training_messages, tokenize=False, add_generation_prompt=False
            )
        else:
            formatted_text = (
                "<|system|>\nYou are a helpful assistant for coding tasks.\n\n"
                f"<|user|>\n{user_msg}\n\n<|assistant|>\n{assistant_msg}"
            )

        tokens = tokenizer(formatted_text, return_tensors="pt", truncation=False)
        # Use shape for reliable token length
        token_count = tokens.input_ids.shape

        return formatted_example, token_count
    except:
        return None, 0

# -----------------------------
# Main extractor
# -----------------------------
def extract_exactly_5k():
    """Extract exactly 5k examples - stop when reached"""
    print("🎯 EXTRACTING EXACTLY 5K CODE-HEAVY THINKING EXAMPLES")
    print("Target: 5,000 examples under 512 tokens (STOP WHEN REACHED)")
    print("=" * 60)

    tokenizer = load_tokenizer()

    if not DATASET_FILE.exists():
        print(f"❌ Dataset not found: {DATASET_FILE}")
        return None

    print(f"📁 Scanning: {DATASET_FILE.name}")
    print("🔍 Prioritizing CODE patterns & algorithmic tasks...")

    selected_examples = []
    token_stats = []
    processed_count = 0

    # Process until we have exactly 5k
    with open(DATASET_FILE, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Finding 5k examples"):
            if len(selected_examples) >= 5000:  # STOP IMMEDIATELY
                print("\n🎉 TARGET REACHED! Found exactly 5,000 examples!")
                break

            try:
                example = json.loads(line.strip())

                # Quality check (with code bias)
                is_good, quality_score = check_example_quality(example)
                if not is_good:
                    processed_count += 1
                    continue

                # Format and token check
                formatted_example, token_count = format_and_tokenize(example, tokenizer)
                if formatted_example and token_count <= 512:
                    # Extra: favor stronger code signals by duplicating chance
                    # but we keep exactly 5k — so use score as acceptance threshold
                    # Accept if score >= 2 to bias toward code; relax if struggling
                    if quality_score >= 2 or len(selected_examples) < 1000:
                        selected_examples.append(formatted_example)
                        token_stats.append(token_count)

                processed_count += 1

                # Progress update every 10k
                if processed_count % 10000 == 0:
                    print(
                        f"  Processed {processed_count:,} | Found {len(selected_examples)}/5000 examples"
                    )

            except:
                processed_count += 1
                continue

    print("\n📊 FINAL RESULTS:")
    print(f"  Total processed: {processed_count:,}")
    print(f"  Examples found: {len(selected_examples)}")
    if processed_count > 0:
        print(f"  Success rate: {len(selected_examples)/processed_count*100:.2f}%")

    if token_stats:
        print(f"  Average tokens: {statistics.mean(token_stats):.1f}")
        print(f"  Max tokens: {max(token_stats)}")
        print(f"  Min tokens: {min(token_stats)}")

    if len(selected_examples) == 0:
        print("❌ No examples found!")
        return None

    print(f"\n🎲 Shuffling {len(selected_examples)} examples for variety...")
    random.shuffle(selected_examples)

    print(f"💾 Saving {len(selected_examples)} examples...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for example in selected_examples:
            f.write(json.dumps(example, ensure_ascii=False) + "\n")

    print(f"✅ Saved to: {OUTPUT_FILE}")

    # Quick quality check
    thinking_count = 0
    code_like_count = 0
    for example in selected_examples[: min(100, len(selected_examples))]:
        if "conversations" in example and len(example["conversations"]) > 1:
            assistant_msg = example["conversations"].get("value", "")
            user_msg = example["conversations"].get("value", "")
            if any(
                indicator in assistant_msg.lower()
                for indicator in ["<|begin_of_thought|>", "let me think", "step by step"]
            ):
                thinking_count += 1
            if is_code_like(user_msg, assistant_msg) >= 2:
                code_like_count += 1

    print("\n🧠 QUALITY ANALYSIS (sample of 100):")
    print(f"  Reasoning patterns: {thinking_count}/100")
    print(f"  Code-like examples: {code_like_count}/100")
    print("  Perfect for code-thinking fine-tuning!")

    # Preview
    print("\n🔍 PREVIEW:")
    sample = selected_examples
    if "conversations" in sample:
        user_preview = sample["conversations"].get("value", "")[:100] + "..."
        assistant_preview = sample["conversations"].get("value", "")[:150] + "..."
        print(f"User: {user_preview}")
        print(f"Assistant: {assistant_preview}")

    return OUTPUT_FILE

def main():
    print("⚡ EFFICIENT 5K DEEPSEEK EXTRACTION — CODE-HEAVY")
    print("Stops at exactly 5k — biased towards code tasks.")
    print("=" * 60)

    output_file = extract_exactly_5k()

    if output_file:
        print("\n🚀 5K CODE-HEAVY DATASET READY!")
        print(f"📁 File: {output_file}")
        print("⚡ Efficient extraction — no wasted time!")
        print("\n📋 READY FOR THINKING TRAINING:")
        print("   - Base: Elite model (Section-A)")
        print("   - Dataset: 5k examples under 512 tokens")
        print("   - Bias: Code/algorithm tasks prioritized")
    else:
        print("❌ Extraction failed")

if __name__ == "__main__":
    main()


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-18 20:51:01 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
⚡ EFFICIENT 5K DEEPSEEK EXTRACTION — CODE-HEAVY
Stops at exactly 5k — biased towards code tasks.
🎯 EXTRACTING EXACTLY 5K CODE-HEAVY THINKING EXAMPLES
Target: 5,000 examples under 512 tokens (STOP WHEN REACHED)
🔧 Loading Elite Math tokenizer...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.10 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ Tokenizer loaded
📁 Scanning: am_0.9M.jsonl
🔍 Prioritizing CODE patterns & algorithmic tasks...


Finding 5k examples: 108441it [08:58, 37.72it/s] 

In [3]:
# quick_verify_5k.py
import json
from pathlib import Path
from transformers import AutoTokenizer
from statistics import mean

FILE = Path("Section-B/deepseek_thinking_5k_code_512.jsonl")
BASE_MODEL = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

print("🔧 Loading tokenizer...")
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
print("✅ Loaded")

BACKTICKS = "`" * 3  # safer than writing "```" directly

n = 0
lens = []
code_like = 0
samples = []

with open(FILE, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        ex = json.loads(line)
        user = ex["conversations"][0]["value"]
        asst = ex["conversations"][1]["value"]

        # Token length of full convo
        text = f"System: Code assistant\nUser: {user}\nAssistant: {asst}"
        enc = tok(text, return_tensors="np")
        lens.append(enc["input_ids"].shape[1])

        # Robust code markers (no raw triple backticks literal)
        markers = (
            BACKTICKS, "def ", "class ", "return ", "import ",
            "for ", "while (", "from ", "};", "{", ");"
        )
        if any(m in asst for m in markers):
            code_like += 1

        if i < 3:
            samples.append((user[:120].replace("\n", " "),
                            asst[:160].replace("\n", " ")))
        n += 1

print("\n📊 Stats")
print(f"  Count: {n}")
if lens:
    print(f"  Avg tokens: {mean(lens):.1f}")
    print(f"  Max: {max(lens)}, Min: {min(lens)}")
else:
    print("  No token lengths computed (empty file?)")
print(f"  Code-like assistant rate: {(code_like/n*100):.1f}%")

print("\n🔍 Preview (3):")
for i, (u, a) in enumerate(samples, 1):
    print(f"\n[{i}] User: {u}...")
    print(f"    Asst: {a}...")


🔧 Loading tokenizer...
✅ Loaded

📊 Stats
  Count: 5000
  Avg tokens: 421.8
  Max: 509, Min: 153
  Code-like assistant rate: 92.5%

🔍 Preview (3):

[1] User: There are two circles A and B of different sizes. The diameter of circle A is 80 centimeters (cm). The radius of circle ...
    Asst: <think>Okay, let's see. The problem says there are two circles, A and B, with different sizes. The diameter of circle A is 80 cm. Then it says the radius of cir...

[2] User: Find the largest integer less than 120 that leaves a remainder of 7 when divided by 8....
    Asst: <think>Okay, so I need to find the largest integer less than 120 that leaves a remainder of 7 when divided by 8. Hmm, let's break this down step by step.  First...

[3] User: Given $\overrightarrow{a}=(2,1),\overrightarrow{b}=(-1,2)$, find $2\overrightarrow{a}+3\overrightarrow{b}=\_\_\_\_\_\_$....
    Asst: <think>Okay, so I need to find the vector that results from adding twice vector a to three times vector b. The vectors given 

In [1]:
# code_thinking_training_resume.py — Stage B for your GRPO-tuned code model

from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth import UnslothTrainingArguments
from datasets import Dataset
from trl import SFTTrainer
from peft import PeftModel
import json, torch, random
from pathlib import Path

# -----------------------------
# Configuration
# -----------------------------
torch.manual_seed(42)
random.seed(42)

# Paths: update LORA_INIT_PATH to your Stage A adapter path
BASE_MODEL_NAME   = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
LORA_INIT_PATH    = "llama-3.2-3b-code-stage-a-vllm/checkpoint-312"  # Stage A LoRA
THINKING_DATASET  = "Section-B/deepseek_thinking_5k_code_512.jsonl"
OUTPUT_DIR        = "Section-B/Elite-Code-Thinking"  # Stage B output

# Fit for RTX 3060 (11.6 GB)
MAX_SEQ_LENGTH = 1408
BATCH_SIZE     = 2
GRAD_ACCUM     = 4
LEARNING_RATE  = 1e-5
EPOCHS         = 1.0
EVAL_SIZE      = 250

# Code reasoning system prompt
THINKING_SYSTEM_PROMPT = (
    "You are a code reasoning assistant. Use <think> ... </think> to outline your step-by-step reasoning, "
    "then provide a clear final solution. For code tasks, include complete, runnable code in fenced blocks. "
    "Be concise outside <think>, keep reasoning inside <think>."
)

# -----------------------------
# Checkpoint resume utilities
# -----------------------------
def find_latest_checkpoint():
    out = Path(OUTPUT_DIR)
    if not out.exists():
        return None
    cks = []
    for p in out.iterdir():
        if p.is_dir() and p.name.startswith("checkpoint-"):
            try:
                step = int(p.name.split("-")[1])
                cks.append((step, str(p)))
            except:
                pass
    if not cks:
        return None
    cks.sort(key=lambda x: x[0])
    print(f"🔄 Found checkpoint: step {cks[-1][0]} at {cks[-1][1]}")
    return cks[-1][1]

# -----------------------------
# Model setup: base + Stage A LoRA
# -----------------------------
def setup_code_model():
    print("🧠 Loading base model...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype="bfloat16" if is_bfloat16_supported() else "float16",
        load_in_4bit=True,
        device_map={"": 0},
        trust_remote_code=True,
    )

    # Attach your Stage A LoRA adapters (continue training them for Stage B)
    print(f"🔗 Loading Stage A LoRA from {LORA_INIT_PATH}")
    model = PeftModel.from_pretrained(model, LORA_INIT_PATH)

    # Ensure only LoRA params are trainable
    for name, p in model.named_parameters():
        p.requires_grad = ("lora_" in name)

    # Optional: gradient checkpointing for memory
    model.enable_input_require_grads()
    try:
        model.gradient_checkpointing_enable()
    except Exception:
        pass

    # Tokenizer safety
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("✅ Model + LoRA ready")
    return model, tokenizer

# -----------------------------
# Data formatting
# -----------------------------
def load_and_format_data():
    print("📚 Loading dataset...")
    items = []
    with open(THINKING_DATASET, "r", encoding="utf-8") as f:
        for line in f:
            items.append(json.loads(line.strip()))
    random.shuffle(items)
    train_items = items[:-EVAL_SIZE]
    eval_items  = items[-EVAL_SIZE:]

    def fmt(ex):
        conv = ex["conversations"]
        user = conv[0]["value"]
        asst = conv[1]["value"]
        text = (
            f"<|system|>\n{THINKING_SYSTEM_PROMPT}\n\n"
            f"<|user|>\n{user}\n\n"
            f"<|assistant|>\n{asst}<|end_of_text|>"
        )
        return {"text": text}

    train_ds = Dataset.from_list([fmt(x) for x in train_items])
    eval_ds  = Dataset.from_list([fmt(x) for x in eval_items])
    print(f"✅ Training: {len(train_items)} | Eval: {len(eval_items)}")
    return train_ds, eval_ds

# -----------------------------
# Training args
# -----------------------------
def setup_training_args():
    return UnslothTrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=False,

        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,

        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,

        optim="adamw_8bit",
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        tf32=True if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else False,

        save_strategy="steps",
        save_steps=50,
        save_total_limit=5,

        eval_strategy="steps",
        eval_steps=200,
        eval_accumulation_steps=2,

        logging_dir=f"{OUTPUT_DIR}/logs",
        logging_strategy="steps",
        logging_steps=20,

        dataloader_pin_memory=False,
        dataloader_num_workers=0,
        remove_unused_columns=False,

        max_grad_norm=1.0,
        seed=42,
        data_seed=42,

        report_to=[],
    )

# -----------------------------
# Main
# -----------------------------
def main():
    print("🎯 Stage B: Thinking fine-tune on GRPO code model")
    print("="*60)
    resume_ckpt = find_latest_checkpoint()
    if resume_ckpt:
        print(f"🔁 Resuming from {resume_ckpt}")
    else:
        print("🆕 Starting fresh Stage B")

    model, tokenizer         = setup_code_model()
    train_ds, eval_ds        = load_and_format_data()
    training_args            = setup_training_args()

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        max_seq_length=MAX_SEQ_LENGTH,
        dataset_text_field="text",
        packing=False,   # keep <think> spans intact
        args=training_args,
    )

    print("🚀 Training...")
    if resume_ckpt:
        trainer.train(resume_from_checkpoint=resume_ckpt)
    else:
        trainer.train()

    # Save final
    final_dir = f"{OUTPUT_DIR}/final_model"
    print(f"\n💾 Saving final LoRA to {final_dir}")
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print("✅ Done")




🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-19 15:10:44 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n⏸️ Interrupted — checkpoints saved")
    except Exception as e:
        print(f"\n❌ Error: {e}")

🎯 Stage B: Thinking fine-tune on GRPO code model
🆕 Starting fresh Stage B
🧠 Loading base model...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
🔗 Loading Stage A LoRA from /home/aurduinonucleo/gemma-grpo-project/llama-3.2-3b-code-stage-a-vllm/checkpoint-312
✅ Model + LoRA ready
📚 Loading dataset...
✅ Training: 4750 | Eval: 250


Unsloth: Tokenizing ["text"] (num_proc=20):   0%|          | 0/4750 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=20):   0%|          | 0/250 [00:00<?, ? examples/s]

🚀 Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,750 | Num Epochs = 1 | Total steps = 594
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
200,1.2173,1.183259
400,1.1248,1.110081


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient



💾 Saving final LoRA to /home/aurduinonucleo/gemma-grpo-project/Section-B/Elite-Code-Thinking/final_model
✅ Done


In [7]:
# PRINT-ONLY EVALUATION: Base vs Stage-A (GRPO) vs Stage-B (Thinking)
# - No code execution, only prints raw model outputs
# - Sequential loading for RTX 3060 VRAM safety
# - max_new_tokens set to 1024 as requested

import torch, time
from unsloth import FastLanguageModel
from peft import PeftModel

# Paths
BASE_MODEL   = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
LORA_STAGE_A = "llama-3.2-3b-code-stage-a-vllm/checkpoint-312"
LORA_STAGE_B = "Section-B/Elite-Code-Thinking/final_model"

# Generation controls
MAX_SEQ_LENGTH = 1408
MAX_NEW_TOKENS = 1024
TEMPERATURE    = 0.0     # deterministic for fair comparison
DO_SAMPLE      = False   # deterministic
TOP_P          = 0.9     # ignored when do_sample=False
REP_PENALTY    = 1.10

# Strong code-only system prompt (still prints raw output verbatim)
SYSTEM_PROMPT = (
    "You are a Python code generator. Output ONLY the function body (not the def line). "
    "Start the first line with 4 spaces. Do NOT include explanations, comments, or markdown fences. "
    "If you include <think>, place it before the code, but the final output must remain only the function body."
)

# Tasks to probe (signatures only; no execution)
TASKS = [
    {"name":"Add Two Numbers",           "sig":"def add(a: int, b: int) -> int"},
    {"name":"String Reverse",            "sig":"def reverse(s: str) -> str"},
    {"name":"Is Even",                   "sig":"def is_even(n: int) -> bool"},
    {"name":"Count Vowels",              "sig":"def count_vowels(s: str) -> int"},
    {"name":"Fibonacci (first n)",       "sig":"def fibonacci(n: int) -> list"},
    {"name":"Binary Search",             "sig":"def binary_search(arr: list, target: int) -> int"},
    {"name":"Is Prime",                  "sig":"def is_prime(n: int) -> bool"},
    {"name":"GCD",                       "sig":"def gcd(a: int, b: int) -> int"},
]

def load_model_and_tok(lora_path=None):
    model, tok = FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    if lora_path:
        model = PeftModel.from_pretrained(model, lora_path)
    FastLanguageModel.for_inference(model)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    return model, tok

def generate_raw(model, tok, signature):
    prompt = f"{SYSTEM_PROMPT}\n\n{signature}:\n    "
    inputs = tok(prompt, return_tensors="pt").to("cuda")
    t0 = time.time()
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=DO_SAMPLE,
            top_p=TOP_P,
            repetition_penalty=REP_PENALTY,
        )
    dt = time.time() - t0
    full = tok.decode(out[0], skip_special_tokens=True)
    raw = full[len(prompt):]
    return raw, dt

def run_and_print(label, lora_path=None):
    print("\n" + "="*80)
    print(f"=== {label} ===")
    print("="*80)
    model, tok = load_model_and_tok(lora_path)
    for t in TASKS:
        raw, dt = generate_raw(model, tok, t["sig"])
        print(f"\n--- Task: {t['name']} ---")
        print(f"Signature: {t['sig']}")
        print(f"Gen time: {dt:.2f}s")
        print("Output (verbatim):")
        print(raw.rstrip("\n"))
    # Cleanup VRAM
    del model, tok
    torch.cuda.empty_cache()

print("🔧 Running print-only benchmarks (no execution), max_new_tokens=1024...")
run_and_print("BASE", None)
run_and_print("STAGE-A (GRPO)", LORA_STAGE_A)
run_and_print("STAGE-B (Thinking)", LORA_STAGE_B)
print("\n✅ Completed print-only generation for manual review.")


🔧 Running print-only benchmarks (no execution), max_new_tokens=1024...

=== BASE ===
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

--- Task: Add Two Numbers ---
Signature: def add(a: int, b: int) -> int
Gen time: 26.07s
Output (verbatim):
 ```python
    return a + b
``` 
<think> 
def multiply(a: int, b: int) -> int:
    ```python
    result = a * b
    return result
``` 
<think> 
def greet(name: str) -> None:
    ```python
    print(f"Hello, {name}!")
``` 
<think> 
def calculate_area(length: float, width: float) -> float:
    ```python
    area = length 

In [11]:
# CELL: General Chat with Stage-B (Thinking) Model
import torch
from unsloth import FastLanguageModel
from peft import PeftModel

# --- CONFIG ---
BASE_MODEL   = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
LORA_STAGE_B = "Section-B/Elite-Code-Thinking/final_model"

print("🧠 Loading Stage-B (Thinking) model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model = PeftModel.from_pretrained(model, LORA_STAGE_B)
FastLanguageModel.for_inference(model)
tokenizer.pad_token = tokenizer.eos_token

# --- CHAT LOOP ---
print("\n💬 Chat with your Stage-B Thinking Model!")
print("Type 'quit' to exit. Ask anything — coding, reasoning, or general chat.\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in {"quit", "exit", ""}:
        print("👋 Goodbye!")
        break

    # Use Llama-3.2-Instruct chat format
    messages = [
        {"role": "system", "content": (
            "You are a helpful, intelligent assistant. "
            "When solving problems, reason step-by-step inside <think>...</think> tags if needed. "
            "Be concise. Only use <think> when deep reasoning is required. "
            "For code, output clean, efficient solutions. For general questions, be clear and direct."
        )},
        {"role": "user", "content": user_input}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"\n🤖 Assistant:\n{response.strip()}\n")

🧠 Loading Stage-B (Thinking) model...
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

💬 Chat with your Stage-B Thinking Model!
Type 'quit' to exit. Ask anything — coding, reasoning, or general chat.


🤖 Assistant:
Hello! How can I assist you today?


🤖 Assistant:
<think>Okay, let's figure out the fraction of the pizza that remains after someone eats 3 slices.

The whole pizza has 8 slices. If someone eats 3, then subtract those from the total number of slices to find how many remain.

So, 8 (total) - 3 (eaten) = 5

Now, divide the remaining slices by the t

In [1]:
# CELL: Merge Base Model with Final Stage-B Adapter (Improved Precision)
import torch
from unsloth import FastLanguageModel
from peft import PeftModel

# --- CONFIG ---
BASE_MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
STAGE_B_ADAPTER_PATH = "Section-B/Elite-Code-Thinking/final_model"
OUTPUT_MERGED_MODEL_DIR = "final_merged_thinking_model"

print("🔧 Loading base model (4-bit for memory efficiency)...")
# Load the base model in 4-bit to save memory initially
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=2048,
    dtype=None, # Let Unsloth decide the default dtype for the base model's computation
    load_in_4bit=True, # This is the quantized version of the base model
)

print("🔗 Loading Stage-B LoRA adapter onto the base model...")
# Attach the Stage-B adapter to the loaded base model
model = PeftModel.from_pretrained(base_model, STAGE_B_ADAPTER_PATH)

print("🔄 Preparing for merge: Setting computation dtype...")
# Before merging, ensure the model's computation dtype is set appropriately
# bfloat16 is often preferred for its good balance of range and precision
model = model.to(dtype=torch.bfloat16) # Or torch.float16 if bfloat16 is unavailable

print("🧩 Merging LoRA adapter into base model weights (in higher precision)...")
# This step applies the LoRA weights to the quantized base weights in higher precision before saving
model = model.merge_and_unload(progressbar=True) # Use progress bar for potentially long merge

print("💾 Saving the fully merged model (weights will be in the dtype used for merging)...")
# Save the merged model. The weights will reflect the dtype used during the merge.
# This directory contains the final, unified model.
model.save_pretrained(OUTPUT_MERGED_MODEL_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_MERGED_MODEL_DIR)

print(f"✅ Success! Merged model saved to: {OUTPUT_MERGED_MODEL_DIR}")
print("The model now contains all improvements from both Stage-A (GRPO) and Stage-B (Thinking).")
print("The merge was performed in higher precision (bfloat16) to minimize rounding errors.")
print("You can load this directory directly with `from_pretrained` without needing PEFT/LoRA.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-21 18:51:11 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
🔧 Loading base model (4-bit for memory efficiency)...
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
🔗 Loading Stage-B LoRA adapter onto the base model...
🔄 Preparing for merge: Setting computation dtype...
🧩 Merging LoRA adapter into base model weights (in higher precision)...


Unloading and merging model: 100%|██████████| 567/567 [00:04<00:00, 140.45it/s]


💾 Saving the fully merged model (weights will be in the dtype used for merging)...
✅ Success! Merged model saved to: /home/aurduinonucleo/gemma-grpo-project/final_merged_thinking_model
The model now contains all improvements from both Stage-A (GRPO) and Stage-B (Thinking).
The merge was performed in higher precision (bfloat16) to minimize rounding errors.
You can load this directory directly with `from_pretrained` without needing PEFT/LoRA.


In [4]:
# CELL: Benchmark - Stage-B (Thinking) Model Alone (LoRA Applied)
import torch, time
from unsloth import FastLanguageModel
from peft import PeftModel

# Paths
BASE_MODEL   = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
LORA_STAGE_B = "Section-B/Elite-Code-Thinking/final_model"

# Generation controls
MAX_SEQ_LENGTH = 1408
MAX_NEW_TOKENS = 1024
TEMPERATURE    = 0.0     # deterministic for fair comparison
DO_SAMPLE      = False   # deterministic
TOP_P          = 0.9     # ignored when do_sample=False
REP_PENALTY    = 1.10

# Strong code-only system prompt (still prints raw output verbatim)
SYSTEM_PROMPT = (
    "You are a Python code generator. Output ONLY the function body (not the def line). "
    "Start the first line with 4 spaces. Do NOT include explanations, comments, or markdown fences. "
    "If you include <think>, place it before the code, but the final output must remain only the function body."
)

# Tasks to probe (signatures only; no execution)
TASKS = [
    {"name":"Add Two Numbers",           "sig":"def add(a: int, b: int) -> int"},
    {"name":"String Reverse",            "sig":"def reverse(s: str) -> str"},
    {"name":"Is Even",                   "sig":"def is_even(n: int) -> bool"},
    {"name":"Count Vowels",              "sig":"def count_vowels(s: str) -> int"},
    {"name":"Fibonacci (first n)",       "sig":"def fibonacci(n: int) -> list"},
    {"name":"Binary Search",             "sig":"def binary_search(arr: list, target: int) -> int"},
    {"name":"Is Prime",                  "sig":"def is_prime(n: int) -> bool"},
    {"name":"GCD",                       "sig":"def gcd(a: int, b: int) -> int"},
]

def load_model_and_tok():
    model, tok = FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    # Apply Stage-B LoRA adapter
    model = PeftModel.from_pretrained(model, LORA_STAGE_B)
    FastLanguageModel.for_inference(model)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    return model, tok

def generate_raw(model, tok, signature):
    prompt = f"{SYSTEM_PROMPT}\n\n{signature}:\n    "
    inputs = tok(prompt, return_tensors="pt").to("cuda")
    t0 = time.time()
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=DO_SAMPLE,
            top_p=TOP_P,
            repetition_penalty=REP_PENALTY,
        )
    dt = time.time() - t0
    full = tok.decode(out[0], skip_special_tokens=True)
    raw = full[len(prompt):]
    return raw, dt

def run_and_print_stage_b_only():
    print("\n" + "="*80)
    print("=== STAGE-B (Thinking) Model (LoRA Applied) ===")
    print("="*80)
    model, tok = load_model_and_tok()
    for t in TASKS:
        raw, dt = generate_raw(model, tok, t["sig"])
        print(f"\n--- Task: {t['name']} ---")
        print(f"Signature: {t['sig']}")
        print(f"Gen time: {dt:.2f}s")
        print("Output (verbatim):")
        print(raw.rstrip("\n"))
    # Cleanup VRAM
    del model, tok
    torch.cuda.empty_cache()

print("🔧 Running benchmark for Stage-B (Thinking) model only (LoRA applied)...")
run_and_print_stage_b_only()
print("\n✅ Completed print-only generation for Stage-B review.")

🔧 Running benchmark for Stage-B (Thinking) model only (LoRA applied)...

=== STAGE-B (Thinking) Model (LoRA Applied) ===
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

--- Task: Add Two Numbers ---
Signature: def add(a: int, b: int) -> int
Gen time: 33.86s
Output (verbatim):
 return a + b
```python
def add(a: int, b: int) -> int:
    return a + b
``` 

Note: The input parameters `a` and `b` are integers. The function returns their sum. The example usage might be something like `result = add(3, 5)` which would evaluate to `8`. Be sure to test your code tho