In [1]:
%pip install -q -U unsloth "datasets<4.0.0" trl


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.2/389.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported

max_seq_length = 4096

# AUTO choose QLoRA only on small GPUs
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else ""
USE_4BIT = not any(x in gpu_name for x in ["H100", "A100"])   # True on T4/L4, False on H100/A100

print("GPU:", gpu_name)
print("USE_4BIT:", USE_4BIT)
print("bf16 supported:", is_bfloat16_supported())

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = "unsloth/Qwen2.5-Coder-1.5B-Instruct",
    max_seq_length   = max_seq_length,
    dtype            = torch.bfloat16 if is_bfloat16_supported() else None,
    load_in_4bit     = USE_4BIT,
)

# IMPORTANT for stable training/inference
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    # Prefer real padding token if it exists (Qwen usually has <|fim_pad|>)
    if "<|fim_pad|>" in tokenizer.get_vocab():
        tokenizer.pad_token = "<|fim_pad|>"
    else:
        tokenizer.pad_token = tokenizer.eos_token


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
GPU: NVIDIA A100-SXM4-40GB
USE_4BIT: False
bf16 supported: True
==((====))==  Unsloth 2026.1.3: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = [
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj"
    ],
    lora_alpha = 32,
    lora_dropout = 0.05,    # small dropout helps generalization
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = True,      # usually improves stability vs base
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.3 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import numpy as np
import re

SEED = 3407
MAX_LOG_CHARS = 6000

# ---------------------------
# 1) Load ALL splits then combine, so we have more data to train on
# ---------------------------
raw = load_dataset("alexjercan/bugnet")
dataset = concatenate_datasets([raw["train"], raw["validation"], raw["test"]])

# ---------------------------
# 2) Basic filters (keep enough rows, but remove obvious junk)
# ---------------------------
dataset = dataset.filter(lambda x: x.get("language") == "Python")
dataset = dataset.filter(lambda x: isinstance(x.get("fail"), str) and isinstance(x.get("pass"), str))
dataset = dataset.filter(lambda x: len(x["fail"].strip()) > 0 and len(x["pass"].strip()) > 0)

# Keep examples where we have something useful to condition on:
# prefer stderr, otherwise fall back to error; if neither, keep only if status indicates failure.
def has_useful_trace(x):
    stderr = x.get("stderr")
    err = x.get("error")
    status = (x.get("original_status") or "").strip()
    if isinstance(stderr, str) and len(stderr) > 20:
        return True
    if isinstance(err, str) and len(err) > 5:
        return True
    # keep non-accepted failures (e.g., TLE/MLE/WA) even without stderr
    return status != "" and status != "Accepted"

dataset = dataset.filter(has_useful_trace)

# Optional: if you ONLY want real Python tracebacks, uncomment this stricter filter.
# (This will reduce data a lot. Leave OFF if you want to learn TLE/MLE fixes too.)
# dataset = dataset.filter(lambda x: isinstance(x.get("stderr"), str) and "Traceback (most recent call last):" in x["stderr"])

# ---------------------------
# 3) Relaxed consistency filter (don't destroy dataset size)
# ---------------------------
def is_consistent(example: dict) -> bool:
    fail = example.get("fail") or ""
    stderr = example.get("stderr") or ""
    # look for indented code lines shown in traceback
    code_lines = [ln.strip() for ln in stderr.splitlines() if ln.startswith("    ")]
    if not code_lines:
        return True
    failing_line = code_lines[-1].strip()
    # don't be strict on tiny / generic lines
    if len(failing_line) < 10:
        return True
    return failing_line in fail

dataset = dataset.filter(is_consistent)

# Flatten indices after many filters (helps speed later)
dataset = dataset.flatten_indices()

# ---------------------------
# 4) Split by problem_id (prevents near-duplicate leakage)
#    Use larger val/test to make pass@1 stable.
# ---------------------------
pids = dataset.unique("problem_id")
rng = np.random.default_rng(SEED)
rng.shuffle(pids)

n = len(pids)

# 10% / 10% split by PID (min 100 each if possible)
n_test = min(max(100, int(0.10 * n)), max(1, n // 2))
n_val  = min(max(100, int(0.10 * n)), max(1, (n - n_test) // 2))

test_pids  = set(pids[:n_test])
val_pids   = set(pids[n_test:n_test + n_val])
train_pids = set(pids[n_test + n_val:])

def pid_in(pidset):
    return lambda x: x["problem_id"] in pidset

ds_raw = DatasetDict({
    "train": dataset.filter(pid_in(train_pids)),
    "validation": dataset.filter(pid_in(val_pids)),
    "test": dataset.filter(pid_in(test_pids)),
})

# Flatten again after filtering (keeps indices dense)
ds_raw = DatasetDict({
    k: v.flatten_indices() for k, v in ds_raw.items()
})

print("rows:", {k: len(v) for k, v in ds_raw.items()})
print("unique problem_id:", {k: len(v.unique('problem_id')) for k, v in ds_raw.items()})

# ---------------------------
# 5) Formatter (FULL PROGRAM OUTPUT training)
#    IMPORTANT: do NOT manually add <|im_end|> tokens; the chat template handles it.
# ---------------------------
def truncate_log(log: str, max_chars: int = MAX_LOG_CHARS) -> str:
    if log is None:
        return ""
    log = str(log)
    if len(log) <= max_chars:
        return log
    return "...(truncated)...\n" + log[-max_chars:]


def build_error_trace(ex: dict) -> str:
    stderr = (ex.get("stderr") or "").strip()
    err = (ex.get("error") or "").strip()
    status = (ex.get("original_status") or "").strip()
    stdout = (ex.get("stdout") or "").strip()

    trace = stderr if stderr else err
    if not trace:
        # for TLE/MLE etc, give the signal
        trace = f"No stack trace captured. original_status={status}. stdout={stdout}"
    return truncate_log(trace, MAX_LOG_CHARS)


def format_codesensei_prompts(example):
    buggy_code = example["fail"]
    fixed_code = example["pass"]
    trace = build_error_trace(example)

    user_content = (
        "Fix the following Python program using the error trace when available.\n"
        "Return ONLY the full fixed program (no markdown, no explanation, no extra tests).\n\n"
        "### BUGGY CODE:\n"
        f"{buggy_code}\n\n"
        "### ERROR TRACE:\n"
        f"{trace}\n"
    )

    messages = [
        {"role": "system", "content": "You are a helpful coding assistant that fixes Python code from stack traces."},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": fixed_code},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": text}

ds = ds_raw.map(format_codesensei_prompts)

# ---------------------------
# 6) Sanity check: verify assistant part is PASS code
# ---------------------------
sample = ds["train"][0]["text"]
assistant = sample.split("<|im_start|>assistant", 1)[-1].split("<|im_end|>", 1)[0].strip()

print(sample[:1800])
print("\n--- assistant starts with ---\n", assistant[:400])

# If this fails (assistant not equal to pass), your tokenizer chat template is wrong.
# This should start exactly like ds_raw["train"][0]["pass"].


README.md: 0.00B [00:00, ?B/s]

bugnet.py: 0.00B [00:00, ?B/s]

The repository for alexjercan/bugnet contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/alexjercan/bugnet.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Python_train.jsonl:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Python_validation.jsonl:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Python_test.jsonl:   0%|          | 0.00/87.9k [00:00<?, ?B/s]

problem_descriptions.json:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

problem_tests.json:   0%|          | 0.00/517k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2557 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1105 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3762 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3762 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3762 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3762 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3762 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/3124 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3124 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3124 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3124 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2211 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/490 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/423 [00:00<?, ? examples/s]

rows: {'train': 2211, 'validation': 490, 'test': 423}
unique problem_id: {'train': 457, 'validation': 100, 'test': 100}


Map:   0%|          | 0/2211 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

<|im_start|>system
You are a helpful coding assistant that fixes Python code from stack traces.<|im_end|>
<|im_start|>user
Fix the following Python program using the error trace when available.
Return ONLY the full fixed program (no markdown, no explanation, no extra tests).

### BUGGY CODE:
S = input()
T = input()

maxcount = 0
for i in range(len(S) - len(T) + 1):
    count = 0
    for j in range(len(T)):
        if S[i + j] == T[j]:
            count += 1
        maxcount = max(maxcount, count)
print(len(T) - maxcount)


### ERROR TRACE:
WA
<|im_end|>
<|im_start|>assistant
S = input()
T = input()

for i in range(len(S) - len(T)):
    count = 0
    maxcount = 0
    for j in range(len(T)):
        if S[i + j] == T[j]:
            count += 1
        maxcount = max(maxcount, count)
print(len(T) - maxcount)
<|im_end|>


--- assistant starts with ---
 S = input()
T = input()

for i in range(len(S) - len(T)):
    count = 0
    maxcount = 0
    for j in range(len(T)):
        if S[i + j] == 

In [5]:
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
from unsloth.chat_templates import train_on_responses_only

# ---- Completion-only collator (your version is fine) ----
def completion_only_collator(features):
    import torch

    batch = tokenizer.pad(
        features,
        padding=True,
        return_tensors="pt",
    )

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    labels = input_ids.clone()

    assistant_prefix = "<|im_start|>assistant\n"
    prefix_ids = tokenizer(assistant_prefix, add_special_tokens=False)["input_ids"]
    prefix_ids = torch.tensor(prefix_ids, device=input_ids.device)

    for i in range(input_ids.size(0)):
        seq = input_ids[i]
        start = -1
        for j in range(0, seq.size(0) - prefix_ids.size(0)):
            if torch.equal(seq[j:j+prefix_ids.size(0)], prefix_ids):
                start = j + prefix_ids.size(0)
                break
        if start == -1:
            labels[i, :] = -100
        else:
            labels[i, :start] = -100

    batch["labels"] = labels
    return batch

# H100 can handle bigger batches; auto-fallback works fine too
per_device_bs = 8 if ("H100" in gpu_name or "A100" in gpu_name) else 2
grad_accum    = 1 if ("H100" in gpu_name or "A100" in gpu_name) else 4

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds["train"].select_columns(["text"]),
    eval_dataset=ds["validation"].select_columns(["text"]),
    dataset_text_field="text",
    data_collator=completion_only_collator,
    args=SFTConfig(
        output_dir="outputs",
        max_seq_length=max_seq_length,
        packing=False,

        per_device_train_batch_size=per_device_bs,
        gradient_accumulation_steps=grad_accum,

        # IMPORTANT: smaller LR improves "don’t get worse than base"
        learning_rate=1e-4,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,

        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),

        logging_steps=10,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        num_train_epochs=3,
        seed=SEED,
        optim="adamw_8bit" if USE_4BIT else "adamw_torch",
        max_grad_norm=1.0,
        remove_unused_columns=True,
    ),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Make trainer train ONLY on assistant responses
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)

trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/2211 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/490 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/2211 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/490 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,211 | Num Epochs = 3 | Total steps = 831
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
100,0.2455,0.304736
200,0.248,0.300811
300,0.1953,0.315039
400,0.178,0.313136


TrainOutput(global_step=400, training_loss=0.23588885843753815, metrics={'train_runtime': 309.4673, 'train_samples_per_second': 21.434, 'train_steps_per_second': 2.685, 'total_flos': 1.7062522772917248e+16, 'train_loss': 0.23588885843753815, 'epoch': 1.444043321299639})

In [6]:
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)

# IMPORTANT: test on a REAL dataset example (consistent fail/log/pass)
ex = ds_raw["test"][20]
buggy_code = ex["fail"]
error_log  = (ex.get("stderr") or ex.get("error") or "")
inp        = (ex.get("input") or "")

messages = [
    {"role": "system", "content": "Fix Python programs from stack traces. Output code only."},
    {"role": "user", "content":
        "Return ONLY the full fixed program.\n\n"
        f"### BUGGY CODE:\n{buggy_code}\n\n"
        f"### ERROR TRACE:\n{error_log}\n\n"
        f"### INPUT:\n{inp}\n"
    },
]

if tokenizer.pad_token is None and "<|fim_pad|>" in tokenizer.get_vocab():
    tokenizer.pad_token = "<|fim_pad|>"
tokenizer.padding_side = "left"

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

attention_mask = (input_ids != tokenizer.pad_token_id).long()

im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
eos_ids = [tokenizer.eos_token_id]
if im_end_id is not None and im_end_id not in eos_ids:
    eos_ids.append(im_end_id)

out = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=2048,   # IMPORTANT for full programs
    do_sample=False,
    temperature=0.0,
    eos_token_id=eos_ids,
    use_cache=True,
)

decoded = tokenizer.decode(out[0], skip_special_tokens=False)
assistant = decoded.split("<|im_start|>assistant", 1)[-1].split("<|im_end|>", 1)[0].strip()
print(assistant)


s = input()

if "RRR" in s:
    print(3)
elif "RR" in s:
    print(2)
elif "R" in s:
    print(1)
else:
    print(0)


In [7]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Create a folder in Drive
save_path = "/content/drive/My Drive/CodeSensei_Adapters"
if not os.path.exists(save_path):
    os.makedirs(save_path)

# 3. Save Adapters ONLY (Safe, low RAM)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Adapters saved to {save_path}")
print("🚨 NOW: Go to 'Runtime' > 'Disconnect and Delete Runtime'.")
print("Then, refresh the page and run the cell below (Cell 7).")

Mounted at /content/drive
✅ Adapters saved to /content/drive/My Drive/CodeSensei_Adapters
🚨 NOW: Go to 'Runtime' > 'Disconnect and Delete Runtime'.
Then, refresh the page and run the cell below (Cell 7).


In [3]:
# --- RUN THIS AFTER RESTARTING RUNTIME ---\
from unsloth import FastLanguageModel
from google.colab import drive, files
import os

# 1. Setup Environment again (Since we killed the runtime)
# try:
#    import unsloth
#except ImportError:
#    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#    !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# 2. Mount Drive
drive.mount('/content/drive')
adapter_path = "/content/drive/My Drive/CodeSensei_Adapters"

# 3. Load Model + Adapters (Fresh RAM!)
# Unsloth automatically finds the base model (Qwen) referenced in your adapter config
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = adapter_path,
    max_seq_length = 8192,
    dtype = None,
    load_in_4bit = True,
)

# 4. Convert to GGUF
# This will now work because RAM is empty
print("Converting to GGUF... this may take 5-10 minutes.")
model.save_pretrained_gguf(
    "model_gguf",
    tokenizer,
    quantization_method = "q4_k_m"
)

# 5. Download the File
files_in_folder = os.listdir("model_gguf")
gguf_file = [f for f in files_in_folder if f.endswith('.gguf')][0]
full_path = os.path.join("model_gguf", gguf_file)

print(f"✅ Download ready: {full_path}")
files.download(full_path)

Mounted at /content/drive
==((====))==  Unsloth 2026.1.3: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Unsloth 2026.1.3 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Converting to GGUF... this may take 5-10 minutes.
Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:08<00:00,  8.34s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:14<00:00, 14.91s/it]


Unsloth: Merge process complete. Saved to `/content/model_gguf`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['qwen2.5-coder-1.5b-instruct.BF16.gguf

IndexError: list index out of range

In [4]:
import os
from google.colab import files

print("🔍 Searching for your GGUF file...")

# 1. Search every folder in the environment for the file
found_file = None
for root, dirs, files_list in os.walk("/content"):
    for filename in files_list:
        if filename.endswith(".gguf"):
            found_file = os.path.join(root, filename)
            break
    if found_file:
        break

# 2. Trigger the Download
if found_file:
    print(f"✅ Found file at: {found_file}")
    print(f"📦 File size: {os.path.getsize(found_file) / 1024 / 1024 / 1024:.2f} GB")
    print("🚀 Starting download... (Check your browser's download bar)")
    files.download(found_file)
else:
    print("❌ Error: No .gguf file found. The conversion step might have failed.")

🔍 Searching for your GGUF file...
✅ Found file at: /content/qwen2.5-coder-1.5b-instruct.Q4_K_M.gguf
📦 File size: 0.92 GB
🚀 Starting download... (Check your browser's download bar)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>