In [1]:
from load_llms import run_comparison, quick_test

In [2]:
#!/usr/bin/env python3
"""
Pull all 4-bit models from Hugging Face org `unsloth`.

Requirements:
  pip install huggingface_hub unsloth accelerate transformers bitsandbytes tqdm

Notes:
- "4-bit" models are matched by common naming/tag patterns (e.g., 'bnb-4bit', 'gptq-4bit', '-4bit').
- By default we only *download* the models to the local HF cache.
- Set VERIFY_WITH_UNSLOTH=True to do a lightweight CPU-only load check via Unsloth.
"""

from huggingface_hub import list_models, snapshot_download, HfApi
from tqdm import tqdm
import os
import re

# Optional: set your HF token via env var for private models
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Discovery parameters
ORG = "unsloth"               # HF organization
LIB_TAGS = {"unsloth"}        # prefer models tagged with library=unsloth (best-effort)
FOURBIT_PATTERNS = [
    r"\bbnb[-_]4bit\b",
    r"\bgptq[-_ ]?4bit\b",
    r"\b4bit\b",
    r"-4bit\b",
]
EXCLUDE_PATTERNS = [
    r"\b8bit\b",              # avoid accidental 8-bit matches
    r"gguf",                  # often quantized for llama.cpp, not Unsloth/Transformers
]

# Download parameters
REVISION = None               # pin a revision/commit if you need reproducibility
LOCAL_DIR = None              # None -> default HF cache; else path to store snapshots
ALLOW_PATTERNS = None         # e.g., ["*.safetensors", "config.json"] to limit files
IGNORE_PATTERNS = None

# Optional verification: try loading configs with Unsloth (CPU only to avoid GPU OOM)
VERIFY_WITH_UNSLOTH = False   # set True if you want to sanity-check loading on CPU

def is_4bit_model(card) -> bool:
    mid = card.modelId.lower()
    # quick exclude
    if any(re.search(p, mid) for p in EXCLUDE_PATTERNS):
        return False

    # positive matches in id
    if any(re.search(p, mid) for p in FOURBIT_PATTERNS):
        return True

    # also check tags/fields if available
    tags = set((card.tags or [])) | set((card.library_name or '').split())
    text = " ".join([mid] + list(tags))
    if any(re.search(p, text) for p in FOURBIT_PATTERNS) and not any(
        re.search(p, text) for p in EXCLUDE_PATTERNS
    ):
        return True

    return False

def prefer_unsloth_tag(card) -> bool:
    tags = set(card.tags or [])
    return len(LIB_TAGS & {t.lower() for t in tags}) > 0 or (card.library_name or "").lower() == "unsloth"

def discover_unsloth_4bit_models():
    api = HfApi()
    # Broad search under the org; we‚Äôll locally filter for 4-bit
    candidates = list_models(author=ORG, fetch_config=True, token=HF_TOKEN)
    fourbit = [c for c in candidates if is_4bit_model(c)]

    # Prefer models explicitly tagged with unsloth first
    fourbit.sort(key=lambda c: (not prefer_unsloth_tag(c), c.modelId.lower()))
    return fourbit

def pull_models(models):
    downloaded = []
    for card in tqdm(models, desc="Downloading 4-bit models"):
        try:
            path = snapshot_download(
                repo_id=card.modelId,
                revision=REVISION,
                local_dir=LOCAL_DIR,
                allow_patterns=ALLOW_PATTERNS,
                ignore_patterns=IGNORE_PATTERNS,
                token=HF_TOKEN,
                local_files_only=False,
                resume_download=True,
            )
            downloaded.append((card.modelId, path))
        except Exception as e:
            print(f"[WARN] Failed to download {card.modelId}: {e}")
    return downloaded

def verify_with_unsloth(models):
    # CPU-only check to avoid GPU usage
    import torch
    from unsloth import FastLanguageModel

    torch.set_default_device("cpu")

    verified = []
    for model_id, _ in tqdm(models, desc="Verifying with Unsloth (CPU)"):
        try:
            _m, _t = FastLanguageModel.from_pretrained(
                model_name=model_id,
                max_seq_length=8,    # tiny for a smoke test
                load_in_4bit=True,   # ensure 4-bit path
                dtype=None,          # let Unsloth decide
                device_map={"": "cpu"},
            )
            verified.append(model_id)
        except Exception as e:
            print(f"[WARN] Unsloth load failed for {model_id}: {e}")
    return verified

def main():
    models = discover_unsloth_4bit_models()
    if not models:
        print("No 4-bit models found under the 'unsloth' org with the current filters.")
        return

    print("Discovered 4-bit models:")
    for c in models:
        print(" -", c.modelId)

    # downloads = pull_models(models)
    # print("\nDownloaded:")
    # for mid, path in downloads:
    #     print(f" - {mid} -> {path}")

    # if VERIFY_WITH_UNSLOTH and downloads:
    #     ok = verify_with_unsloth(downloads)
    #     print("\nVerified with Unsloth (CPU):")
    #     for mid in ok:
    #         print(" -", mid)

if __name__ == "__main__":
    main()

Discovered 4-bit models:
 - unsloth/c4ai-command-r-08-2024-bnb-4bit
 - unsloth/c4ai-command-r-plus-08-2024-bnb-4bit
 - unsloth/codegemma-2b-bnb-4bit
 - unsloth/codegemma-7b-bnb-4bit
 - unsloth/codegemma-7b-it-bnb-4bit
 - unsloth/codellama-13b-bnb-4bit
 - unsloth/codellama-34b-bnb-4bit
 - unsloth/codellama-7b-bnb-4bit
 - unsloth/Cosmos-Reason1-7B-bnb-4bit
 - unsloth/Cosmos-Reason1-7B-unsloth-bnb-4bit
 - unsloth/DeepScaleR-1.5B-Preview-unsloth-bnb-4bit
 - unsloth/DeepSeek-Prover-V2-7B-bnb-4bit
 - unsloth/DeepSeek-Prover-V2-7B-unsloth-bnb-4bit
 - unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit
 - unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Llama-70B-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Llama-70B-unsloth-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Llama-8B-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit
 - unsloth/DeepSeek-R1-Distill-

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from typing import Optional
from huggingface_hub import hf_hub_download, HfApi
# ---- Compat shim for HfHubHTTPError across hub versions ----
try:
    from huggingface_hub.errors import HfHubHTTPError  # modern versions
except Exception:
    try:
        from huggingface_hub.utils import HfHubHTTPError  # older versions
    except Exception:
        class HfHubHTTPError(Exception):
            pass
# ------------------------------------------------------------

import json, os
from pathlib import Path
from tqdm import tqdm
import pandas as pd

REPOS = [
    "unsloth/c4ai-command-r-08-2024-bnb-4bit",
    "unsloth/c4ai-command-r-plus-08-2024-bnb-4bit",
    "unsloth/codegemma-2b-bnb-4bit",
    "unsloth/codegemma-7b-bnb-4bit",
    "unsloth/codegemma-7b-it-bnb-4bit",
    "unsloth/codellama-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/codellama-7b-bnb-4bit",
    "unsloth/Cosmos-Reason1-7B-bnb-4bit",
    "unsloth/Cosmos-Reason1-7B-unsloth-bnb-4bit",
    "unsloth/DeepScaleR-1.5B-Preview-unsloth-bnb-4bit",
    "unsloth/DeepSeek-Prover-V2-7B-bnb-4bit",
    "unsloth/DeepSeek-Prover-V2-7B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit",
    "unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Llama-70B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Llama-70B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Llama-8B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-32B-unsloth-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-7B-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit",
    "unsloth/Devstral-Small-2505-unsloth-bnb-4bit",
    "unsloth/gemma-1.1-2b-it-bnb-4bit",
    "unsloth/gemma-1.1-7b-it-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    "unsloth/gemma-2-27b-it-bnb-4bit",
    "unsloth/gemma-2-2b-bnb-4bit",
    "unsloth/gemma-2-2b-it-bnb-4bit",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-9b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/gemma-3-12b-it-bnb-4bit",
    "unsloth/gemma-3-12b-it-qat-bnb-4bit",
    "unsloth/gemma-3-12b-it-qat-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-pt-bnb-4bit",
    "unsloth/gemma-3-12b-pt-unsloth-bnb-4bit",
    "unsloth/gemma-3-1b-it-bnb-4bit",
    "unsloth/gemma-3-1b-it-qat-bnb-4bit",
    "unsloth/gemma-3-1b-it-qat-unsloth-bnb-4bit",
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-1b-pt-bnb-4bit",
    "unsloth/gemma-3-1b-pt-unsloth-bnb-4bit",
    "unsloth/gemma-3-270m-bnb-4bit",
    "unsloth/gemma-3-270m-it-bnb-4bit",
    "unsloth/gemma-3-270m-it-qat-bnb-4bit",
    "unsloth/gemma-3-270m-it-qat-unsloth-bnb-4bit",
    "unsloth/gemma-3-270m-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-270m-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-bnb-4bit",
    "unsloth/gemma-3-27b-it-qat-bnb-4bit",
    "unsloth/gemma-3-27b-it-qat-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-pt-bnb-4bit",
    "unsloth/gemma-3-27b-pt-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-bnb-4bit",
    "unsloth/gemma-3-4b-it-qat-bnb-4bit",
    "unsloth/gemma-3-4b-it-qat-int4-bnb-4bit",
    "unsloth/gemma-3-4b-it-qat-int4-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-pt-bnb-4bit",
    "unsloth/gemma-3-4b-pt-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E4B-unsloth-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/GLM-4-32B-0414-bnb-4bit",
    "unsloth/GLM-4-32B-0414-unsloth-bnb-4bit",
    "unsloth/GLM-4-9B-0414-bnb-4bit",
    "unsloth/GLM-4-9B-0414-unsloth-bnb-4bit",
    "unsloth/GLM-Z1-32B-0414-bnb-4bit",
    "unsloth/GLM-Z1-9B-0414-bnb-4bit",
    "unsloth/GLM-Z1-9B-0414-unsloth-bnb-4bit",
    "unsloth/gpt-oss-120b-bnb-4bit",
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b-bnb-4bit",
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
    "unsloth/Hermes-2-Pro-Mistral-7B-bnb-4bit",
    "unsloth/Hermes-3-Llama-3.1-70B-bnb-4bit",
    "unsloth/Hermes-3-Llama-3.1-8B-bnb-4bit",
    "unsloth/KernelLLM-bnb-4bit",
    "unsloth/KernelLLM-unsloth-bnb-4bit",
    "unsloth/LFM2-1.2B-unsloth-bnb-4bit",
    "unsloth/LFM2-350M-unsloth-bnb-4bit",
    "unsloth/LFM2-700M-unsloth-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-7b-chat-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/llama-3-70b-Instruct-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/Llama-3.1-8B-bnb-4bit",
    "unsloth/Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Llama-3.1-8B-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-3.1-8B-unsloth-bnb-4bit",
    "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    "unsloth/Llama-3.1-Nemotron-Nano-4B-v1.1-bnb-4bit",
    "unsloth/Llama-3.1-Nemotron-Nano-4B-v1.1-unsloth-bnb-4bit",
    "unsloth/Llama-3.1-Storm-8B-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-1B-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-3B-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/Magistral-Small-2506-bnb-4bit",
    "unsloth/Magistral-Small-2506-unsloth-bnb-4bit",
    "unsloth/Magistral-Small-2509-bnb-4bit",
    "unsloth/Magistral-Small-2509-unsloth-bnb-4bit",
    "unsloth/medgemma-27b-text-it-unsloth-bnb-4bit",
    "unsloth/medgemma-4b-it-bnb-4bit",
    "unsloth/medgemma-4b-it-unsloth-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-unsloth-bnb-4bit",
    "unsloth/meta-Llama-3.1-8B-unsloth-bnb-4bit",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.1-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/mistral-7b-v0.2-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/Mistral-Large-Instruct-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/Mistral-Small-24B-Base-2501-bnb-4bit",
    "unsloth/Mistral-Small-24B-Base-2501-unsloth-bnb-4bit",
    "unsloth/Mistral-Small-24B-Instruct-2501-bnb-4bit",
    "unsloth/Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409-bnb-4bit",
    "unsloth/OLMo-2-0425-1B-Instruct-bnb-4bit",
    "unsloth/OLMo-2-0425-1B-Instruct-unsloth-bnb-4bit",
    "unsloth/OpenHermes-2.5-Mistral-7B-bnb-4bit",
    "unsloth/Phi-3-medium-4k-instruct-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct-v0-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct-bnb-4bit",
    "unsloth/phi-4-bnb-4bit",
    "unsloth/Phi-4-mini-instruct-bnb-4bit",
    "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit",
    "unsloth/Phi-4-mini-reasoning-bnb-4bit",
    "unsloth/Phi-4-mini-reasoning-unsloth-bnb-4bit",
    "unsloth/phi-4-reasoning-bnb-4bit",
    "unsloth/Phi-4-reasoning-plus-bnb-4bit",
    "unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit",
    "unsloth/phi-4-reasoning-unsloth-bnb-4bit",
    "unsloth/phi-4-unsloth-bnb-4bit",
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    "unsloth/Pixtral-12B-2409-unsloth-bnb-4bit",
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",
    "unsloth/Qwen2-0.5B-bnb-4bit",
    "unsloth/Qwen2-0.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2-1.5B-bnb-4bit",
    "unsloth/Qwen2-1.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2-72B-bnb-4bit",
    "unsloth/Qwen2-72B-Instruct-bnb-4bit",
    "unsloth/Qwen2-7B-bnb-4bit",
    "unsloth/Qwen2-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2-VL-72B-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-0.5B-bnb-4bit",
    "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-0.5B-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-1.5B-bnb-4bit",
    "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-1.5B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-1.5B-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-14B-bnb-4bit",
    "unsloth/Qwen2.5-14B-Instruct-1M-bnb-4bit",
    "unsloth/Qwen2.5-14B-Instruct-1M-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-14B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-14B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-14B-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-32B-bnb-4bit",
    "unsloth/Qwen2.5-32B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-3B-bnb-4bit",
    "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-3B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-3B-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-72B-bnb-4bit",
    "unsloth/Qwen2.5-72B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-7B-bnb-4bit",
    "unsloth/Qwen2.5-7B-Instruct-1M-bnb-4bit",
    "unsloth/Qwen2.5-7B-Instruct-1M-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-7B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-7B-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-Coder-0.5B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Coder-1.5B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Coder-14B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Coder-32B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-32B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Coder-3B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Coder-7B-bnb-4bit",
    "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Math-1.5B-bnb-4bit",
    "unsloth/Qwen2.5-Math-1.5B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Math-72B-bnb-4bit",
    "unsloth/Qwen2.5-Math-72B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-Math-7B-bnb-4bit",
    "unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-32B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-32B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-3B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-72B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit",
    "unsloth/Qwen3-0.6B-Base-bnb-4bit",
    "unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-0.6B-bnb-4bit",
    "unsloth/Qwen3-0.6B-unsloth-bnb-4bit",
    "unsloth/Qwen3-1.7B-Base-bnb-4bit",
    "unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-1.7B-bnb-4bit",
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-Base-bnb-4bit",
    "unsloth/Qwen3-14B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-30B-A3B-Base-bnb-4bit",
    "unsloth/Qwen3-30B-A3B-bnb-4bit",
    "unsloth/Qwen3-32B-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-Base-bnb-4bit",
    "unsloth/Qwen3-4B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-bnb-4bit",
    "unsloth/Qwen3-4B-Instruct-2507-bnb-4bit",
    "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-Thinking-2507-bnb-4bit",
    "unsloth/Qwen3-4B-Thinking-2507-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-Base-bnb-4bit",
    "unsloth/Qwen3-8B-Base-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit",
    "unsloth/r1-1776-distill-llama-70b-unsloth-bnb-4bit",
    "unsloth/Reflection-Llama-3.1-70B-bnb-4bit",
    "unsloth/Seed-Coder-8B-Instruct-bnb-4bit",
    "unsloth/Seed-Coder-8B-Instruct-unsloth-bnb-4bit",
    "unsloth/Seed-Coder-8B-Reasoning-bnb-4bit",
    "unsloth/Seed-Coder-8B-Reasoning-unsloth-bnb-4bit",
    "unsloth/SmolLM-1.7B-bnb-4bit",
    "unsloth/SmolLM-1.7B-Instruct-bnb-4bit",
    "unsloth/SmolLM-135M-bnb-4bit",
    "unsloth/SmolLM-135M-Instruct-bnb-4bit",
    "unsloth/SmolLM-360M-bnb-4bit",
    "unsloth/SmolLM-360M-Instruct-bnb-4bit",
    "unsloth/SmolLM2-1.7B-bnb-4bit",
    "unsloth/SmolLM2-1.7B-Instruct-bnb-4bit",
    "unsloth/SmolLM2-135M-bnb-4bit",
    "unsloth/SmolLM2-135M-Instruct-bnb-4bit",
    "unsloth/SmolLM2-360M-bnb-4bit",
    "unsloth/SmolLM2-360M-Instruct-bnb-4bit",
    "unsloth/SmolLM3-3B-Base-bnb-4bit",
    "unsloth/SmolLM3-3B-Base-unsloth-bnb-4bit",
    "unsloth/SmolLM3-3B-bnb-4bit",
    "unsloth/SmolLM3-3B-unsloth-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/tinyllama-chat-bnb-4bit",
    "unsloth/yi-34b-bnb-4bit",
    "unsloth/yi-34b-chat-bnb-4bit",
    "unsloth/yi-6b-bnb-4bit",
    "unsloth/zephyr-sft-bnb-4bit",
    "unsloth/aya-vision-32b-bnb-4bit",
    "unsloth/aya-vision-32b-unsloth-bnb-4bit",
    "unsloth/aya-vision-8b-bnb-4bit",
    "unsloth/aya-vision-8b-unsloth-bnb-4bit",
    "unsloth/c4ai-command-a-03-2025-bnb-4bit",
    "unsloth/c4ai-command-a-03-2025-unsloth-bnb-4bit",
    "unsloth/Devstral-Small-2505-bnb-4bit",
    "unsloth/Devstral-Small-2507-bnb-4bit",
    "unsloth/Devstral-Small-2507-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-qat-int4-bnb-4bit",
    "unsloth/gemma-3-12b-it-qat-int4-unsloth-bnb-4bit",
    "unsloth/gemma-7b-unsloth-bnb-4bit",
    "unsloth/granite-3.2-2b-instruct-bnb-4bit",
    "unsloth/granite-3.2-2b-instruct-unsloth-bnb-4bit",
    "unsloth/granite-3.2-8b-instruct-bnb-4bit",
    "unsloth/granite-3.2-8b-instruct-unsloth-bnb-4bit",
    "unsloth/granite-vision-3.2-2b-bnb-4bit",
    "unsloth/granite-vision-3.2-2b-unsloth-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-unsloth-bnb-4bit",
    "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit",
    "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-dynamic-bnb-4bit",
    "unsloth/Llama-4-Scout-17B-16E-unsloth-bnb-4bit",
    "unsloth/Llama-4-Scout-17B-16E-unsloth-dynamic-bnb-4bit",
    "unsloth/Magistral-Small-2507-bnb-4bit",
    "unsloth/Magistral-Small-2507-unsloth-bnb-4bit",
    "unsloth/medgemma-27b-text-it-bnb-4bit",
    "unsloth/Mistral-Small-3.1-24B-Base-2503-bnb-4bit",
    "unsloth/Mistral-Small-3.1-24B-Base-2503-unsloth-bnb-4bit",
    "unsloth/Mistral-Small-3.1-24B-Instruct-2503-bnb-4bit",
    "unsloth/Mistral-Small-3.1-24B-Instruct-2503-unsloth-bnb-4bit",
    "unsloth/Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit",
    "unsloth/Mistral-Small-3.2-24B-Instruct-2506-unsloth-bnb-4bit",
    "unsloth/Mixtral-8x7B-Instruct-v0.1-bnb-4bit",
    "unsloth/Mixtral-8x7B-Instruct-v0.1-unsloth-bnb-4bit",
    "unsloth/Mixtral-8x7B-v0.1-bnb-4bit",
    "unsloth/Mixtral-8x7B-v0.1-unsloth-bnb-4bit",
    "unsloth/OLMo-2-0325-32B-Instruct-bnb-4bit",
    "unsloth/OLMo-2-0325-32B-Instruct-unsloth-bnb-4bit",
    "unsloth/orpheus-3b-0.1-ft-bnb-4bit",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit",
    "unsloth/orpheus-3b-0.1-pretrained-bnb-4bit",
    "unsloth/orpheus-3b-0.1-pretrained-unsloth-bnb-4bit",
    "unsloth/QVQ-72B-Preview-bnb-4bit",
    "unsloth/Qwen2-VL-2B-bnb-4bit",
    "unsloth/Qwen2.5-0.5-bnb-4bit",
    "unsloth/Qwen3-30B-A3B-128K-bnb-4bit",
    "unsloth/QwQ-32B-bnb-4bit",
    "unsloth/QwQ-32B-Preview-bnb-4bit",
    "unsloth/QwQ-32B-Preview-unsloth-bnb-4bit",
    "unsloth/QwQ-32B-unsloth-bnb-4bit",
    "unsloth/reka-flash-3-unsloth-bnb-4bit",
    "unsloth/Yi-1.5-6B-bnb-4bit",
]

HF_TOKEN = os.environ.get("HF_TOKEN", None)  # optional for gated/private models

def load_json(repo_id: str, filename: str):
    try:
        path = hf_hub_download(repo_id=repo_id, filename=filename, token=HF_TOKEN)
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except HfHubHTTPError:
        return None
    except Exception:
        return None

def effective_context_from_rope(cfg: dict) -> Optional[int]:
    rope = cfg.get("rope_scaling") or {}
    # {"type": "...", "factor": 8.0, "original_max_position_embeddings": 8192}
    try:
        orig = rope.get("original_max_position_embeddings")
        factor = rope.get("factor")
        if isinstance(orig, int) and (isinstance(factor, (int, float)) and factor > 0):
            return int(orig * factor)
    except Exception:
        pass
    return None

def guess_context_window(cfg: dict, tokcfg: Optional[dict]) -> Optional[int]:
    """
    Preference order:
      1) rope_scaling => original_max_position_embeddings * factor
      2) max_position_embeddings (or common synonyms)
      3) tokenizer.model_max_length (finite)
      4) block_size (GLM-like)
    """
    if not isinstance(cfg, dict):
        cfg = {}

    # 1) RoPE scaling effective length
    ctx = effective_context_from_rope(cfg)
    if isinstance(ctx, int) and ctx > 0:
        return ctx

    # 2) Direct config keys
    for k in [
        "max_position_embeddings",
        "n_positions",
        "seq_length",
        "max_sequence_length",
        "max_seq_len",
        "context_length",
        "model_max_length",  # sometimes mirrors tokenizer cap
    ]:
        v = cfg.get(k)
        if isinstance(v, int) and v > 0:
            return v

    # 3) Tokenizer fallback
    if tokcfg:
        mv = tokcfg.get("model_max_length")
        if isinstance(mv, int) and 0 < mv < 10**9:
            return mv

    # 4) Last-ditch
    v = cfg.get("block_size")
    if isinstance(v, int) and v > 0:
        return v

    return None

def tokenizer_max_len(tokcfg: Optional[dict]) -> Optional[int]:
    if not tokcfg:
        return None
    v = tokcfg.get("model_max_length")
    if isinstance(v, int) and 0 < v < 10**9:
        return v
    # Some tokenizers set a huge sentinel (e.g., 1e30); ignore those.
    return None

def analyze_repo(repo_id: str) -> dict:
    cfg = load_json(repo_id, "config.json") or {}
    tokcfg = load_json(repo_id, "tokenizer_config.json") or {}

    ctx = guess_context_window(cfg, tokcfg)
    tok_max = tokenizer_max_len(tokcfg)

    # Max input tokens: prefer tokenizer cap if present; else use context.
    max_input = tok_max if (tok_max is not None and (ctx is None or tok_max <= ctx)) else ctx

    # Theoretical max output tokens: upper bound with empty prompt
    max_output_theoretical = max_input

    return {
        "repo_id": repo_id,
        "arch": cfg.get("_name_or_path") or cfg.get("model_type"),
        "model_type": cfg.get("model_type"),
        "max_context_window": ctx,
        "max_input_tokens": max_input,
        "max_output_tokens_theoretical": max_output_theoretical,
        "tokenizer_model_max_length": tok_max,
        "rope_scaling": cfg.get("rope_scaling"),
        "sliding_window": cfg.get("sliding_window"),
    }

def main():
    rows = []
    for rid in tqdm(REPOS, desc="Scanning HF repos"):
        try:
            rows.append(analyze_repo(rid))
        except Exception as e:
            rows.append({
                "repo_id": rid,
                "arch": None,
                "model_type": None,
                "max_context_window": None,
                "max_input_tokens": None,
                "max_output_tokens_theoretical": None,
                "tokenizer_model_max_length": None,
                "rope_scaling": None,
                "sliding_window": None,
                "error": str(e),
            })

    df = pd.DataFrame(rows)
    cols = [
        "repo_id",
        "model_type",
        "arch",
        "max_context_window",
        "max_input_tokens",
        "max_output_tokens_theoretical",
        "tokenizer_model_max_length",
        "rope_scaling",
        "sliding_window",
        "error",
    ]
    for c in cols:
        if c not in df.columns:
            df[c] = None
    df = df[cols]

    out = Path("unsloth_4bit_context_windows.csv")
    df.to_csv(out, index=False)
    print(df.to_string(index=False, max_colwidth=80))
    print(f"\nSaved: {out.resolve()}")

if __name__ == "__main__":
    main()


Scanning HF repos:  32%|‚ñà‚ñà‚ñà‚ñè      | 112/355 [00:24<01:04,  3.78it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  32%|‚ñà‚ñà‚ñà‚ñè      | 113/355 [00:25<01:14,  3.23it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  32%|‚ñà‚ñà‚ñà‚ñè      | 114/355 [00:25<01:25,  2.83it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  32%|‚ñà‚ñà‚ñà‚ñè      | 115/355 [00:26<01:30,  2.66it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  33%|‚ñà‚ñà‚ñà‚ñé      | 116/355 [00:26<01:28,  2.70it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  33%|‚ñà‚ñà‚ñà‚ñé      | 117/355 [00:26<01:31,  2.59it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  33%|‚ñà‚ñà‚ñà‚ñé      | 118/355 [00:27<01:41,  2.33it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  34%|‚ñà‚ñà‚ñà‚ñç      | 120/355 [00:28<01:27,  2.68it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  34%|‚ñà‚ñà‚ñà‚ñç      | 121/355 [00:28<01:24,  2.78it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  34%|‚ñà‚ñà‚ñà‚ñç      | 122/355 [00:28<01:24,  2.75it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  35%|‚ñà‚ñà‚ñà‚ñç      | 123/355 [00:29<01:27,  2.66it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  35%|‚ñà‚ñà‚ñà‚ñç      | 124/355 [00:29<01:27,  2.63it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  35%|‚ñà‚ñà‚ñà‚ñå      | 125/355 [00:30<01:33,  2.46it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  35%|‚ñà‚ñà‚ñà‚ñå      | 126/355 [00:30<01:33,  2.46it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  36%|‚ñà‚ñà‚ñà‚ñå      | 127/355 [00:30<01:30,  2.53it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  36%|‚ñà‚ñà‚ñà‚ñå      | 128/355 [00:31<01:28,  2.56it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  36%|‚ñà‚ñà‚ñà‚ñã      | 129/355 [00:31<01:35,  2.37it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  37%|‚ñà‚ñà‚ñà‚ñã      | 130/355 [00:32<01:50,  2.04it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  37%|‚ñà‚ñà‚ñà‚ñã      | 131/355 [00:32<01:43,  2.17it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos:  37%|‚ñà‚ñà‚ñà‚ñã      | 132/355 [00:33<01:37,  2.29it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Scanning HF repos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 355/355 [00:55<00:00,  6.42it/s]

                                                        repo_id  model_type                                      arch  max_context_window  max_input_tokens  max_output_tokens_theoretical  tokenizer_model_max_length                                                                     rope_scaling  sliding_window error
                        unsloth/c4ai-command-r-08-2024-bnb-4bit      cohere        CohereForAI/c4ai-command-r-08-2024            131072.0          131072.0                       131072.0                         NaN                                                                             None             NaN  None
                   unsloth/c4ai-command-r-plus-08-2024-bnb-4bit      cohere   CohereForAI/c4ai-command-r-plus-08-2024            131072.0          131072.0                       131072.0                         NaN                                                                             None             NaN  None
                                  unsloth/code




In [4]:
# Compare Llama 8B vs Llama 3B on Mars question
df = run_comparison(
    models_to_compare=["llama-8b", "llama-3b"],
    question="What is Mars and where is Mars?"
)

üöÄ Starting Model Comparison
Question: What is Mars and where is Mars?
Models: ['llama-8b', 'llama-3b']
üñ•Ô∏è System: NVIDIA GeForce RTX 4090 Laptop GPU with 17.17GB VRAM

Testing: Llama 3.1 8B (4-bit)

üì¶ Loading Llama 3.1 8B (4-bit)...
Path: unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit
Loading tokenizer...
Loading model weights...


W0924 17:08:44.659000 39876 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


‚úÖ Model loaded! Using 5.70GB VRAM
Generating response...

Testing: Llama 3.2 3B (4-bit)

üì¶ Loading Llama 3.2 3B (4-bit)...
Path: unsloth/Llama-3.2-3B-Instruct-bnb-4bit
Loading tokenizer...
Loading model weights...
‚úÖ Model loaded! Using 2.25GB VRAM
Generating response...


Model,Words,Sentences,Vocab Diversity,Relevance,Completeness,Coherence,Overall Score,Gen Time (s),Tokens/sec
Llama 3.1 8B (4-bit),212,17,0.731,1.0,1.0,0.178,0.782,44.94,6.4
Llama 3.2 3B (4-bit),235,12,0.672,1.0,1.0,0.326,0.8,39.41,7.6


In [5]:
# Compare different models with custom question
df = run_comparison(
    models_to_compare=["tinyllama", "mistral"],
    question="What is Mars and where is Mars?"
)

üöÄ Starting Model Comparison
Question: What is Mars and where is Mars?
Models: ['tinyllama', 'mistral']
üñ•Ô∏è System: NVIDIA GeForce RTX 4090 Laptop GPU with 17.17GB VRAM

Testing: TinyLlama 1.1B

üì¶ Loading TinyLlama 1.1B...
Path: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Loading tokenizer...
Loading model weights...
‚úÖ Model loaded! Using 2.21GB VRAM
Generating response...

Testing: Mistral 7B (4-bit)

üì¶ Loading Mistral 7B (4-bit)...
Path: unsloth/mistral-7b-instruct-v0.2-bnb-4bit
Loading tokenizer...
Loading model weights...
‚úÖ Model loaded! Using 4.13GB VRAM
Generating response...


Model,Words,Sentences,Vocab Diversity,Relevance,Completeness,Coherence,Overall Score,Gen Time (s),Tokens/sec
TinyLlama 1.1B,100,4,0.79,1.0,1.0,0.237,0.805,8.72,17.2
Mistral 7B (4-bit),175,10,0.651,1.0,1.0,0.426,0.815,36.93,6.8
