# Build llama-cpp-python Wheel for Colab

This notebook builds a CUDA-accelerated llama-cpp-python wheel that matches Colab's environment.

**Output:** `llama_cpp_python-{version}+cu126-cp312-cp312-linux_x86_64.whl`

**Steps:**
1. Verify Colab environment (CUDA 12.6, Python 3.12)
2. Install build dependencies
3. Clone llama-cpp-python
4. Build the wheel with CUDA support
5. Rename the wheel with the CUDA tag
6. Test the wheel
7. Download for upload to HuggingFace

---

In [None]:
#@title Step 1: Verify Colab Environment { display-mode: "form" }

# Detects the GPU, CUDA toolkit version, Python version and platform, and
# stores them in BUILD_CONFIG for the later cells.

import os
import platform
import re
import subprocess
import sys


def _run_tool(cmd):
    """Run an external tool and capture its text output.

    Returns the CompletedProcess, or None when the executable cannot be
    started (subprocess.run raises OSError/FileNotFoundError in that case
    rather than returning a non-zero exit status).
    """
    try:
        return subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        return None


print("="*60)
print("COLAB ENVIRONMENT CHECK")
print("="*60)

# Check GPU and CUDA
gpu_query = _run_tool(
    ["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader"]
)
if gpu_query is None or gpu_query.returncode != 0:
    print("ERROR: No GPU detected!")
    print("Go to Runtime -> Change runtime type -> T4 GPU")
    raise SystemExit("GPU required for CUDA wheel build")
print(f"GPU: {gpu_query.stdout.strip()}")

# Get CUDA version from nvcc (the toolkit version is what the build links against)
cuda_version = None
nvcc = _run_tool(["nvcc", "--version"])
if nvcc is not None and nvcc.returncode == 0:
    match = re.search(r'release (\d+\.\d+)', nvcc.stdout)
    if match:
        cuda_version = match.group(1)

if cuda_version is not None:
    print(f"CUDA Version: {cuda_version}")
else:
    # Fall back to the version reported by the driver; default to 12.6 when
    # even that cannot be parsed.
    print("WARNING: nvcc not found, checking nvidia-smi...")
    smi = _run_tool(["nvidia-smi"])
    match = re.search(r'CUDA Version: (\d+\.\d+)', smi.stdout) if smi is not None else None
    cuda_version = match.group(1) if match else "12.6"
    print(f"CUDA Version (from driver): {cuda_version}")

# Extract major.minor for wheel tag, e.g. "12.6" -> "cu126"
cuda_tag = f"cu{''.join(cuda_version.split('.')[:2])}"
print(f"CUDA Tag: {cuda_tag}")

# Python version
py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
py_tag = f"cp{sys.version_info.major}{sys.version_info.minor}"
print(f"\nPython: {py_version} ({py_tag})")

# Platform
plat = platform.machine()
print(f"Platform: linux_{plat}")

# Store for later cells
BUILD_CONFIG = {
    'cuda_version': cuda_version,
    'cuda_tag': cuda_tag,
    'py_version': py_version,
    'py_tag': py_tag,
    'platform': f"linux_{plat}",
}

print("\n" + "="*60)
print(f"Target wheel: llama_cpp_python-*+{cuda_tag}-{py_tag}-{py_tag}-linux_{plat}.whl")
print("="*60)

In [None]:
#@title Step 2: Install Build Dependencies { display-mode: "form" }

# Installs the Python build tooling into the kernel's interpreter and then
# verifies that a cmake executable is actually runnable.

import subprocess
import sys

print("="*60)
print("INSTALLING BUILD DEPENDENCIES")
print("="*60)

# Install build tools
packages = [
    "build",
    "wheel",
    "setuptools>=61.0",
    "scikit-build-core[pyproject]>=0.5.0",
    "cmake>=3.21",
    "ninja",
]

for pkg in packages:
    print(f"Installing {pkg}...")
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", pkg],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # pip prints the actual error at the end of stderr, so show the tail
        # rather than the first few characters.
        print(f"  WARNING: {result.stderr.strip()[-300:]}")
    else:
        print("  OK")

# Verify cmake. A missing executable makes subprocess.run raise
# FileNotFoundError (an OSError) instead of returning a non-zero status,
# so treat both cases as "cmake not available".
try:
    result = subprocess.run(["cmake", "--version"], capture_output=True, text=True)
    cmake_ok = result.returncode == 0
except OSError:
    cmake_ok = False

if cmake_ok:
    version_lines = result.stdout.strip().splitlines()
    print(f"\ncmake: {version_lines[0] if version_lines else ''}")
else:
    print("ERROR: cmake not available")
    raise SystemExit("cmake required")

print("\n" + "="*60)
print("Build dependencies: READY")
print("="*60)

In [None]:
#@title Step 3: Clone llama-cpp-python { display-mode: "form" }

# Clones the repository (with submodules) into REPO_PATH, changes the working
# directory into the clone, and records the package version in BUILD_CONFIG.
# Requires Step 1 to have been run (BUILD_CONFIG is defined there).

import os
import re
import subprocess

# Using JamePeng's fork for better CUDA support
REPO_URL = "https://github.com/JamePeng/llama-cpp-python.git"
REPO_PATH = "/content/llama-cpp-python"

print("="*60)
print("CLONING LLAMA-CPP-PYTHON")
print("="*60)

if os.path.exists(REPO_PATH):
    print(f"Removing existing clone at {REPO_PATH}...")
    subprocess.run(["rm", "-rf", REPO_PATH], check=True)

print(f"Cloning {REPO_URL}...")
result = subprocess.run(
    ["git", "clone", "--recursive", REPO_URL, REPO_PATH],
    capture_output=True, text=True
)

if result.returncode != 0:
    print("ERROR: Clone failed")
    print(result.stderr)
    raise SystemExit("Clone failed")

print(f"Cloned to {REPO_PATH}")

# Later cells rely on the working directory being the clone.
os.chdir(REPO_PATH)

# Get the version. The patterns are anchored to the start of a line so that
# keys which merely end in "version" (e.g. `minimum-version = "..."`) are not
# mistaken for the project version. If pyproject.toml has no static version,
# fall back to a `__version__` assignment in the package's __init__.py.
# NOTE(review): the llama_cpp/__init__.py location is assumed from the package
# name — confirm against the repository layout.
version_sources = [
    ("pyproject.toml", r'^[ \t]*version\s*=\s*["\']([^"\']+)["\']'),
    (os.path.join("llama_cpp", "__init__.py"), r'^__version__\s*=\s*["\']([^"\']+)["\']'),
]
version = "unknown"
for rel_path, pattern in version_sources:
    if not os.path.exists(rel_path):
        continue
    with open(rel_path, "r", encoding="utf-8") as f:
        match = re.search(pattern, f.read(), re.MULTILINE)
    if match:
        version = match.group(1)
        break

BUILD_CONFIG['llama_version'] = version
print(f"\nVersion: {version}")

# Get latest commit
result = subprocess.run(["git", "log", "-1", "--oneline"], capture_output=True, text=True)
print(f"Commit: {result.stdout.strip()}")

print("\n" + "="*60)
print("Repository: READY")
print("="*60)

In [None]:
#@title Step 4: Build Wheel with CUDA Support { display-mode: "form" }
#@markdown This takes ~7-10 minutes. Be patient!

# Builds the wheel from the clone into dist/ with the CUDA backend enabled,
# streaming the build log to the cell output.

import glob
import os
import subprocess
import sys
import time

REPO_PATH = "/content/llama-cpp-python"
os.chdir(REPO_PATH)

print("="*60)
print("BUILDING WHEEL WITH CUDA SUPPORT")
print("="*60)
print("This takes ~7-10 minutes. Please wait...\n")

# Set build environment for CUDA
env = os.environ.copy()
env["CMAKE_ARGS"] = "-DGGML_CUDA=on"
env["FORCE_CMAKE"] = "1"

# Optional: Set CUDA architectures (T4 = sm_75, but include common ones).
# CMake initialises CMAKE_CUDA_ARCHITECTURES from the CUDAARCHS environment
# variable; an environment variable named CMAKE_CUDA_ARCHITECTURES is ignored.
# env["CUDAARCHS"] = "75;80;86;89;90"

start_time = time.time()

# Build wheel using pip wheel. Invoke pip through the kernel's interpreter
# (as the other cells do) so the wheel is built for the Python that will
# later install and test it.
result = subprocess.run(
    [sys.executable, "-m", "pip", "wheel", ".", "--no-deps", "-w", "dist/", "-v"],
    env=env,
    capture_output=False,  # Show output in real-time
)

elapsed = time.time() - start_time

if result.returncode != 0:
    print(f"\nERROR: Build failed after {elapsed:.0f}s")
    raise SystemExit("Build failed")

print("\n" + "="*60)
print(f"Build completed in {elapsed:.0f} seconds")
print("="*60)

# List built wheels (sorted so the listing is stable across runs)
wheels = sorted(glob.glob(f"{REPO_PATH}/dist/*.whl"))
print("\nBuilt wheels:")
for w in wheels:
    size_mb = os.path.getsize(w) / (1024 * 1024)
    print(f"  {os.path.basename(w)} ({size_mb:.1f} MB)")

In [None]:
#@title Step 5: Rename Wheel with CUDA Tag { display-mode: "form" }

# Copies the built wheel into OUTPUT_DIR under a file name whose version
# carries the CUDA tag as a local version label, and records the result in
# BUILD_CONFIG ('wheel_path', 'wheel_name').
# NOTE(review): only the file name changes; the version stored inside the
# wheel's own metadata is not rewritten — confirm installers accept this.

import glob
import os
import re
import shutil

REPO_PATH = "/content/llama-cpp-python"
OUTPUT_DIR = "/content/wheels"

print("="*60)
print("RENAMING WHEEL WITH CUDA TAG")
print("="*60)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Find the built wheel
wheels = glob.glob(f"{REPO_PATH}/dist/llama_cpp_python*.whl")
if not wheels:
    print("ERROR: No wheel found in dist/")
    raise SystemExit("No wheel found")

# glob order is arbitrary; if dist/ holds several wheels (e.g. from repeated
# builds) take the most recently written one.
original_wheel = max(wheels, key=os.path.getmtime)
original_name = os.path.basename(original_wheel)
print(f"Original: {original_name}")

# Parse wheel name: llama_cpp_python-{version}-{pytag}-{pytag}-{platform}.whl
# Add CUDA tag to version: llama_cpp_python-{version}+cu126-{pytag}-{pytag}-{platform}.whl
match = re.match(r'(llama_cpp_python)-([^-]+)-(.+)', original_name)
if match:
    pkg_name, version, rest = match.groups()

    cuda_tag = BUILD_CONFIG.get('cuda_tag', 'cu126')
    _public, has_local, local = version.partition('+')
    if not has_local:
        new_version = f"{version}+{cuda_tag}"
    elif cuda_tag in local.split('.'):
        # Already tagged; leave the version untouched.
        new_version = version
    else:
        # A version may contain only one '+'; further local-version segments
        # are separated with '.'.
        new_version = f"{version}.{cuda_tag}"
    new_name = f"{pkg_name}-{new_version}-{rest}"
else:
    print("WARNING: Could not parse wheel name, using original")
    new_name = original_name

# Copy and rename
new_wheel = os.path.join(OUTPUT_DIR, new_name)
shutil.copy(original_wheel, new_wheel)

size_mb = os.path.getsize(new_wheel) / (1024 * 1024)
print(f"New:      {new_name}")
print(f"Size:     {size_mb:.1f} MB")
print(f"Location: {new_wheel}")

BUILD_CONFIG['wheel_path'] = new_wheel
BUILD_CONFIG['wheel_name'] = new_name

print("\n" + "="*60)
print("Wheel renamed: DONE")
print("="*60)

In [None]:
#@title Step 6: Test the Wheel { display-mode: "form" }

# Installs the renamed wheel into the kernel's interpreter and runs a short
# smoke test in a fresh Python process: import the package, import Llama, and
# ask the library whether GPU offload is supported.

import os
import subprocess
import sys
import tempfile

print("="*60)
print("TESTING THE WHEEL")
print("="*60)

wheel_path = BUILD_CONFIG.get('wheel_path')
if not wheel_path:
    print("ERROR: No wheel path stored. Run previous cells first.")
    raise SystemExit()

# Install the wheel
print(f"Installing {os.path.basename(wheel_path)}...")
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", wheel_path, "--force-reinstall"],
    capture_output=True, text=True
)

if result.returncode != 0:
    print("ERROR: Install failed")
    print(result.stderr)
    raise SystemExit("Install failed")
print("Installed successfully")

# Test import and CUDA detection.
# The script exits non-zero on any failure (uncaught exception, or a build
# without GPU offload) so that the cell cannot report PASSED for a broken or
# CPU-only wheel. The mere presence of an `n_gpu_layers` parameter is not
# checked because a Python signature does not depend on the compiled backend.
print("\nTesting import and CUDA...")
test_code = '''
import sys
import llama_cpp
print(f"Version: {llama_cpp.__version__}")
print(f"Loaded from: {llama_cpp.__file__}")

from llama_cpp import Llama
print("Llama class: OK")

supports_gpu = getattr(llama_cpp, "llama_supports_gpu_offload", None)
if supports_gpu is None:
    print("GPU offload check: UNAVAILABLE (llama_supports_gpu_offload not exported)")
    sys.exit(1)
if supports_gpu():
    print("GPU offload: SUPPORTED (CUDA build confirmed)")
else:
    print("GPU offload: NOT SUPPORTED (CPU-only build)")
    sys.exit(1)
'''

# Earlier cells chdir into the clone. `python -c` puts the current directory
# first on sys.path, so an in-tree `llama_cpp` package there would shadow the
# wheel that was just installed. Run the test from a neutral directory.
result = subprocess.run(
    [sys.executable, "-c", test_code],
    capture_output=True, text=True,
    cwd=tempfile.gettempdir(),
)

print(result.stdout)

if result.returncode != 0:
    if result.stderr:
        # Show the end of stderr: that is where the traceback's error line is.
        print(f"Errors:\n{result.stderr.strip()[-2000:]}")
    print("\nWARNING: Test had issues, but wheel may still work")
else:
    if result.stderr:
        print(f"Warnings: {result.stderr[:200]}")
    print("\n" + "="*60)
    print("Wheel test: PASSED")
    print("="*60)

In [None]:
#@title Step 7: Download Wheel { display-mode: "form" }
#@markdown Downloads the wheel to your local machine via browser.

# Sends the wheel recorded in BUILD_CONFIG['wheel_path'] to the browser using
# Colab's file-download helper. Stops early if no wheel has been produced.

from google.colab import files
import os

print("="*60, "DOWNLOAD WHEEL", "="*60, sep="\n")

wheel_path = BUILD_CONFIG.get('wheel_path')
if not (wheel_path and os.path.exists(wheel_path)):
    print("ERROR: Wheel not found. Run previous cells first.")
    raise SystemExit()

wheel_name = os.path.basename(wheel_path)
size_mb = os.path.getsize(wheel_path) / (1024 * 1024)

# Report what is about to be downloaded.
print(f"Wheel: {wheel_name}", f"Size:  {size_mb:.1f} MB", "\nStarting download...", sep="\n")

files.download(wheel_path)

# Leading empty item reproduces the blank line before the closing banner.
print("", "="*60, "Download started!", "="*60, sep="\n")

## Upload to HuggingFace

After downloading the wheel, upload it to HuggingFace:

### Option A: Web UI
1. Go to https://huggingface.co/meizhong986/whisperjav-wheels (create if needed)
2. Click "Add file" -> "Upload files"
3. Upload the `.whl` file

### Option B: CLI
```bash
pip install huggingface_hub
huggingface-cli login
huggingface-cli upload meizhong986/whisperjav-wheels llama_cpp_python-*.whl
```

### Install URL
Once uploaded, users can install with:
```bash
pip install https://huggingface.co/meizhong986/whisperjav-wheels/resolve/main/llama_cpp_python-{version}+cu126-cp312-cp312-linux_x86_64.whl
```

In [None]:
#@title (Optional) Upload to HuggingFace via CLI { display-mode: "form" }
#@markdown Enter your HuggingFace token to upload directly.

# Uploads the wheel recorded in BUILD_CONFIG['wheel_path'] to the given
# HuggingFace repository. Does nothing when no token is entered.
# NOTE(review): a token typed into the form field is stored in the notebook
# source — clear the field before saving or sharing the notebook.

hf_token = "" #@param {type:"string"}
repo_id = "meizhong986/whisperjav-wheels" #@param {type:"string"}

import subprocess
import sys
import os

if not hf_token:
    print("No token provided. Download the wheel manually and upload via web UI.")
else:
    print("="*60)
    print("UPLOADING TO HUGGINGFACE")
    print("="*60)

    # Make sure there is something to upload before installing anything or
    # touching the remote repository (same guard as the download cell).
    wheel_path = BUILD_CONFIG.get('wheel_path')
    if not wheel_path or not os.path.exists(wheel_path):
        print("ERROR: Wheel not found. Run previous cells first.")
        raise SystemExit()
    wheel_name = os.path.basename(wheel_path)

    # Install huggingface_hub
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "huggingface_hub"], check=True)

    from huggingface_hub import HfApi, create_repo

    api = HfApi(token=hf_token)

    # Create repo if it doesn't exist. Failure here is reported but not fatal:
    # the upload below surfaces the error if the repository is truly unusable.
    try:
        create_repo(repo_id, repo_type="model", exist_ok=True, token=hf_token)
        print(f"Repository: {repo_id}")
    except Exception as e:
        print(f"Note: {e}")

    # Upload wheel
    print(f"Uploading {wheel_name}...")
    api.upload_file(
        path_or_fileobj=wheel_path,
        path_in_repo=wheel_name,
        repo_id=repo_id,
        repo_type="model",
        token=hf_token,
    )

    print("\n" + "="*60)
    print("UPLOAD COMPLETE!")
    print("="*60)
    print("\nInstall URL:")
    print(f"pip install https://huggingface.co/{repo_id}/resolve/main/{wheel_name}")

In [None]:
#@title Summary { display-mode: "form" }

# Prints what was built (read from BUILD_CONFIG) followed by the steps needed
# to publish the wheel and consume it from the install script.

_RULE = "="*60

print(_RULE, "BUILD SUMMARY", _RULE, sep="\n")

# (label, BUILD_CONFIG key) pairs, printed in this order.
_summary_fields = [
    ("\nllama-cpp-python version", 'llama_version'),
    ("CUDA version", 'cuda_version'),
    ("Python version", 'py_version'),
    ("Platform", 'platform'),
]
for _label, _key in _summary_fields:
    print(f"{_label}: {BUILD_CONFIG.get(_key, 'unknown')}")
print(f"\nWheel: {BUILD_CONFIG.get('wheel_name', 'not built')}")

print("\n" + _RULE, "INTEGRATION INSTRUCTIONS", _RULE, sep="\n")

wheel_name = BUILD_CONFIG.get('wheel_name', 'llama_cpp_python-VERSION+cu126-cp312-cp312-linux_x86_64.whl')

# Each list item is one output line; the empty first and last items give the
# leading and trailing blank lines of the instruction text.
_instruction_lines = [
    "",
    "1. Upload wheel to HuggingFace:",
    "   https://huggingface.co/meizhong986/whisperjav-wheels",
    "",
    "2. Update install_colab.sh to install from HuggingFace:",
    " " * 3,  # whitespace-only line inside step 2
    f'   WHEEL_URL="https://huggingface.co/meizhong986/whisperjav-wheels/resolve/main/{wheel_name}"',
    '   uv pip install --python "$VENV_PATH/bin/python" "$WHEEL_URL"',
    "",
    "3. Users get fast llama-cpp-python installation (~seconds vs ~7 minutes)",
    "",
]
print("\n".join(_instruction_lines))