# Build llama-cpp-python Wheel for Colab (Optimized)

This notebook builds a CUDA-accelerated llama-cpp-python wheel that matches Colab's environment.

**Build Optimizations** (from PR #135):
- `CMAKE_BUILD_PARALLEL_LEVEL`: Uses parallel compilation (max 4 jobs on Colab to avoid OOM)
- `CMAKE_CUDA_ARCHITECTURES`: Targets specific GPU (T4 = sm_75) for faster builds and smaller binary
- **Anti-idle**: Keeps Colab connected during long builds
- Estimated build time: roughly **5-10 minutes** when the GPU architecture is detected (10-15 minutes when building for all architectures)

**Output structure** (matches `whisperjav/translate/local_backend.py`):
```
llama-cpp-python/
  cu126/
    llama_cpp_python-{version}-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```

**Steps:**
1. Verify Colab environment (CUDA version, GPU architecture, Python version)
2. Install build dependencies + enable anti-idle
3. Clone llama-cpp-python (JamePeng's fork)
4. Build wheel with CUDA support (optimized)
5. Prepare wheel with correct manylinux tag
6. Test the wheel
7. Download the wheel to your PC (for manual upload), **or**
8. Upload it directly to the HuggingFace dataset repo `mei986/whisperjav-wheels` (requires a write token)

**Tips:**
- Keep the browser tab **active and visible** during build
- If Colab disconnects, check `/content/llama-cpp-python/dist/` - the wheel may have been built

---

In [None]:
#@title Step 1: Verify Colab Environment { display-mode: "form" }
# Probes the runtime (GPU, CUDA toolkit, Python, platform, CPU count) and
# stores the results in the global BUILD_CONFIG dict used by later cells.

import subprocess
import sys
import os
import re
import multiprocessing
import platform


def _run_tool(cmd):
    """Run an external tool and return ``(returncode, stdout)``.

    A missing or non-executable binary is reported as return code 127 with
    empty output instead of raising, so callers can treat "tool absent" and
    "tool failed" through the same ``returncode != 0`` branch.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        return 127, ""
    return proc.returncode, proc.stdout


# Ordered (name fragments, compute capability) table for the name-based
# fallback. Order matters because matching is by substring: "a100" must be
# tested before "a10", and "rtx a" (RTX A-series workstation cards) before
# "a30" so that e.g. "rtx a3000" is not mistaken for the A30.
_GPU_NAME_TO_ARCH = (
    # RTX 50 is natively Blackwell; 89 is retained from the original mapping.
    # NOTE(review): confirm the desired target for RTX 50 cards.
    (("rtx 40", "rtx 50", "ada", "l40", "l4"), "89"),  # Ada Lovelace
    (("rtx 30", "rtx a"), "86"),                       # Ampere (GA10x)
    (("a100", "a30"), "80"),                           # Ampere (GA100)
    (("a10", "a40"), "86"),                            # Ampere (GA10x)
    (("rtx 20", "gtx 16", "t4", "quadro rtx"), "75"),  # Turing (Colab T4)
    (("v100", "titan v"), "70"),                       # Volta
    (("p100",), "60"),                                 # Pascal (GP100)
    (("gtx 10", "p40", "p4"), "61"),                   # Pascal (GP10x)
)


def _infer_arch_from_name(gpu_name):
    """Map a GPU product name to a compute-capability string, or None."""
    lowered = gpu_name.lower()
    for fragments, arch_code in _GPU_NAME_TO_ARCH:
        if any(fragment in lowered for fragment in fragments):
            return arch_code
    return None


print("="*60)
print("COLAB ENVIRONMENT CHECK")
print("="*60)

# Check GPU and CUDA driver. A missing nvidia-smi (CPU runtime) takes the
# same path as a failing one.
rc, out = _run_tool(
    ["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader"]
)
if rc != 0:
    print("ERROR: No GPU detected!")
    print("Go to Runtime -> Change runtime type -> T4 GPU")
    raise SystemExit("GPU required for CUDA wheel build")
print(f"GPU: {out.strip()}")

# Detect GPU compute capability for optimized builds
cuda_arch = None
rc, out = _run_tool(["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"])
if rc == 0:
    cap_lines = out.strip().splitlines()
    cap = cap_lines[0].strip() if cap_lines else ""
    # Accept only a well-formed "major.minor" value; anything else falls
    # through to the name-based fallback below.
    cap_match = re.fullmatch(r"(\d+)\.(\d+)", cap)
    if cap_match:
        cuda_arch = f"{cap_match.group(1)}{cap_match.group(2)}"
        print(f"GPU Compute Capability: {cap} (sm_{cuda_arch})")

# Fallback: infer from GPU name if compute_cap not available
if not cuda_arch:
    rc, out = _run_tool(["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"])
    if rc == 0:
        name_lines = out.strip().splitlines()
        # With several GPUs, use the first one (same choice as compute_cap).
        gpu_name = name_lines[0] if name_lines else ""
        cuda_arch = _infer_arch_from_name(gpu_name)
        if cuda_arch:
            print(f"GPU Architecture (inferred): sm_{cuda_arch}")

if not cuda_arch:
    print("WARNING: Could not detect GPU architecture, will build for all architectures (slower)")

# Get CUDA version from nvcc
rc, out = _run_tool(["nvcc", "--version"])
if rc == 0:
    release = re.search(r'release (\d+)\.(\d+)', out)
    if release:
        cuda_major = release.group(1)
        cuda_minor = release.group(2)
        cuda_version = f"{cuda_major}.{cuda_minor}"
        cuda_tag = f"cu{cuda_major}{cuda_minor}"  # e.g., cu126
    else:
        cuda_version = "unknown"
        cuda_tag = "cu126"  # default for Colab
    print(f"CUDA Version: {cuda_version}")
    print(f"Backend Tag: {cuda_tag}")
else:
    print("WARNING: nvcc not found, assuming CUDA 12.6")
    cuda_version = "12.6"
    cuda_tag = "cu126"

# Python version
py_major = sys.version_info.major
py_minor = sys.version_info.minor
py_tag = f"cp{py_major}{py_minor}"
print(f"\nPython: {py_major}.{py_minor} ({py_tag})")

# Platform - use manylinux tag for compatibility
arch = platform.machine()
if arch == "x86_64":
    plat_tag = "manylinux_2_17_x86_64.manylinux2014_x86_64"
else:
    plat_tag = f"linux_{arch}"
print(f"Platform: {plat_tag}")

# Compute optimal parallel build level
# Colab has ~12GB RAM - too many parallel nvcc jobs can cause OOM
# Use conservative setting: max 4 jobs for Colab to avoid memory issues
cores = multiprocessing.cpu_count()
# Detect if running on Colab
is_colab = 'COLAB_GPU' in os.environ or os.path.exists('/content')
if is_colab:
    # Conservative for Colab: max 4 parallel jobs to avoid OOM
    parallel_level = min(4, max(2, cores // 2))
    print(f"\nColab detected: using conservative parallel level")
else:
    # Local machine: use 75% of cores, clamped to [2, 16]
    parallel_level = max(2, min(16, int(cores * 0.75)))

print(f"CPU Cores: {cores}")
print(f"Parallel Build Level: {parallel_level} jobs")

# Store config for later cells
BUILD_CONFIG = {
    'cuda_version': cuda_version,
    'cuda_tag': cuda_tag,
    'cuda_arch': cuda_arch,  # e.g., "75" for T4; None when undetected
    'py_major': py_major,
    'py_minor': py_minor,
    'py_tag': py_tag,
    'plat_tag': plat_tag,
    'parallel_level': parallel_level,
    'is_colab': is_colab,
    # HuggingFace config (must match local_backend.py)
    'hf_repo': 'mei986/whisperjav-wheels',
    'hf_repo_type': 'dataset',
}

print("\n" + "="*60)
expected_wheel = f"llama_cpp_python-VERSION-{py_tag}-{py_tag}-{plat_tag}.whl"
print(f"Expected wheel name: {expected_wheel}")
print(f"Upload path: llama-cpp-python/{cuda_tag}/")
print("="*60)

In [None]:
#@title Step 2: Install Build Dependencies { display-mode: "form" }
# Installs the Python-side build toolchain with pip, verifies that cmake is
# callable, and injects a small keep-alive script into the Colab page.

import subprocess
import sys

print("="*60)
print("INSTALLING BUILD DEPENDENCIES")
print("="*60)

# Install build tools
packages = [
    "build",
    "wheel",
    "setuptools>=61.0",
    "scikit-build-core[pyproject]>=0.5.0",
    "cmake>=3.21",
    "ninja",
    "auditwheel",  # For repairing wheel tags
]

for pkg in packages:
    print(f"Installing {pkg}...")
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", pkg],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # pip reports the actual failure at the END of stderr, so show the
        # tail rather than the first few characters.
        print(f"  WARNING: {result.stderr.strip()[-300:]}")
    else:
        print(f"  OK")

# Verify cmake. A missing executable raises OSError from subprocess.run, so
# it is folded into the same "not available" path as a non-zero exit code.
try:
    result = subprocess.run(["cmake", "--version"], capture_output=True, text=True)
    cmake_ok = result.returncode == 0
except OSError:
    cmake_ok = False

if cmake_ok:
    print(f"\ncmake: {result.stdout.strip().splitlines()[0]}")
else:
    print("ERROR: cmake not available")
    raise SystemExit("cmake required")

# Set up anti-idle to prevent Colab from disconnecting during long builds.
# NOTE: the injected function only logs a heartbeat to the browser console;
# it does not interact with the page, so keep the tab active as well.
print("\n" + "-"*60)
print("Setting up anti-idle to prevent Colab timeout...")
from IPython.display import display, Javascript
display(Javascript('''
    function ClickConnect(){
        console.log("Keeping Colab alive... " + new Date().toLocaleTimeString());
    }
    // Re-running this cell must not stack timers: drop the previous one first.
    if (window.colabAntiIdle) {
        clearInterval(window.colabAntiIdle);
    }
    // Log a heartbeat every 60 seconds
    window.colabAntiIdle = setInterval(ClickConnect, 60000);
    console.log("Anti-idle enabled - Colab will stay connected during build");
'''))
print("Anti-idle enabled (pings every 60s)")

print("\n" + "="*60)
print("Build dependencies: READY")
print("="*60)

In [None]:
#@title Step 3: Clone llama-cpp-python { display-mode: "form" }
# Clones the fork (with submodules) into /content, changes the working
# directory to the checkout, and records the package version in BUILD_CONFIG.

import subprocess
import os
import re

# Using JamePeng's fork for better CUDA support
REPO_URL = "https://github.com/JamePeng/llama-cpp-python.git"
REPO_PATH = "/content/llama-cpp-python"

# Fail before the (slow) clone rather than after it if Step 1 was skipped.
if 'BUILD_CONFIG' not in globals():
    raise SystemExit("BUILD_CONFIG is not defined. Run Step 1 first.")


def _read_text(path):
    """Return the content of ``path``, or an empty string if unreadable."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except OSError:
        return ""


def _detect_version(repo_path):
    """Best-effort lookup of the package version inside the checkout.

    Looks for a ``version = "..."`` key that starts its own line in
    pyproject.toml. Anchoring to the line start matters: an unanchored
    search would also match keys that merely end in "version", such as
    ``minimum-version = "3.21"``. If pyproject.toml has no static version,
    falls back to ``__version__`` in llama_cpp/__init__.py (presumably where
    the package keeps it when the version is dynamic - verify against the
    fork). Returns "unknown" when neither is found.
    """
    pyproject_text = _read_text(os.path.join(repo_path, "pyproject.toml"))
    static = re.search(
        r'^[ \t]*version[ \t]*=[ \t]*["\']([^"\']+)["\']', pyproject_text, re.MULTILINE
    )
    if static:
        return static.group(1)

    init_text = _read_text(os.path.join(repo_path, "llama_cpp", "__init__.py"))
    dunder = re.search(
        r'^__version__[ \t]*=[ \t]*["\']([^"\']+)["\']', init_text, re.MULTILINE
    )
    return dunder.group(1) if dunder else "unknown"


print("="*60)
print("CLONING LLAMA-CPP-PYTHON")
print("="*60)

if os.path.exists(REPO_PATH):
    print(f"Removing existing clone at {REPO_PATH}...")
    subprocess.run(["rm", "-rf", REPO_PATH], check=True)

print(f"Cloning {REPO_URL}...")
result = subprocess.run(
    ["git", "clone", "--recursive", REPO_URL, REPO_PATH],
    capture_output=True, text=True
)

if result.returncode != 0:
    print(f"ERROR: Clone failed")
    print(result.stderr)
    raise SystemExit("Clone failed")

print(f"Cloned to {REPO_PATH}")

# Later cells and the git command below rely on the checkout being the cwd.
os.chdir(REPO_PATH)

version = _detect_version(REPO_PATH)
BUILD_CONFIG['llama_version'] = version
print(f"\nVersion: {version}")

# Get latest commit
result = subprocess.run(["git", "log", "-1", "--oneline"], capture_output=True, text=True)
print(f"Commit: {result.stdout.strip()}")

print("\n" + "="*60)
print("Repository: READY")
print("="*60)

In [None]:
#@title Step 4: Build Wheel with CUDA Support (Optimized) { display-mode: "form" }
#@markdown Uses parallel builds and targeted GPU architecture for faster compilation.
#@markdown Build time: ~5-7 minutes with a detected GPU architecture (~10-15 minutes when building for all architectures).

import subprocess
import sys
import os
import time
import glob

REPO_PATH = "/content/llama-cpp-python"

# Fail with a clear message instead of a NameError if Step 1 was skipped.
if 'BUILD_CONFIG' not in globals():
    raise SystemExit("BUILD_CONFIG is not defined. Run Step 1 first.")

os.chdir(REPO_PATH)

print("="*60)
print("BUILDING WHEEL WITH CUDA SUPPORT (OPTIMIZED)")
print("="*60)

# Get build configuration from Step 1
cuda_arch = BUILD_CONFIG.get('cuda_arch')
parallel_level = BUILD_CONFIG.get('parallel_level', 4)

print(f"\nBuild Optimizations:")
print(f"  • Parallel jobs: {parallel_level}")
if cuda_arch:
    print(f"  • Target architecture: sm_{cuda_arch}")
else:
    print(f"  • Target architecture: all (no specific GPU detected)")

# Build CMAKE_ARGS with optimizations
cmake_args_parts = ["-DGGML_CUDA=on"]
if cuda_arch:
    # Target specific GPU architecture for faster build and smaller binary
    cmake_args_parts.append(f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch}")

cmake_args = " ".join(cmake_args_parts)
print(f"  • CMAKE_ARGS: {cmake_args}")

estimate = "5-7" if cuda_arch else "10-15"
print(f"\nBuilding... (estimated {estimate} minutes)")
print("-"*60 + "\n")

# Set build environment for CUDA with optimizations
env = os.environ.copy()
env["CMAKE_ARGS"] = cmake_args
env["CMAKE_BUILD_PARALLEL_LEVEL"] = str(parallel_level)
env["FORCE_CMAKE"] = "1"

start_time = time.time()

# Build the wheel with the kernel's own interpreter (same convention as the
# other cells) so the wheel is built for the Python version that Step 1
# recorded, not whichever "pip" happens to be first on PATH.
result = subprocess.run(
    [sys.executable, "-m", "pip", "wheel", ".", "--no-deps", "-w", "dist/", "-v"],
    env=env,
    capture_output=False,  # Show output in real-time
)

elapsed = time.time() - start_time

if result.returncode != 0:
    print(f"\nERROR: Build failed after {elapsed:.0f}s")
    raise SystemExit("Build failed")

print("\n" + "="*60)
print(f"Build completed in {elapsed:.0f} seconds ({elapsed/60:.1f} minutes)")
print("="*60)

# List built wheels
wheels = glob.glob(f"{REPO_PATH}/dist/*.whl")
if not wheels:
    # pip reported success but nothing landed in dist/; stop here so the
    # problem is not discovered only in the next step.
    print("\nERROR: Build reported success but no wheel was found in dist/")
    raise SystemExit("No wheel produced")

print(f"\nBuilt wheels:")
for w in wheels:
    size_mb = os.path.getsize(w) / (1024 * 1024)
    print(f"  {os.path.basename(w)} ({size_mb:.1f} MB)")

# Store build time for summary
BUILD_CONFIG['build_time_seconds'] = elapsed

In [None]:
#@title Step 5: Prepare Wheel for Upload { display-mode: "form" }
#@markdown Renames wheel to match manylinux tag expected by local_backend.py
# Copies the built wheel into /content/wheels/llama-cpp-python/<cuda_tag>/
# under a file name carrying the platform tag chosen in Step 1, and records
# the resulting paths in BUILD_CONFIG.
#
# NOTE(review): only the FILE NAME is changed here; the wheel's internal
# metadata is copied as-is. auditwheel is installed in Step 2 but not used -
# confirm that a rename-only relabel is what local_backend.py expects.

import os
import glob
import shutil
import re

REPO_PATH = "/content/llama-cpp-python"
OUTPUT_DIR = "/content/wheels"

print("="*60)
print("PREPARING WHEEL FOR UPLOAD")
print("="*60)

# Create output directory structure matching HuggingFace repo
cuda_tag = BUILD_CONFIG['cuda_tag']
upload_dir = f"{OUTPUT_DIR}/llama-cpp-python/{cuda_tag}"
os.makedirs(upload_dir, exist_ok=True)

# Find the built wheel
wheels = glob.glob(f"{REPO_PATH}/dist/llama_cpp_python*.whl")
if not wheels:
    print("ERROR: No wheel found in dist/")
    raise SystemExit("No wheel found")

# glob returns matches in arbitrary order, so pick the most recently built
# wheel explicitly instead of whatever happens to come first.
original_wheel = max(wheels, key=os.path.getmtime)
if len(wheels) > 1:
    print(f"NOTE: {len(wheels)} wheels found in dist/, using the newest one")
original_name = os.path.basename(original_wheel)
print(f"Original: {original_name}")

# Parse and rebuild wheel name with correct manylinux tag
# Original: llama_cpp_python-{ver}-{py}-{py}-linux_x86_64.whl
# Target:   llama_cpp_python-{ver}-{py}-{py}-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

match = re.match(r'(llama_cpp_python)-([^-]+)-([^-]+)-([^-]+)-(.+)\.whl', original_name)
if match:
    pkg_name, version, py_tag1, py_tag2, old_plat = match.groups()

    # Use the correct manylinux tag
    new_plat = BUILD_CONFIG['plat_tag']
    new_name = f"{pkg_name}-{version}-{py_tag1}-{py_tag2}-{new_plat}.whl"

    print(f"Renaming platform tag: {old_plat} -> {new_plat}")

    # The version embedded in the wheel file name is what ends up in the
    # upload path, so make the value reported in the summary agree with it.
    recorded = BUILD_CONFIG.get('llama_version')
    if recorded != version:
        print(f"Version from wheel name: {version} (previously recorded: {recorded})")
        BUILD_CONFIG['llama_version'] = version
else:
    print("WARNING: Could not parse wheel name, using original")
    new_name = original_name

# Copy to upload directory
dest_wheel = os.path.join(upload_dir, new_name)
shutil.copy(original_wheel, dest_wheel)

size_mb = os.path.getsize(dest_wheel) / (1024 * 1024)
print(f"\nFinal wheel: {new_name}")
print(f"Size: {size_mb:.1f} MB")
print(f"Location: {dest_wheel}")

BUILD_CONFIG['wheel_path'] = dest_wheel
BUILD_CONFIG['wheel_name'] = new_name
BUILD_CONFIG['upload_dir'] = upload_dir

print("\n" + "="*60)
print(f"Ready for upload to: {BUILD_CONFIG['hf_repo']}")
print(f"Path in repo: llama-cpp-python/{cuda_tag}/{new_name}")
print("="*60)

In [None]:
#@title Step 6: Test the Wheel { display-mode: "form" }
# Installs the prepared wheel into the kernel's environment and runs a small
# smoke test in a separate Python process.

import subprocess
import sys
import os
import tempfile

print("="*60)
print("TESTING THE WHEEL")
print("="*60)

wheel_path = BUILD_CONFIG.get('wheel_path')
if not wheel_path:
    print("ERROR: No wheel path stored. Run previous cells first.")
    raise SystemExit()

# Install the wheel
print(f"Installing {os.path.basename(wheel_path)}...")
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", wheel_path, "--force-reinstall"],
    capture_output=True, text=True
)

if result.returncode != 0:
    print(f"ERROR: Install failed")
    print(result.stderr)
    raise SystemExit("Install failed")
print("Installed successfully")

# Test import and CUDA detection.
# Errors are deliberately NOT caught inside the snippet: an exception or an
# explicit sys.exit() must surface as a non-zero exit code so the verdict
# printed below reflects what actually happened.
print("\nTesting import and CUDA...")
test_code = '''
import sys
import llama_cpp
print(f"Version: {llama_cpp.__version__}")
print(f"Loaded from: {llama_cpp.__file__}")

from llama_cpp import Llama
print("Llama class: OK")

# Ask the compiled library itself whether it can offload to a GPU. The mere
# presence of an n_gpu_layers parameter says nothing about how the native
# library was built.
supports_gpu = getattr(llama_cpp, "llama_supports_gpu_offload", None)
if supports_gpu is None:
    print("GPU offload check: UNAVAILABLE (llama_supports_gpu_offload not exported)")
    sys.exit(2)
if supports_gpu():
    print("GPU offload: SUPPORTED (CUDA build confirmed)")
else:
    print("GPU offload: NOT SUPPORTED (library was built without GPU backend?)")
    sys.exit(1)
'''

# Run from an empty scratch directory. "python -c" puts the current working
# directory first on sys.path, and earlier cells chdir into the source
# checkout, so without this the import would pick up the checkout's
# llama_cpp/ directory instead of the wheel that was just installed.
with tempfile.TemporaryDirectory() as scratch_dir:
    result = subprocess.run(
        [sys.executable, "-c", test_code],
        capture_output=True, text=True,
        cwd=scratch_dir,
    )

print(result.stdout)
if result.stderr:
    # The end of stderr holds the exception, if any.
    print(f"Warnings: {result.stderr[-2000:]}")

if result.returncode != 0:
    print("\nWARNING: Test had issues, but wheel may still work")
else:
    print("\n" + "="*60)
    print("Wheel test: PASSED")
    print("="*60)

In [None]:
#@title Step 7: Download Wheel (Manual Upload) { display-mode: "form" }
#@markdown Downloads the wheel to your local machine for manual upload.
# Triggers a browser download of the wheel prepared in Step 5 and prints
# where it should be placed in the HuggingFace repo.

from google.colab import files
import os

banner = "="*60

print(banner)
print("DOWNLOAD WHEEL")
print(banner)

# Abort unless Step 5 recorded a wheel path that still exists on disk.
wheel_path = BUILD_CONFIG.get('wheel_path')
if not (wheel_path and os.path.exists(wheel_path)):
    print("ERROR: Wheel not found. Run previous cells first.")
    raise SystemExit()

wheel_name = os.path.basename(wheel_path)
cuda_tag = BUILD_CONFIG['cuda_tag']
size_mb = os.path.getsize(wheel_path) / (1024 * 1024)

for line in (
    f"Wheel: {wheel_name}",
    f"Size:  {size_mb:.1f} MB",
    f"\nUpload to HuggingFace:",
    f"  Repo: {BUILD_CONFIG['hf_repo']}",
    f"  Path: llama-cpp-python/{cuda_tag}/{wheel_name}",
    "\nStarting download...",
):
    print(line)

files.download(wheel_path)

print("\n" + banner)
print("Download started!")
print(banner)

In [None]:
#@title Step 8: Upload to HuggingFace (Direct) { display-mode: "form" }
#@markdown Enter your HuggingFace token to upload directly.
# Uploads the prepared wheel to the configured HuggingFace repo, or prints
# manual-upload instructions when no token is given.
# NOTE: the form value is written into this cell's source (the hf_token
# assignment below) - clear the field before saving or sharing the notebook.

hf_token = "" #@param {type:"string"}

import subprocess
import sys
import os

# Tolerate whitespace picked up when pasting the token into the form.
hf_token = hf_token.strip()

wheel_path = BUILD_CONFIG.get('wheel_path')
wheel_name = BUILD_CONFIG.get('wheel_name')
# Same precondition as Step 7: Step 5 must have produced a wheel on disk.
if not wheel_path or not wheel_name or not os.path.exists(wheel_path):
    print("ERROR: Wheel not found. Run previous cells first.")
    raise SystemExit()

cuda_tag = BUILD_CONFIG['cuda_tag']
repo_id = BUILD_CONFIG['hf_repo']
# Use the repo type configured in Step 1 instead of a second hardcoded copy.
repo_type = BUILD_CONFIG.get('hf_repo_type', 'dataset')
# huggingface.co prefixes dataset/space URLs; model repos have no prefix.
url_prefix = {"dataset": "datasets/", "space": "spaces/"}.get(repo_type, "")

if not hf_token:
    print("No token provided. Use Step 7 to download and upload manually.")
    print("\nManual upload instructions:")
    print(f"  1. Go to https://huggingface.co/{url_prefix}{repo_id}")
    print(f"  2. Navigate to: llama-cpp-python/{cuda_tag}/")
    print(f"  3. Upload: {wheel_name}")
else:
    print("="*60)
    print("UPLOADING TO HUGGINGFACE")
    print("="*60)

    # Install huggingface_hub
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "huggingface_hub"], check=True)

    from huggingface_hub import HfApi

    # The token is bound to the client once; upload_file reuses it.
    api = HfApi(token=hf_token)

    # Path in repo: llama-cpp-python/{cuda_tag}/{wheel_name}
    path_in_repo = f"llama-cpp-python/{cuda_tag}/{wheel_name}"

    print(f"Repo: {repo_id}")
    print(f"Path: {path_in_repo}")
    print(f"Uploading...")

    api.upload_file(
        path_or_fileobj=wheel_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type=repo_type,
    )

    print("\n" + "="*60)
    print("UPLOAD COMPLETE!")
    print("="*60)
    print(f"\nWheel is now available at:")
    print(f"  https://huggingface.co/{url_prefix}{repo_id}/tree/main/llama-cpp-python/{cuda_tag}")

In [None]:
#@title Summary & Next Steps { display-mode: "form" }
# Prints a recap of the values collected in BUILD_CONFIG by the earlier
# steps, followed by integration instructions. Every lookup tolerates a
# missing key so the cell still renders after a partial run.

print("="*60)
print("BUILD SUMMARY")
print("="*60)

# 'cuda_arch' is stored as None when detection failed, so a .get() default
# would not apply; build the label explicitly to avoid printing "sm_None".
cuda_arch = BUILD_CONFIG.get('cuda_arch')
arch_label = f"sm_{cuda_arch}" if cuda_arch else "all (not detected)"

print(f"\nllama-cpp-python version: {BUILD_CONFIG.get('llama_version', 'unknown')}")
print(f"CUDA version: {BUILD_CONFIG.get('cuda_version', 'unknown')}")
print(f"Backend tag: {BUILD_CONFIG.get('cuda_tag', 'unknown')}")
print(f"GPU architecture: {arch_label}")
print(f"Python: {BUILD_CONFIG.get('py_major', '?')}.{BUILD_CONFIG.get('py_minor', '?')}")
print(f"Platform: {BUILD_CONFIG.get('plat_tag', 'unknown')}")

build_time = BUILD_CONFIG.get('build_time_seconds')
if build_time is not None:
    print(f"\nBuild time: {build_time:.0f}s ({build_time/60:.1f} minutes)")

print(f"\nBuild optimizations used:")
print(f"  • Parallel jobs: {BUILD_CONFIG.get('parallel_level', 'unknown')}")
if cuda_arch:
    print(f"  • Target architecture: sm_{cuda_arch} (optimized)")
else:
    print(f"  • Target architecture: all (generic)")

print(f"\nWheel: {BUILD_CONFIG.get('wheel_name', 'not built')}")

print("\n" + "="*60)
print("INTEGRATION WITH WHISPERJAV")
print("="*60)

wheel_name = BUILD_CONFIG.get('wheel_name', 'llama_cpp_python-VERSION-cpXX-cpXX-manylinux.whl')
version = BUILD_CONFIG.get('llama_version', 'X.X.X')
cuda_tag = BUILD_CONFIG.get('cuda_tag', 'cu126')
hf_repo = BUILD_CONFIG.get('hf_repo', 'mei986/whisperjav-wheels')

print(f"""
After uploading to HuggingFace, update local_backend.py:

1. Update WHEEL_VERSION constant:
   WHEEL_VERSION = "{version}"

2. The wheel will be auto-downloaded when users run:
   whisperjav-translate -i input.srt --provider local

3. Wheel download path:
   {hf_repo}/llama-cpp-python/{cuda_tag}/{wheel_name}

4. Verify with:
   python -c "from whisperjav.translate.local_backend import ensure_llama_cpp_installed; ensure_llama_cpp_installed()"
""")

print("="*60)
print("DONE!")
print("="*60)