In [None]:
# üìã Step 1: Verify Kaggle GPU Environment
!nvidia-smi
print("\n" + "="*60)
!nvcc --version

In [None]:
# üì¶ Step 2: Install Build Dependencies
!apt-get update -qq
!apt-get install -y -qq cmake ninja-build ccache
!pip install -q huggingface_hub tqdm requests

In [None]:
# üì• Step 3: Clone llama.cpp (Latest)
import os
os.chdir('/kaggle/working')

!git clone --depth 1 https://github.com/ggml-org/llama.cpp.git
%cd llama.cpp
!git log -1 --oneline

In [None]:
# üî® Step 4: Build llama.cpp with CUDA 12 for Dual T4 (SM 7.5)
import os
os.chdir('/kaggle/working/llama.cpp')

# Clean any previous build
!rm -rf build

# Configure with CMake
# Key flags:
# - GGML_CUDA=ON: Enable CUDA backend
# - CMAKE_CUDA_ARCHITECTURES=75: Tesla T4 (Turing)
# - GGML_CUDA_FA_ALL_QUANTS=ON: FlashAttention for all quantization types
# - BUILD_SHARED_LIBS=OFF: Static linking for portability
# - LLAMA_SERVER_SSL=OFF: No SSL (simpler for local use)

!cmake -B build -G Ninja \
    -DGGML_CUDA=ON \
    -DCMAKE_CUDA_ARCHITECTURES="75" \
    -DGGML_CUDA_FA_ALL_QUANTS=ON \
    -DGGML_NATIVE=OFF \
    -DBUILD_SHARED_LIBS=OFF \
    -DLLAMA_BUILD_EXAMPLES=ON \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_BUILD_SERVER=ON \
    -DCMAKE_BUILD_TYPE=Release

print("\n‚úÖ CMake configuration complete!")

In [None]:
# üèóÔ∏è Step 5: Compile (this takes ~5-10 minutes)
import os
os.chdir('/kaggle/working/llama.cpp')

# Build with all available CPU cores
!cmake --build build --config Release -j$(nproc)

print("\n‚úÖ Build complete!")

In [None]:
# üìã Step 6: Verify Built Binaries
import os
os.chdir('/kaggle/working/llama.cpp/build/bin')

print("Built binaries:")
!ls -lh llama-*

print("\n" + "="*60)
print("Testing llama-server version:")
!./llama-server --version 2>/dev/null || echo "Version flag not supported"

print("\n" + "="*60)
print("Binary sizes:")
!du -sh llama-server llama-cli llama-quantize

In [None]:
# Step 7: Package binaries for llcuda v2.1.2
import os
import shutil
from datetime import datetime

os.chdir('/kaggle/working')

# Package layout: <PACKAGE_DIR>/bin for executables, <PACKAGE_DIR>/lib for
# shared libraries. The name carries today's date (YYYYMMDD).
PACKAGE_NAME = f"llcuda-v2.1.2-cuda12-kaggle-t4x2-{datetime.now().strftime('%Y%m%d')}"
PACKAGE_DIR = f"/kaggle/working/{PACKAGE_NAME}"

os.makedirs(f"{PACKAGE_DIR}/bin", exist_ok=True)
os.makedirs(f"{PACKAGE_DIR}/lib", exist_ok=True)

# Executables we would like to ship; not every llama.cpp revision builds all of them.
BUILD_BIN = "/kaggle/working/llama.cpp/build/bin"
wanted_binaries = [
    "llama-server",
    "llama-cli",
    "llama-quantize",
    "llama-embedding",
    "llama-gguf",
    "llama-gguf-hash",
    "llama-imatrix",
    "llama-export-lora",
    "llama-tokenize",
    "llama-infill",
    "llama-perplexity",
]

# `binaries` is read by later cells (package metadata), so it records only the
# executables that were actually copied rather than everything requested.
binaries = []
for binary in wanted_binaries:
    src = f"{BUILD_BIN}/{binary}"
    if os.path.exists(src):
        # copy2 keeps the file mode, so the executable bit survives.
        shutil.copy2(src, f"{PACKAGE_DIR}/bin/{binary}")
        binaries.append(binary)
        print(f"✅ Copied {binary}")
    else:
        print(f"⚠️ Not found: {binary}")

# Copy shared libraries if any are present at the build root; missing ones are
# skipped silently.
BUILD_LIB = "/kaggle/working/llama.cpp/build"
for lib in ["libllama.so", "libggml.so"]:
    src = f"{BUILD_LIB}/{lib}"
    if os.path.exists(src):
        shutil.copy2(src, f"{PACKAGE_DIR}/lib/{lib}")
        print(f"✅ Copied {lib}")

print(f"\n📦 Package created: {PACKAGE_NAME}")

In [None]:
# Step 8: Write metadata.json describing the package
import json
import subprocess

os.chdir('/kaggle/working')

# Ask git, from inside the checkout, which commit was built and when it was made.
os.chdir('/kaggle/working/llama.cpp')
commit_hash = subprocess.getoutput('git rev-parse HEAD')
commit_date = subprocess.getoutput('git log -1 --format=%ci')

# Timestamp of this packaging run.
build_date = datetime.now().isoformat()

# Hardware the build targets.
gpu_config = {
    "count": 2,
    "model": "Tesla T4",
    "vram_per_gpu_gb": 15,
    "total_vram_gb": 30,
    "compute_capability": "7.5",
    "architecture": "Turing",
}

# The CUDA release string is extracted from the `nvcc --version` output.
cuda_info = {
    "version": subprocess.getoutput('nvcc --version | grep release | sed "s/.*release //" | cut -d, -f1'),
    "architectures": ["sm_75"],
    "flash_attention": True,
}

llama_cpp_info = {
    "commit": commit_hash,
    "commit_date": commit_date,
    "repo": "https://github.com/ggml-org/llama.cpp",
}

feature_list = [
    "multi-gpu-tensor-split",
    "flash-attention",
    "openai-compatible-api",
    "all-quantization-formats",
    "lora-adapters",
    "grammar-constraints",
    "embeddings",
    "reranking",
    "streaming",
]

metadata = {
    "package": "llcuda",
    "version": "2.1.2",
    "build_date": build_date,
    "target_platform": "kaggle",
    "gpu_config": gpu_config,
    "cuda": cuda_info,
    "llama_cpp": llama_cpp_info,
    "binaries": binaries,
    "features": feature_list,
}

# Serialize once; the same text goes to the file and to the cell output.
metadata_json = json.dumps(metadata, indent=2)

os.chdir('/kaggle/working')
with open(f"{PACKAGE_DIR}/metadata.json", "w") as f:
    f.write(metadata_json)

print("‚úÖ Metadata written")
print(metadata_json)

In [None]:
# Step 9: Create the quick start guide (README.md inside the package)
# The template is a raw string on purpose: in a normal triple-quoted string a
# backslash at the end of a line is a line continuation, which would strip the
# "\" markers from the bash example and fuse the command onto one line.
readme_content = r'''# llcuda v2.1.2 - Kaggle 2× Tesla T4 Build

Pre-built CUDA 12 binaries for Kaggle's dual Tesla T4 GPU configuration.

## Quick Start

```bash
# Extract and set permissions
tar -xzf llcuda-v2.1.2-cuda12-kaggle-t4x2-*.tar.gz
chmod +x bin/*

# Run with dual GPU tensor splitting
./bin/llama-server \
    -m /path/to/model.gguf \
    -ngl 99 \
    --split-mode layer \
    --tensor-split 0.5,0.5 \
    -fa \
    --host 0.0.0.0 \
    --port 8080
```

## Multi-GPU Configuration

| Flag | Description |
|------|-------------|
| `-ngl 99` | Offload all layers to GPU |
| `--split-mode layer` | Split model layers across GPUs |
| `--tensor-split 0.5,0.5` | Equal VRAM split between GPUs |
| `-fa` | Enable FlashAttention |

## Python Usage

```python
from llcuda.api import LlamaCppClient, kaggle_t4_dual_config

# Get optimal config for Kaggle dual T4
config = kaggle_t4_dual_config()
print(config.to_cli_args())

# Connect to server
client = LlamaCppClient("http://localhost:8080")

# Chat completion (OpenAI-compatible)
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100
)
print(response.choices[0].message.content)
```

## Recommended Models for 30GB VRAM

| Model | Quantization | Size | Fits? |
|-------|--------------|------|-------|
| Llama 3.1 70B | Q4_K_M | ~40GB | ❌ |
| Llama 3.1 70B | IQ3_XS | ~25GB | ✅ |
| Llama 3.1 8B | Q8_0 | ~9GB | ✅ |
| Qwen2.5 32B | Q4_K_M | ~19GB | ✅ |
| Qwen2.5 14B | Q8_0 | ~15GB | ✅ |
| Gemma 2 27B | Q4_K_M | ~16GB | ✅ |

## Build Info

- **CUDA Version:** 12.4
- **Architecture:** SM 7.5 (Turing)
- **FlashAttention:** Enabled
- **Target:** Kaggle 2× Tesla T4
'''

# The README contains non-ASCII characters, so pin the encoding instead of
# relying on the platform default.
with open(f"{PACKAGE_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("✅ README.md created")

In [None]:
# üì¶ Step 10: Create Distribution Archive
import os
os.chdir('/kaggle/working')

!tar -czvf {PACKAGE_NAME}.tar.gz {PACKAGE_NAME}

print(f"\n" + "="*60)
!ls -lh {PACKAGE_NAME}.tar.gz
print("="*60)
print(f"\n‚úÖ Package ready: {PACKAGE_NAME}.tar.gz")
print(f"\nüì• Download from Kaggle Output tab or use:")
print(f"   !cp /kaggle/working/{PACKAGE_NAME}.tar.gz /kaggle/output/")

In [None]:
# Step 11: Smoke-test the packaged server on both GPUs (optional)
# Downloads a small GGUF model, starts llama-server with the dual-T4 flags,
# polls /health until it answers 200, sends one chat request, then always
# shuts the server down again.

from huggingface_hub import hf_hub_download
import subprocess
import time
import requests

SERVER_URL = "http://127.0.0.1:8080"
STARTUP_TIMEOUT_S = 30     # how long to wait for /health to answer 200
REQUEST_TIMEOUT_S = 120    # upper bound for the single test completion
SERVER_LOG = "/kaggle/working/llama-server-test.log"

# Download a small test model
print("Downloading test model...")
model_path = hf_hub_download(
    repo_id="lmstudio-community/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q4_K_M.gguf",
    cache_dir="/kaggle/working/models"
)
print(f"✅ Model downloaded: {model_path}")

# Start server with multi-GPU config
print("\nStarting llama-server with dual T4 config...")
# NOTE(review): recent llama.cpp revisions may expect a value after -fa
# (on/off/auto); if the server exits immediately, check the log tail printed
# below and adjust this flag -- confirm against `llama-server --help`.
server_cmd = [
    f"{PACKAGE_DIR}/bin/llama-server",
    "-m", model_path,
    "-ngl", "99",
    "--split-mode", "layer",
    "--tensor-split", "0.5,0.5",
    "-fa",
    "--host", "127.0.0.1",
    "--port", "8080",
    "-c", "4096"
]

# Send server output to a file. Pipes that nobody reads can fill up and block
# the child process once the OS pipe buffer is full.
with open(SERVER_LOG, "w") as server_log:
    server = subprocess.Popen(
        server_cmd,
        stdout=server_log,
        stderr=subprocess.STDOUT
    )
    try:
        # Wait for server to start
        print("Waiting for server to start...")
        started_at = time.monotonic()
        ready = False
        while time.monotonic() - started_at < STARTUP_TIMEOUT_S:
            if server.poll() is not None:
                # The process is gone; polling any longer is pointless.
                break
            try:
                health = requests.get(f"{SERVER_URL}/health", timeout=2)
                if health.status_code == 200:
                    ready = True
                    break
            except requests.RequestException:
                # Not accepting connections yet; retry after the pause below.
                pass
            # Pause after every unsuccessful attempt, including non-200 answers
            # (e.g. while the model is still loading), so the retry budget is
            # not burned in a tight loop.
            time.sleep(1)

        if ready:
            print(f"✅ Server ready! ({time.monotonic() - started_at:.0f}s)")

            # Test inference
            print("\nTesting inference...")
            try:
                response = requests.post(
                    f"{SERVER_URL}/v1/chat/completions",
                    json={
                        "messages": [{"role": "user", "content": "Say hello!"}],
                        "max_tokens": 50
                    },
                    timeout=REQUEST_TIMEOUT_S
                )
            except requests.RequestException as exc:
                print(f"❌ Error: {exc}")
            else:
                if response.status_code == 200:
                    result = response.json()
                    print(f"✅ Response: {result['choices'][0]['message']['content']}")
                else:
                    print(f"❌ Error: {response.status_code}")
        else:
            if server.poll() is not None:
                print(f"❌ Server exited during startup (exit code {server.returncode})")
            else:
                print("⚠️ Server startup timeout")
            # Show the end of the server log so the cause is visible in the cell output.
            with open(SERVER_LOG, errors="replace") as log_file:
                print("".join(log_file.readlines()[-20:]))
    finally:
        # Cleanup: always stop the server, even if the test above raised, and
        # reap the process so it does not linger holding GPU memory.
        server.terminate()
        try:
            server.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server.kill()
            server.wait()
        print("\n🛑 Server stopped")

In [None]:
# üìä Final Summary
print("="*60)
print("  llcuda v2.1.2 Build Complete!")
print("="*60)
print(f"\nüì¶ Package: {PACKAGE_NAME}.tar.gz")
!ls -lh /kaggle/working/{PACKAGE_NAME}.tar.gz

print("\nüìÅ Contents:")
!ls -la /kaggle/working/{PACKAGE_NAME}/bin/

print("\nüéØ Target: Kaggle 2√ó Tesla T4 (30 GB VRAM)")
print("   CUDA: 12.4 | SM: 7.5 | FlashAttention: ‚úÖ")
print("\nüìñ See README.md for usage instructions")