# Gemma 2 9B Local Playground (Apple Silicon)

This notebook runs Gemma 2 9B (instruction-tuned, `Q4_K_M` GGUF quantization) locally using `llama-cpp-python` (Metal backend).

If you have not installed dependencies yet in your `llm-local` conda env:

```bash
pip install -U -r requirements.txt jupyter ipykernel
python -m ipykernel install --user --name llm-local --display-name "Python (llm-local)"
```


In [None]:
from pathlib import Path
import os
import time
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


In [None]:
# --- Model source (Hugging Face Hub) ---
REPO_ID = "bartowski/gemma-2-9b-it-GGUF"
FILENAME = "gemma-2-9b-it-Q4_K_M.gguf"
MODEL_DIR = Path("./models")  # local directory the GGUF file is stored in
HF_TOKEN = os.environ.get("HF_TOKEN")  # None when unset; only needed for access-gated repos

# --- Runtime settings handed to llama-cpp-python ---
N_CTX, N_BATCH = 4096, 512
N_GPU_LAYERS = -1  # -1 = all layers on Metal
# Keep two cores free for the rest of the system, but never drop below one
# thread; 8 is assumed when the CPU count cannot be determined.
N_THREADS = max((os.cpu_count() or 8) - 2, 1)
MAX_TOKENS_DEFAULT = 200


In [None]:
# Make sure the target directory exists, then fetch the GGUF file from the Hub.
# NOTE(review): presumably hf_hub_download skips the transfer when the file is
# already present locally — confirm against the huggingface_hub docs.
MODEL_DIR.mkdir(exist_ok=True, parents=True)
model_path = hf_hub_download(
    token=HF_TOKEN,
    local_dir=MODEL_DIR,
    filename=FILENAME,
    repo_id=REPO_ID,
)
print(f"Model path: {model_path}")


In [None]:
# Instantiate the llama.cpp runtime using the values from the config cell.
llm = Llama(
    verbose=False,
    n_threads=N_THREADS,
    n_gpu_layers=N_GPU_LAYERS,
    n_batch=N_BATCH,
    n_ctx=N_CTX,
    model_path=model_path,
)

# Run one tiny deterministic (temperature 0) completion and discard the result,
# so one-time startup cost is paid here rather than in the first measured prompt.
_ = llm("Warmup.", temperature=0.0, max_tokens=8)
print("Model loaded and warmed up.")


In [None]:
def run_prompt(prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT, temperature: float = 0.2, top_p: float = 0.9):
    """Run a single prompt, print the response plus basic speed metrics, and return them.

    The completion is streamed so the arrival time of the first piece of text
    can be recorded separately from the rest of the generation.

    Args:
        prompt: Text handed to ``llm`` unchanged.
        max_tokens: Generation limit forwarded to ``llm``.
        temperature: Sampling temperature forwarded to ``llm``.
        top_p: Nucleus-sampling threshold forwarded to ``llm``.

    Returns:
        dict with the keys ``prompt``, ``response`` (whitespace-stripped text),
        ``prompt_tokens``, ``completion_tokens``, ``ttft_s``, ``decode_tps``
        and ``end_to_end_tps``.

        * ``ttft_s`` is the time until the first non-empty chunk, or the total
          elapsed time when nothing was generated.
        * ``decode_tps`` is the rate of the chunks that arrived *after* the
          first one, measured from the first chunk's arrival to the end of the
          stream; it is 0.0 when fewer than two chunks were produced.
        * ``end_to_end_tps`` is all chunks divided by the total elapsed time.
    """
    prompt_tokens = len(llm.tokenize(prompt.encode("utf-8"), add_bos=True))

    start = time.perf_counter()
    first_token_t = None
    out = []
    # Every non-empty streamed chunk is counted as one token.
    # NOTE(review): assumes one chunk == one token for this llama-cpp-python
    # version — confirm if exact token counts matter.
    completion_tokens = 0

    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )

    for event in stream:
        # Tolerate a missing *or empty* "choices" list instead of raising IndexError.
        choices = event.get("choices") or [{}]
        tok = choices[0].get("text", "")
        if tok:
            out.append(tok)
            completion_tokens += 1
            if first_token_t is None:
                first_token_t = time.perf_counter()

    end = time.perf_counter()
    total_s = end - start
    ttft_s = (first_token_t - start) if first_token_t is not None else total_s
    decode_s = (end - first_token_t) if first_token_t is not None else 0.0

    # The decode window opens when the first chunk arrives, so it only covers
    # the chunks produced after it; counting the first chunk as well would
    # overstate the decode rate.
    tokens_after_first = max(completion_tokens - 1, 0)
    decode_tps = tokens_after_first / decode_s if decode_s > 0 else 0.0
    e2e_tps = completion_tokens / total_s if total_s > 0 else 0.0

    text = "".join(out).strip()

    print("Prompt:")
    print(prompt)
    print("\nResponse:")
    print(text)
    print("\nMetrics:")
    print(f"prompt_tokens: {prompt_tokens}")
    print(f"completion_tokens: {completion_tokens}")
    print(f"ttft_s: {ttft_s:.3f}")
    print(f"decode_tps: {decode_tps:.2f}")
    print(f"end_to_end_tps: {e2e_tps:.2f}")

    return {
        "prompt": prompt,
        "response": text,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "ttft_s": ttft_s,
        "decode_tps": decode_tps,
        "end_to_end_tps": e2e_tps,
    }

### Throughput benchmark

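The helper below measures how many completion tokens per second the model produces for a given prompt. It reports both the end-to-end rate (measured from the moment the request is sent, so it includes the wait for the first token) and the decode-only rate (measured from the arrival of the first token), which are useful when comparing configurations.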


In [None]:
import time

def measure_throughput(prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT,
                       temperature: float = 0.2, top_p: float = 0.9):
    """Stream a completion for ``prompt`` and return tokens-per-second stats.

    Nothing is printed; the caller decides how to present the numbers.

    Args:
        prompt: Text handed to ``llm`` unchanged.
        max_tokens: Generation limit forwarded to ``llm``.
        temperature: Sampling temperature forwarded to ``llm``.
        top_p: Nucleus-sampling threshold forwarded to ``llm``.

    Returns:
        dict with the keys:

        * ``prompt_len`` - length of ``prompt`` in characters (not tokens).
        * ``completion_tokens`` - number of non-empty streamed chunks.
        * ``total_s`` - elapsed time from the call until the stream ended.
        * ``ttft_s`` - time until the first non-empty chunk (``total_s`` when
          nothing was generated).
        * ``decode_s`` - time from the first chunk until the stream ended
          (0.0 when nothing was generated).
        * ``e2e_tps`` - all chunks divided by ``total_s``.
        * ``decode_tps`` - chunks that arrived after the first one divided by
          ``decode_s``; 0.0 when fewer than two chunks were produced.
    """
    start = time.perf_counter()
    first_token = None
    # Every non-empty streamed chunk is counted as one token.
    # NOTE(review): assumes one chunk == one token for this llama-cpp-python
    # version — confirm if exact token counts matter.
    out_tokens = 0

    stream = llm(prompt,
                 max_tokens=max_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 stream=True)

    for evt in stream:
        # Tolerate a missing *or empty* "choices" list instead of raising IndexError.
        choices = evt.get("choices") or [{}]
        if choices[0].get("text", ""):
            out_tokens += 1
            if first_token is None:
                first_token = time.perf_counter()

    end = time.perf_counter()
    total = end - start
    ttft = (first_token - start) if first_token is not None else total
    # No chunk means no decode phase took place, so report 0 rather than the
    # whole elapsed time.
    decode = (end - first_token) if first_token is not None else 0.0

    # The decode window opens when the first chunk arrives, so it only covers
    # the chunks produced after it.
    tokens_after_first = max(out_tokens - 1, 0)

    return {
        "prompt_len": len(prompt),
        "completion_tokens": out_tokens,
        "total_s": total,
        "ttft_s": ttft,
        "decode_s": decode,
        # A rate that cannot be computed is reported as 0.0 (not inf) so the
        # value stays meaningful when printed or aggregated.
        "e2e_tps": out_tokens / total if total > 0 else 0.0,
        "decode_tps": tokens_after_first / decode if decode > 0 else 0.0,
    }

# Demo run: a long, repetitive prompt with a 500-token generation limit,
# followed by a formatted report of the returned stats.
prompt = 50 * "Repeat after me: the quick brown fox jumps over the lazy dog. "
stats = measure_throughput(prompt, max_tokens=500)

# One print call, one report line per argument.
print(
    "=== throughput benchmark ===",
    f"prompt length (chars): {stats['prompt_len']}",
    f"completion tokens: {stats['completion_tokens']}",
    f"total elapsed: {stats['total_s']:.3f}s",
    f"decode phase: {stats['decode_s']:.3f}s",
    f"end-to-end throughput: {stats['e2e_tps']:.2f} tokens/s",
    f"decode-only throughput: {stats['decode_tps']:.2f} tokens/s",
    sep="\n",
)


In [None]:
# Sample prompts; each call prints the response and metrics and keeps the
# returned dict for later inspection.
result_1 = run_prompt(prompt="Explain transformers to a 12-year-old in 5 bullet points.")

# Replace or extend with your own prompts:
result_2 = run_prompt(prompt="Write a Python function for quicksort with docstring.")
