# Llama Voice Inference — Gradio

Load your fine-tuned Llama 3.1 8B (LoRA adapter) and generate from it via a Gradio web UI.

## Setup

1. **Set runtime to T4 GPU**
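2. **Run all cells** — approve the Google Drive mount when Cell 1 asks, and make sure a Colab secret named `HF_TOKEN` exists before Cell 2 runs; Gradio launches in Cell 4 with a shareable link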
3. Open the link, type a prompt, get an essay in your voice

In [None]:
# === Cell 1: Install + mount ===
!pip install -q transformers peft bitsandbytes accelerate gradio

import torch
from google.colab import drive
# Mount Google Drive at /content/drive; the adapter directory configured in
# this cell (DRIVE_BASE / ADAPTER_PATH) lives under that mount point.
drive.mount("/content/drive")

# Root folder for this project on the mounted Drive.
DRIVE_BASE = "/content/drive/MyDrive/voice-ft"
# Directory the LoRA adapter (and its tokenizer) are loaded from in Cell 2.
ADAPTER_PATH = "/".join((DRIVE_BASE, "adapters", "llama-voice-v1"))

# Sampling settings forwarded verbatim to model.generate() in Cell 3.
GEN_KWARGS = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
    "do_sample": True,
}

In [None]:
# === Cell 2: Load model ===
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub with the token stored under the
# HF_TOKEN Colab secret.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

# Base checkpoint the LoRA adapter is applied on top of.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit NF4 weights with double quantization; compute happens in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

print("Loading base Llama in 4-bit...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

print("Applying LoRA adapter...")
# The adapter weights and the tokenizer are both read from ADAPTER_PATH.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)

print(f"Ready on {model.device}")

In [None]:
# === Cell 3: Generate function ===

def generate(prompt, max_tokens):
    """Generate a reply to ``prompt`` with the LoRA-adapted model.

    The prompt is wrapped as a single user turn, rendered through the
    tokenizer's chat template, sampled with ``GEN_KWARGS`` and decoded so
    that only the newly generated text is returned.

    Args:
        prompt: User text (the Gradio "Prompt" textbox value).
        max_tokens: Upper bound on newly generated tokens. Coerced with
            ``int()`` before being passed on, so a float from the UI
            slider is accepted.

    Returns:
        The decoded continuation with special tokens stripped, or ``""``
        when the prompt is empty or whitespace-only.
    """
    # Nothing to answer: skip a full (slow) generation on an empty turn.
    if not prompt or not prompt.strip():
        return ""

    messages = [{"role": "user", "content": prompt}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template text already contains the special tokens it
    # needs (for the stock Llama 3.1 template that includes the leading
    # BOS). Tokenizing with the default add_special_tokens=True would
    # prepend a second BOS, so disable it here.
    # NOTE(review): confirm the template saved with the adapter emits BOS.
    inputs = tokenizer(
        input_text, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    # Remember the prompt length so the echo of the prompt can be sliced off.
    input_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = model.generate(
            **inputs, **GEN_KWARGS, max_new_tokens=int(max_tokens)
        )
    return tokenizer.decode(output[0][input_len:], skip_special_tokens=True)

In [None]:
# === Cell 4: Launch Gradio ===
import gradio as gr

# Input widgets, listed in the same order as generate()'s parameters.
prompt_input = gr.Textbox(
    label="Prompt", lines=4, placeholder="Write an essay about..."
)
max_tokens_input = gr.Slider(128, 2048, value=1024, step=128, label="Max tokens")

# Single text area that shows the generated draft.
output_box = gr.Textbox(label="Output", lines=30)

# Clickable presets: each row is [prompt, max tokens].
example_rows = [
    ["Write a personal Substack Note about class in America, from the perspective of a Chinese first-gen immigrant whose family is lower-middle class.", 512],
    ["Write an essay about why dark humor saved my life.", 1024],
    ["Write a Substack Note about the difference between Eileen Gu and Alyssa Liu.", 512],
]

demo = gr.Interface(
    fn=generate,
    inputs=[prompt_input, max_tokens_input],
    outputs=output_box,
    title="Lily's Voice — Llama 3.1 8B (LoRA)",
    description="Fine-tuned on Lily's Substack essays. Type a prompt, get a draft in her voice.",
    examples=example_rows,
)
# share=True asks Gradio for a shareable link in addition to the local one.
demo.launch(share=True)