# Voice Inference — Generate from Fine-Tuned Models

Load your saved fine-tuned models and generate essays/Notes from any prompt.
No training here — just loading weights and writing.

## Prerequisites

1. **Set runtime to GPU** (Runtime → Change runtime type → T4 GPU).
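2. **Have saved models on Google Drive** from the training notebooks.
3. **Add a Hugging Face token** as a Colab secret named `HF_TOKEN` — Cell 3 reads it to log in before downloading the base Llama model.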

## Cell Map

| Cell | What it does |
|------|--------------|
| 1 | Install dependencies |
| 2 | Mount Drive + config |
| 3 | Load fine-tuned Llama (LoRA adapter) |
| 4 | Generate from Llama |
| 5 | Load fine-tuned GPT-2-XL |
| 6 | Generate from GPT-2 |
| 7 | Side-by-side comparison |
| 8 | Gradio web UI (shareable link) |

In [None]:
# === Cell 1: Install dependencies ===
# Installs the libraries the later cells rely on:
#   transformers  — model/tokenizer loading and BitsAndBytesConfig (Cells 3, 5)
#   peft          — PeftModel, used to attach the LoRA adapter (Cell 3)
#   gradio        — the web UI (Cell 8)
#   bitsandbytes, accelerate — never imported directly in this notebook;
#       presumably required behind the scenes for the 4-bit load and
#       device_map="auto" in Cell 3 — TODO confirm.
# -q keeps pip's output short.
!pip install -q transformers peft bitsandbytes accelerate gradio

In [None]:
# === Cell 2: Mount Drive + config ===
# Mounts Google Drive and defines the paths and sampling settings that every
# later cell reads. Run this before any load/generate cell.

import torch
from google.colab import drive

# Expose Google Drive inside the runtime's filesystem.
drive.mount("/content/drive")

# Root folder on Drive that holds everything for this project.
DRIVE_BASE = "/content/drive/MyDrive/voice-ft"

# Paths to saved models — update these if you saved to a different location
LLAMA_ADAPTER_PATH = f"{DRIVE_BASE}/adapters/llama-voice-v1"
GPT2_MODEL_PATH = f"{DRIVE_BASE}/models/gpt2-voice-v1"

# Generation hyperparameters — same as training notebooks.
# Unpacked into every generate() call, for both models.
GEN_KWARGS = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
    "do_sample": True,
}

# Echo the resolved paths so a wrong location is obvious before loading.
print(f"Llama adapter: {LLAMA_ADAPTER_PATH}")
print(f"GPT-2 model: {GPT2_MODEL_PATH}")

In [None]:
# === Cell 3: Load fine-tuned Llama (base model + LoRA adapter) ===
# Loads the base Llama checkpoint in 4-bit, attaches the LoRA adapter saved at
# LLAMA_ADAPTER_PATH, and loads the tokenizer from that same folder.
# Requires Cell 2 (torch, LLAMA_ADAPTER_PATH). Defines llama_model and
# llama_tokenizer, which Cells 4 and 8 use.

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from huggingface_hub import login
from google.colab import userdata

# Log in to Hugging Face with the token stored under the name "HF_TOKEN"
# (read through google.colab.userdata).
login(token=userdata.get("HF_TOKEN"))

# Base checkpoint the adapter is applied to.
# NOTE(review): must be the same base the adapter was trained on — confirm
# against the training notebook.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit quantization settings: NF4 quant type, float16 compute dtype,
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print("Loading base Llama in 4-bit...")
llama_base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Applying LoRA adapter...")
llama_model = PeftModel.from_pretrained(llama_base, LLAMA_ADAPTER_PATH)
# The tokenizer is read from the adapter folder, not from MODEL_ID — assumes
# the training notebook saved the tokenizer next to the adapter (TODO confirm).
llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_ADAPTER_PATH)

print(f"Llama loaded on: {llama_model.device}")

In [None]:
# === Cell 4: Generate from Llama ===
# Change the prompt and max_new_tokens to whatever you want.
# Short Note/Tweet: max_new_tokens=256
# Full essay: max_new_tokens=1024 or higher
# Requires Cell 2 (GEN_KWARGS) and Cell 3 (llama_model, llama_tokenizer).
# Leaves the generated text in `llama_output` for Cell 7.

prompt = "Write an essay about why dark humor saved my life"  # <-- EDIT THIS
max_new_tokens = 1024  # <-- shorter for Notes, longer for essays

# Render the prompt as a single user turn with the tokenizer's chat template.
# add_generation_prompt=True ends the text where the assistant's reply begins.
messages = [{"role": "user", "content": prompt}]
input_text = llama_tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# The rendered template already contains the special tokens, so tell the
# tokenizer not to add them again — otherwise the sequence starts with a
# doubled beginning-of-text token.
# NOTE(review): if the training notebook tokenized with the doubled token,
# decide which convention to keep.
inputs = llama_tokenizer(
    input_text, return_tensors="pt", add_special_tokens=False
).to(llama_model.device)
# Remember the prompt length so only the newly generated tokens are decoded.
input_len = inputs["input_ids"].shape[1]

with torch.no_grad():
    output = llama_model.generate(**inputs, **GEN_KWARGS, max_new_tokens=max_new_tokens)

# Drop the prompt tokens and decode just the continuation.
llama_output = llama_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
print(f"Prompt: {prompt}\n")
print(f"{'─'*60}\n")
print(llama_output)

In [None]:
# === Cell 5: Load fine-tuned GPT-2-XL ===
# Loads the fully fine-tuned model and its tokenizer from GPT2_MODEL_PATH and
# moves the model to the GPU when one is available.
# Requires Cell 2 (torch, GPT2_MODEL_PATH). Defines gpt2_model, gpt2_tokenizer
# and device.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick the device first so the dtype can follow it: half precision on the GPU,
# full precision on the CPU fallback (float16 on CPU is slow and, depending on
# the PyTorch version, not implemented for every op).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_dtype = torch.float16 if device.type == "cuda" else torch.float32

print("Loading fine-tuned GPT-2-XL...")
gpt2_model = AutoModelForCausalLM.from_pretrained(GPT2_MODEL_PATH, torch_dtype=gpt2_dtype)
gpt2_tokenizer = AutoTokenizer.from_pretrained(GPT2_MODEL_PATH)
# Reuse the end-of-sequence token as the pad token.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

gpt2_model = gpt2_model.to(device)

print(f"GPT-2-XL loaded on: {device}")

In [None]:
# === Cell 6: Generate from GPT-2 ===
# Uses completion-style format (prompt + separator, then model continues).
# Prompt tokens + new tokens must fit in the model's context window
# (config.n_positions), so the new-token count is clamped and an over-long
# prompt is trimmed from the left before generating.
# Requires Cell 2 (GEN_KWARGS) and Cell 5 (gpt2_model, gpt2_tokenizer).
# Leaves the generated text in `gpt2_output` for Cell 7.

prompt = "Write an essay about why dark humor saved my life"  # <-- EDIT THIS
max_new_tokens = 512

input_text = f"{prompt}\n\n---\n\n"
inputs = gpt2_tokenizer(input_text, return_tensors="pt").to(gpt2_model.device)

# Keep prompt + continuation inside the context window. Trimming from the left
# keeps the end of the prompt and the "---" separator the model continues from.
context_len = gpt2_model.config.n_positions
max_new_tokens = max(1, min(max_new_tokens, context_len - 1))
prompt_budget = context_len - max_new_tokens
if inputs["input_ids"].shape[1] > prompt_budget:
    print(f"Note: prompt trimmed to its last {prompt_budget} tokens to fit the context window.\n")
inputs = {name: tensor[:, -prompt_budget:] for name, tensor in inputs.items()}
# Remember the prompt length so only the newly generated tokens are decoded.
input_len = inputs["input_ids"].shape[1]

with torch.no_grad():
    output = gpt2_model.generate(
        **inputs, **GEN_KWARGS,
        max_new_tokens=max_new_tokens,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )

# Drop the prompt tokens and decode just the continuation.
gpt2_output = gpt2_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
print(f"Prompt: {prompt}\n")
print(f"{'─'*60}\n")
print(gpt2_output)

In [None]:
# === Cell 7: Side-by-side comparison ===
# Run this after generating from both models with the same prompt.
# Prints `llama_output` (Cell 4) and `gpt2_output` (Cell 6) under banners.
# `prompt` is whatever the most recently run generation cell assigned.

_rule = "#" * 70  # banner line shared by both sections

print(f"Prompt: {prompt}")

print("\n" + _rule)
print("  LLAMA (fine-tuned, LoRA)")
print(_rule + "\n")
print(llama_output)

print("\n" + _rule)
print("  GPT-2-XL (fine-tuned, full)")
print(_rule + "\n")
print(gpt2_output)

In [None]:
# === Cell 8: Gradio web UI ===
# Launches a web app with a temporary, shareable public link (Gradio prints the
# expiry at launch: 1 week on current releases, 72 hours on older ones).
# Anyone with the link can type prompts and get outputs — no setup needed.
# Uses whichever models are loaded above. If both are loaded, shows both.

import gradio as gr

def generate_llama_fn(prompt, max_tokens):
    """Generate a reply to *prompt* with the fine-tuned Llama model.

    Args:
        prompt: Text of the single user turn.
        max_tokens: Upper bound on newly generated tokens; converted with
            ``int()`` before use, so a float slider value is accepted.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped).
    """
    messages = [{"role": "user", "content": prompt}]
    input_text = llama_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already contains the special tokens, so they must
    # not be added a second time (that would double the beginning-of-text
    # token at the start of the sequence).
    inputs = llama_tokenizer(
        input_text, return_tensors="pt", add_special_tokens=False
    ).to(llama_model.device)
    # Prompt length, used below to slice off everything but the new tokens.
    input_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = llama_model.generate(**inputs, **GEN_KWARGS, max_new_tokens=int(max_tokens))
    return llama_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)

def generate_gpt2_fn(prompt, max_tokens):
    """Generate a continuation of *prompt* with the fine-tuned GPT-2 model.

    The prompt is followed by a ``---`` separator and the model continues from
    there. Prompt tokens plus new tokens are kept within the model's context
    window (``config.n_positions``): the requested token count is clamped and
    an over-long prompt is trimmed from the left, so arbitrary UI input cannot
    run past the position embeddings.

    Args:
        prompt: Text to continue.
        max_tokens: Requested number of new tokens; converted with ``int()``
            and clamped to ``[1, n_positions - 1]``.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped).
    """
    input_text = f"{prompt}\n\n---\n\n"
    inputs = gpt2_tokenizer(input_text, return_tensors="pt").to(gpt2_model.device)

    context_len = gpt2_model.config.n_positions
    new_tokens = max(1, min(int(max_tokens), context_len - 1))
    # Whatever the continuation does not need is available to the prompt.
    # Slicing from the end keeps the separator the model continues from.
    prompt_budget = context_len - new_tokens
    inputs = {name: tensor[:, -prompt_budget:] for name, tensor in inputs.items()}

    # Prompt length, used below to slice off everything but the new tokens.
    input_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = gpt2_model.generate(
            **inputs, **GEN_KWARGS,
            max_new_tokens=new_tokens,
            pad_token_id=gpt2_tokenizer.eos_token_id,
        )
    return gpt2_tokenizer.decode(output[0][input_len:], skip_special_tokens=True)

# Build tabs based on which models are loaded.
# Each interface is stored together with its tab name so the labels always
# match the tabs — including when only the GPT-2 model has been loaded.
tabs = []
tab_names = []

# A model counts as loaded when its variable exists in the notebook's globals,
# i.e. the corresponding load cell has been run.
if "llama_model" in globals():
    tabs.append(gr.Interface(
        fn=generate_llama_fn,
        inputs=[
            gr.Textbox(label="Prompt", lines=3, placeholder="Write an essay about..."),
            gr.Slider(128, 2048, value=1024, step=128, label="Max tokens"),
        ],
        outputs=gr.Textbox(label="Output", lines=25),
        title="Llama 3.1 8B — Fine-Tuned (LoRA)",
    ))
    tab_names.append("Llama")

if "gpt2_model" in globals():
    tabs.append(gr.Interface(
        fn=generate_gpt2_fn,
        inputs=[
            gr.Textbox(label="Prompt", lines=3, placeholder="Write an essay about..."),
            gr.Slider(128, 512, value=512, step=64, label="Max tokens"),
        ],
        outputs=gr.Textbox(label="Output", lines=25),
        title="GPT-2-XL — Fine-Tuned (Full)",
    ))
    tab_names.append("GPT-2")

if tabs:
    app = gr.TabbedInterface(tabs, tab_names=tab_names)
    # share=True requests a public link: anyone who has it can run prompts
    # against the loaded models.
    app.launch(share=True)
else:
    print("No models loaded. Run Cell 3 (Llama) and/or Cell 5 (GPT-2) first.")