In [None]:
# 4_inference_offline.ipynb
"""
Run the fine-tuned QLoRA model offline for local text generation or translation.
"""

In [None]:
# 📦 Step 1: Install if needed (use pre-downloaded wheels if offline)
!pip install transformers peft bitsandbytes

In [None]:
# 🧠 Step 2: Load tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

# Both the tokenizer and the model are read from this directory.
model_path = "../models/llama3-8b-qlora-output"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load in float16 and let `device_map="auto"` choose the device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="auto"
)

In [None]:
# 🗣️ Step 3: Create generation pipeline
# Wrap the already-loaded model and tokenizer in a "text-generation" pipeline.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
# 🧪 Step 4: Generate responses (adapt prompts to your language)
prompt = "Translate this sentence into Kanien’kéha: The water is cold."

# max_new_tokens bounds only the generated continuation; max_length would also
# count the prompt tokens, leaving less room for the answer as prompts grow.
# return_full_text=False returns only the continuation, so the prompt is not
# repeated under "RESPONSE" in the log written below.
results = generator(
    prompt,
    max_new_tokens=128,
    num_return_sequences=1,
    do_sample=True,
    return_full_text=False,
)
response = results[0]["generated_text"]

print("🗨️ Generated Response:")
print(response)

# Optional: Save outputs for team review (append mode keeps earlier entries)
with open("../models/output_log.txt", "a", encoding="utf-8") as f:
    f.write(f"PROMPT: {prompt}\nRESPONSE: {response}\n\n")