# Identify Model Requirements and Decide Which of These Optimisations Performs Better

## Model Optimisations

### Baseline Inference

In [None]:
import time
import numpy as np
import torch
from pathlib import Path

from litgpt.lora import GPT
from litgpt.tokenizer import Tokenizer
from litgpt.prompts import PromptStyle
import lightning as L
from transformers import AutoTokenizer

# === Config ===
# The same identifier is passed to the Hugging Face tokenizer loader and to
# GPT.from_name below.
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
prompt_text = "What food do llamas eat?"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so `padding=True` below has a token to pad with
# (presumably this tokenizer ships without one — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# === Style the prompt
# Wrap the raw question in the "alpaca" prompt style.
prompt_style = PromptStyle.from_name("alpaca")
prompt = prompt_style.apply(prompt_text)

# === Tokenize
# A single prompt is encoded and the resulting tensors are moved to `device`.
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
input_ids = inputs["input_ids"]  # shape: [1, seq_len]

# === Load model
print("🔄 Loading model...")
t0 = time.perf_counter()
# NOTE: this changes the process-wide default floating-point dtype and is never
# restored, so anything executed later in the same session inherits bfloat16.
torch.set_default_dtype(torch.bfloat16)
# Builds the model from its named configuration. No checkpoint is loaded in
# this cell — NOTE(review): the weights are presumably freshly initialised,
# which is fine for timing but not for inspecting outputs; confirm.
model = GPT.from_name(name=model_name)
print(f"✅ Loaded in {time.perf_counter() - t0:.2f} seconds")

# === Finalize model
model = model.to(device)
model.eval()
# Cap the model's sequence length at the length of the benchmark prompt.
model.max_seq_length = input_ids.shape[-1]
# 16 matches the batch size used by the batch benchmark further down.
# NOTE(review): the benchmark forward calls pass no `input_pos`; check whether
# the KV cache is actually exercised by them.
model.set_kv_cache(batch_size=16)
# Recompute the model's RoPE cos/sin caches on the target device.
model.cos, model.sin = model.rope_cache(device=device)
# Fixed seed; called after the model has already been constructed.
L.seed_everything(42, verbose=False)

# === Single-sample Benchmark
# Times repeated forward passes over the single tokenized prompt and reports
# latency percentiles, request throughput and an approximate model size.
print("\n################ Single Inference Benchmark ################")

# CUDA kernels are launched asynchronously: without a synchronize before the
# clock is read, only the launch overhead would be measured, not the forward pass.
sync_cuda = input_ids.is_cuda

num_trials = 100
latencies = []
# Inference only: no_grad avoids recording an autograd graph on every call,
# which would inflate both latency and memory use.
with torch.no_grad():
    # Warm-up (run under the same no_grad context as the timed calls)
    for _ in range(5):
        _ = model(input_ids)
    if sync_cuda:
        torch.cuda.synchronize()

    # Measure latency with the monotonic high-resolution clock
    for _ in range(num_trials):
        start = time.perf_counter()
        _ = model(input_ids)
        if sync_cuda:
            torch.cuda.synchronize()
        latencies.append(time.perf_counter() - start)
latencies = np.array(latencies)

# Model size (float32 equivalent): 4 bytes per trainable parameter, in MB,
# regardless of the dtype the parameters are actually stored in.
model_size = sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / 1e6

print("\n🧪 Lit-GPT TinyLLaMA — Single Inference")
print(f"Model Size: {model_size:.2f} MB (approx)")
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# === Batch Throughput Benchmark
# Times forward passes over the prompt repeated `batch_size` times and reports
# token throughput and mean batch latency.
print("\n################ Batch Inference Benchmark ################")

batch_size = 16
batch_input = input_ids.repeat(batch_size, 1)  # shape: [batch, seq_len]
batch_latencies = []
num_batches = 50

# CUDA kernels are launched asynchronously: synchronize before reading the
# clock so the whole forward pass is timed, not just the kernel launches.
sync_cuda = batch_input.is_cuda

# Inference only: no_grad avoids recording an autograd graph on every call.
with torch.no_grad():
    # Warm-up (run under the same no_grad context as the timed calls)
    for _ in range(5):
        _ = model(batch_input)
    if sync_cuda:
        torch.cuda.synchronize()

    # Measure batch latencies with the monotonic high-resolution clock
    for _ in range(num_batches):
        start = time.perf_counter()
        _ = model(batch_input)
        if sync_cuda:
            torch.cuda.synchronize()
        batch_latencies.append(time.perf_counter() - start)

# Every token of every batch counts as processed once per forward pass.
total_tokens = batch_size * batch_input.shape[1] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print("\n📦 Lit-GPT TinyLLaMA — Batch Inference")
print(f"Batch Size: {batch_size}")
print(f"Total Tokens: {total_tokens}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Avg Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")


#### Compiled Model

In [None]:
# To benchmark the compiled model, add the call below to the baseline
# inference code above and run that code again (presumably once the model has
# been finalised and before the warm-up loop — TODO confirm placement).

model.compile()

### Quantised Model

In [None]:
import torch
from transformers import AutoTokenizer
from litgpt.lora import GPT
from litgpt.prompts import PromptStyle
import os

# === Config ===
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
checkpoint_path = f"../../checkpoints/{model_name}/lit_model.pth"
quant_path = "../../quantized_litgpt_tinyllama.pt"
device = "cpu"

# === Tokenizer
# NOTE: the tokenized prompt is not used by the quantization steps in this
# cell; it is kept so the cell still defines the same names as before.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

prompt = "Symptoms: fever and cough\nQuestion: What should I do?\nAnswer:"
prompt_style = PromptStyle.from_name("alpaca")
prompt = prompt_style.apply(prompt)
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs["input_ids"]

# === Load Lit-GPT model
# The model is materialised directly on the CPU (not on the meta device), so
# its parameters and buffers are already allocated and initialised. Calling
# `to_empty` here would swap them for uninitialised storage, and anything the
# checkpoint does not cover would then silently hold garbage.
model = GPT.from_name(name=model_name)

# SECURITY: torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load(checkpoint_path, map_location="cpu")
# strict=False tolerates key differences between checkpoint and model, but the
# differences are reported instead of being silently ignored.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "⚠️ Checkpoint/model key mismatch — "
        f"missing: {load_result.missing_keys}, unexpected: {load_result.unexpected_keys}"
    )
model.eval()

# === Quantize using PyTorch dynamic quant
print("⚡ Quantizing model...")
quantized_core = torch.quantization.quantize_dynamic(
    model,  # the whole model is passed, so every nn.Linear inside it is quantized
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# === Save the quantized model's state dict
torch.save(quantized_core.state_dict(), quant_path)
print(f"✅ Saved quantized Lit-GPT TinyLLaMA core model to: {quant_path}")



#### Inference on Quantised Model

In [None]:
import time
import numpy as np
import torch
from transformers import AutoTokenizer
from litgpt.lora import GPT
from litgpt.prompts import PromptStyle
import lightning as L

# === Config
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
checkpoint_path = f"../../checkpoints/{model_name}/lit_model.pth"
quant_path = "../../quantized_litgpt_tinyllama.pt"
device = "cpu"  # dynamic quant runs on CPU

# === Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# === Prompt
prompt_text = "What food do llamas eat?"
prompt = PromptStyle.from_name("alpaca").apply(prompt_text)
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs["input_ids"]

# === Build the float model
# Construct on the meta device (no memory allocated), then allocate
# uninitialised CPU storage and fill it from the float checkpoint.
with torch.device("meta"):
    model = GPT.from_name(name=model_name)

model = model.to_empty(device="cpu")
# SECURITY: torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"), strict=False)

# === Rebuild the quantized model and load the saved quantized weights
print("🔄 Loading quantized core...")
# quantize_dynamic returns a quantized copy of the *whole* model; this recreates
# the module structure that the saved quantized state dict expects.
quantized_core = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_core.load_state_dict(torch.load(quant_path))
# Benchmark the quantized module itself. Assigning it to `model.model` would
# only hang it off the float model as an extra attribute, and every
# `model(...)` call below would still execute the unquantized network.
model = quantized_core
model.eval()
model.max_seq_length = input_ids.shape[-1]
model.set_kv_cache(batch_size=16)
# `to_empty` left the RoPE caches uninitialised; recompute them on the CPU.
model.cos, model.sin = model.rope_cache(device="cpu")
L.seed_everything(42, verbose=False)

# === Single-sample Benchmark
# Times repeated CPU forward passes over the single tokenized prompt and
# reports latency percentiles and request throughput.
print("\n################ Single Inference Benchmark ################")

num_trials = 100
latencies = []
# Inference only: no_grad avoids recording an autograd graph on every call,
# which would inflate both latency and memory use.
with torch.no_grad():
    for _ in range(5):  # warm-up
        _ = model(input_ids)

    # perf_counter is monotonic and high-resolution, unlike time.time().
    for _ in range(num_trials):
        start = time.perf_counter()
        _ = model(input_ids)
        latencies.append(time.perf_counter() - start)

latencies = np.array(latencies)
print(f"Median Latency: {np.percentile(latencies, 50)*1000:.2f} ms")
print(f"95th Percentile: {np.percentile(latencies, 95)*1000:.2f} ms")
print(f"99th Percentile: {np.percentile(latencies, 99)*1000:.2f} ms")
print(f"Throughput: {num_trials / np.sum(latencies):.2f} req/sec")

# === Batch Inference Benchmark
# Times CPU forward passes over the prompt repeated `batch_size` times and
# reports token throughput and mean batch latency.
print("\n################ Batch Inference Benchmark ################")
batch_size = 16
num_batches = 50
batch_input = input_ids.repeat(batch_size, 1)
batch_latencies = []

# Inference only: no_grad avoids recording an autograd graph on every call.
with torch.no_grad():
    # Warm-up
    for _ in range(5):
        _ = model(batch_input)

    # Benchmark (perf_counter is monotonic and high-resolution)
    for _ in range(num_batches):
        start = time.perf_counter()
        _ = model(batch_input)
        batch_latencies.append(time.perf_counter() - start)

# Every token of every batch counts as processed once per forward pass.
total_tokens = batch_size * batch_input.shape[1] * num_batches
batch_fps = total_tokens / np.sum(batch_latencies)

print(f"Batch Size: {batch_size}")
print(f"Total Tokens: {total_tokens}")
print(f"Batch Throughput: {batch_fps:.2f} tokens/sec")
print(f"Average Batch Latency: {np.mean(batch_latencies)*1000:.2f} ms")

