> As 3 primeiras células são comuns a todos os notebooks de testes.  
**Criar uma venv, e depois instalar os requirements do arquivo requirements.txt.**  
**CUDA versão 12.8**  

In [None]:
import os, time, json, re
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Choose the compute device once; later cells read the global `device`.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("device:", device)
if device != "cpu":
    # Report the name and total memory (in GiB) of GPU index 0.
    print("GPU:", torch.cuda.get_device_name(0))
    print("VRAM (GB):", round(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3), 2))

def load_bnb_4bit(model_id: str):
    """Load the tokenizer and a 4-bit quantized causal LM for ``model_id``.

    The model is loaded with a ``BitsAndBytesConfig`` that enables 4-bit NF4
    quantization with double quantization and float16 compute, using
    ``device_map="auto"`` for placement.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    quant_options = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.float16,
    }
    quant_cfg = BitsAndBytesConfig(**quant_options)

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    llm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_cfg,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    llm.eval()
    return tokenizer, llm

def format_chat_inst(model_family: str, system: str, user: str):
    """Wrap a system and user message in the prompt template of a model family.

    Args:
        model_family: One of ``"mistral"``, ``"gemma"`` or ``"phi3"``; any
            other value falls back to the raw user message.
        system: System instruction inserted into the template.
        user: User message inserted into the template.

    Returns:
        The formatted prompt string.
    """
    # NOTE(review): these templates are hand-written; confirm they match each
    # model's official chat format before comparing quality across families.
    if model_family == "mistral":
        return f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n{user} [/INST]"

    if model_family == "gemma":
        return (
            f"<start_of_turn>system\n{system}<end_of_turn>\n"
            f"<start_of_turn>user\n{user}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )

    if model_family == "phi3":
        return f"<s>[INST] {system}\n\n{user} [/INST]"

    # Unknown family: send the user message untouched (system is dropped).
    return user

In [None]:
@torch.inference_mode()
def generate_and_measure(tok, model, prompt, max_new_tokens=750, temperature=0.7, top_p=0.9):
    """Generate a completion for ``prompt`` and measure latency, speed and peak VRAM.

    Args:
        tok: Tokenizer; called as ``tok(prompt, return_tensors="pt")`` and used
            to decode the generated ids.
        model: Model exposing ``.device`` and ``.generate(...)``.
        prompt: Fully formatted prompt string.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature (sampling is always enabled).
        top_p: Nucleus sampling threshold.

    Returns:
        A dict with:
            ``text``: the decoded completion only (the prompt is not echoed),
            ``latency_s``: wall-clock seconds spent inside ``generate``,
            ``new_tokens``: number of token positions produced after the prompt,
            ``tokens_per_s``: ``new_tokens / latency_s`` (``inf`` if latency is 0),
            ``vram_gb_peak``: peak allocated CUDA memory in GiB, or ``None``
            when CUDA is not available.
    """
    # Same check that defines the notebook-level `device`; done locally so the
    # function does not depend on a global being set beforehand.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Start each measurement from a clean allocator state.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    inputs = tok(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    if use_cuda:
        # Make sure pending GPU work is not billed to the generation timing.
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tok.eos_token_id,
    )

    if use_cuda:
        # Wait for the GPU to finish before stopping the clock.
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # The output sequence starts with the prompt ids; decode only what comes
    # after them so callers that parse the answer never see the question text.
    txt = tok.decode(out[0][prompt_len:], skip_special_tokens=True)

    # Peak VRAM since the reset above.
    vram_peak = None
    if use_cuda:
        vram_peak = torch.cuda.max_memory_allocated() / (1024**3)  # GB

    # Number of generated token positions.
    new_tokens = out.shape[-1] - prompt_len
    tps = new_tokens / dt if dt > 0 else float("inf")

    return {"text": txt, "latency_s": dt, "new_tokens": new_tokens, "tokens_per_s": tps, "vram_gb_peak": vram_peak}

**Lembre-se de obter o seu token de acesso no site da Hugging Face e adicioná-lo abaixo.**

In [None]:
# Hugging Face access token: paste it between the quotes, or leave it empty to
# reuse an HF_TOKEN that is already exported in the environment.
token = ""
if token:
    os.environ["HF_TOKEN"] = token
else:
    # Do not overwrite a token configured outside the notebook with "".
    token = os.environ.get("HF_TOKEN", "")
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
tok, model = load_bnb_4bit(MODEL_ID)
# Selects the prompt template used by format_chat_inst.
model_family = "mistral"

**Teste básico de aquecimento:**

In [None]:
# Warm-up: one generation with the default sampling settings.
system = "Você é um assistente sênior, responda em pt-BR."
user = "Liste 4 cuidados ao comparar LLMs locais."
prompt = format_chat_inst(model_family, system, user)
res = generate_and_measure(tok, model, prompt)
print(res["text"])
# Bare expression: displayed as the cell output (text plus all metrics).
res

In [None]:
# Baseline benchmark: run every prompt once and record speed/memory metrics.
with open("../data/prompts_baseline.json", "r", encoding="utf-8") as f:
    prompts = json.load(f)

# Create the output directory up front: to_csv does not create missing
# directories, and failing after the whole loop would waste the run.
os.makedirs("../results", exist_ok=True)

rows = []
system = "Responda em pt-BR, com precisão e sem enrolação."
for p in tqdm(prompts):
    prompt = format_chat_inst(model_family, system, p)
    r = generate_and_measure(tok, model, prompt, max_new_tokens=200)
    # Only the metrics are kept; the generated text is not stored here.
    rows.append({
        "model_id": MODEL_ID,
        "prompt": p,
        "latency_s": r["latency_s"],
        "tokens_per_s": r["tokens_per_s"],
        "vram_gb_peak": r["vram_gb_peak"],
        "new_tokens": r["new_tokens"],
    })

df = pd.DataFrame(rows)
df.to_csv(f"../results/{MODEL_ID.split('/')[-1]}_metrics.csv", index=False)
# Bare expression: displayed as the cell output.
df.describe()

In [None]:
import os, json, re
import pandas as pd

# Matches a string that is entirely one number: optional minus sign, digits and
# an optional fractional part after "." or ","; surrounding whitespace allowed.
NUMBER_FULL = re.compile(r"^\s*-?\d+(?:[.,]\d+)?\s*$")
# Same number syntax without anchors, for locating a number inside longer text.
NUMBER_ANY  = re.compile(r"-?\d+(?:[.,]\d+)?")

def is_number_string(s: str) -> bool:
    """Return True when ``s`` is nothing but a single number (int or decimal)."""
    candidate = str(s)
    return NUMBER_FULL.match(candidate) is not None

def extract_number(text: str):
    """Return the first number found in ``text`` as a float, or None if there is none."""
    if text is None:
        return None

    found = NUMBER_ANY.search(str(text))
    if found is None:
        return None

    # Accept a decimal comma by converting it to a dot before parsing.
    literal = found.group(0).replace(",", ".")
    try:
        value = float(literal)
    except Exception:
        return None
    return value

def normalize_text(s: str) -> str:
    """Normalize text for exact, non-numeric comparison.

    Falsy input becomes an empty string; otherwise the value is stringified,
    whitespace runs are collapsed to one space, and the result is stripped and
    lower-cased.
    """
    raw = str(s) if s else ""
    single_spaced = re.sub(r"\s+", " ", raw)
    return single_spaced.strip().lower()

def ask_and_score_math(
    items,
    *,
    system_prompt="Responda somente com a resposta final.",
    max_new_tokens=64,
    tol=1e-2,
    verbose=False,
):
    """Ask the loaded model each question and score its answers.

    Uses the notebook globals ``tok``, ``model``, ``model_family`` and
    ``MODEL_ID``.

    Scoring:
        * If the gold answer is a pure number, the prediction is correct when
          any number appearing in the model text is within ``tol`` of it.
        * Otherwise the normalized model text must equal the normalized gold.

    Args:
        items: Sequence of dicts with keys ``"q"`` (question) and ``"a"`` (gold).
        system_prompt: System instruction passed to the prompt template.
        max_new_tokens: Generation budget per question.
        tol: Absolute tolerance for numeric answers.
        verbose: When True, print every item (and an extra block for misses).

    Returns:
        ``(summary, df)`` where ``summary`` is a dict of aggregate metrics and
        ``df`` is a DataFrame with one row per item.
    """
    logs = []
    acertos = 0

    for idx, item in enumerate(items, 1):
        q = item.get("q", "")
        a = str(item.get("a", "")).strip()

        prompt = format_chat_inst(model_family, system_prompt, q)

        out = generate_and_measure(tok, model, prompt, max_new_tokens=max_new_tokens)
        model_text = (out.get("text") or "").strip()

        if is_number_string(a):
            gold = float(a.replace(",", "."))
            # Compare parsed numbers instead of substrings: a substring test on
            # a truncated gold would accept "15" for gold 5 or "3" for gold 3.5.
            candidates = (
                float(m.group(0).replace(",", "."))
                for m in NUMBER_ANY.finditer(model_text)
            )
            correct = any(abs(pred - gold) <= tol for pred in candidates)
        else:
            correct = normalize_text(model_text) == normalize_text(a)

        if verbose and not correct:
            print(f"[❌] Q: {q}")
            print(f"    Pred: {model_text}")
            print(f"    Gold: {a}")

        acertos += int(correct)

        logs.append({
            "idx": idx,
            "question": q,
            "gold": a,
            "prediction": model_text,
            "correct": int(correct),
            "latency_s": out.get("latency_s"),
            "tokens_per_s": out.get("tokens_per_s"),
            "vram_gb_peak": out.get("vram_gb_peak"),
        })

        if verbose:
            print(f"[{idx}] OK={correct} | Q: {q}\n  pred: {model_text}\n  gold: {a}\n")

    # Avoid a division by zero when there are no items.
    total = len(items) or 1
    acc = acertos / total

    summary = {
        "model_id": MODEL_ID,
        "n": len(items),
        "acertos": acertos,
        "accuracy": acc,
        "latency_avg_s": pd.Series([r["latency_s"] for r in logs]).mean(),
        "tps_avg": pd.Series([r["tokens_per_s"] for r in logs]).mean(),
        "vram_peak_max_gb": pd.Series([r["vram_gb_peak"] for r in logs]).max(),
    }

    df = pd.DataFrame(logs)
    return summary, df

# Load the math evaluation set and score the currently loaded model on it.
with open("../data/qa_math_eval.json", "r", encoding="utf-8") as f:
    math_items = json.load(f)   # list of {"q","a"}

summary, df_logs = ask_and_score_math(math_items, tol=1e-2, verbose=True)

# Print a rounded copy of the summary; `or 0` replaces None with 0 before rounding.
print("== Resumo Math ==")
print({
    "model_id": summary["model_id"],
    "n": summary["n"],
    "acertos": summary["acertos"],
    "accuracy": round(summary["accuracy"], 3),
    "latency_avg_s": round(float(summary["latency_avg_s"] or 0), 3),
    "tps_avg": round(float(summary["tps_avg"] or 0), 2),
    "vram_peak_max_gb": round(float(summary["vram_peak_max_gb"] or 0), 2),
})

# Make sure the output directory exists before writing the CSV files.
os.makedirs("../results", exist_ok=True)

# File names use the part of MODEL_ID after the last "/".
per_model_slug = MODEL_ID.split("/")[-1]
df_logs.to_csv(f"../results/{per_model_slug}_math_logs.csv", index=False)

# One-row quality table; quality_fact is left empty and quality_score is set
# to the math accuracy.
dfq = pd.DataFrame([{
    "model_id": MODEL_ID,
    "quality_math": summary["accuracy"],
    "quality_fact": None,
    "quality_score": summary["accuracy"]
}])
dfq.to_csv(f"../results/{per_model_slug}_quality.csv", index=False)
