## Load Libraries

In [1]:
import gc
import os
import time
from threading import Event, Thread
from typing import Any, Dict

import pandas as pd
import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

## Global Settings

In [2]:
# Configure the number of CPU threads used for inference.
#
# torch was already imported in the first cell, so its intra-op thread pool has
# been sized before these environment variables are written; setting them here
# only affects libraries initialised later (and child processes). The explicit
# torch.set_num_threads call is what actually applies the limit to this kernel.

NUM_THREADS = 20

os.environ["OMP_NUM_THREADS"] = str(NUM_THREADS)
os.environ["MKL_NUM_THREADS"] = str(NUM_THREADS)
torch.set_num_threads(NUM_THREADS)

In [3]:
# Benchmark prompt and generation/sampling parameters.

PROMPT = " ".join([
    "Liste e descreva brevemente as principais vantagens e desvantagens",
    "de usar arquitetura de microsserviços em aplicações nativas na nuvem.",
])

# Number of tokens requested from each measured generation.
MAX_NEW_TOKENS = 250
# How many times the inference test is repeated for each model.
NUM_REPLICAS = 5
# Seconds between consecutive CPU/RAM samples.
SAMPLE_INTERVAL = 0.1

In [4]:
# Hugging Face model identifiers to benchmark, in execution order.
# "meta-llama/Llama-2-7b-hf" is currently disabled; add it to the list to include it.

MODEL_NAMES = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]

## Auxiliary Functions

In [5]:
def load_model(model_name: str, sample_interval: float = 0.05, force_download: bool = True) -> Dict[str, Any]:
    """
    Load a tokenizer and causal-LM on CPU while sampling this process's CPU/RAM.

    A background thread samples CPU and resident memory every
    ``sample_interval`` seconds *while* the load is running, so transient peaks
    are observed. One closing sample is always taken after the load, which
    guarantees at least one data point regardless of how fast the load was or
    how large ``sample_interval`` is.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        sample_interval: Seconds between CPU/RAM samples; must be > 0.
        force_download: Forwarded to both ``from_pretrained`` calls. The default
            (True) re-downloads the files on every call, so ``load_time_s`` then
            includes network transfer time; pass False to time a cached load.

    Returns:
        A dict with keys:
          - "tokenizer": the loaded tokenizer
          - "model": the loaded model, switched to eval mode
          - "load_time_s": wall-clock seconds spent in the load calls
          - "load_peak_cpu_pct": highest sampled process CPU percentage
          - "load_peak_ram_mb": highest sampled RSS minus the RSS before the
            load, in MB

    Raises:
        ValueError: if ``sample_interval`` is not positive.
    """
    if sample_interval <= 0:
        raise ValueError("sample_interval must be a positive number of seconds")

    proc = psutil.Process()

    # Prime the CPU counter: with interval=None each call reports usage since
    # the previous call, so this call only establishes the reference point.
    proc.cpu_percent(None)
    mem0 = proc.memory_info().rss / 1024**2

    samples = []
    stop_sampling = Event()

    def _sample_until_stopped():
        # Event.wait doubles as an interruptible sleep: it returns False on
        # timeout (take a sample) and True once the main thread signals stop.
        while not stop_sampling.wait(sample_interval):
            samples.append((proc.cpu_percent(None), proc.memory_info().rss / 1024**2))

    sampler = Thread(target=_sample_until_stopped, daemon=True)

    t0 = time.perf_counter()
    sampler.start()
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=force_download)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, force_download=force_download, device_map="cpu"
        )
        model.eval()
        elapsed = time.perf_counter() - t0
    finally:
        # Always stop the sampler, even if loading failed, so no thread leaks.
        stop_sampling.set()
        sampler.join()

    # Closing sample: covers the time since the last periodic sample and
    # records the memory held once the model is fully loaded.
    samples.append((proc.cpu_percent(None), proc.memory_info().rss / 1024**2))

    cpus, mems = zip(*samples)
    return {
        "tokenizer": tokenizer,
        "model": model,
        "load_time_s": elapsed,
        "load_peak_cpu_pct": max(cpus),
        "load_peak_ram_mb": max(mems) - mem0,
    }


def run_test(model, tokenizer, input_ids, attention_mask, MAX_NEW_TOKENS, SAMPLE_INTERVAL) -> Dict[str, Any]:
    """
    Run one measured generation and collect timing and resource-usage metrics.

    A short warm-up generation runs first and is not measured. The measured
    generation then runs in a worker thread while the calling thread samples
    this process's CPU and resident memory roughly every ``SAMPLE_INTERVAL``
    seconds.

    Args:
        model: object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: object exposing ``eos_token_id`` (used as the pad token id).
        input_ids: prompt token ids; the last dimension is the sequence length.
        attention_mask: attention mask matching ``input_ids``.
        MAX_NEW_TOKENS: ``max_new_tokens`` for the measured generation.
        SAMPLE_INTERVAL: seconds between CPU/RAM samples.

    Returns:
        A dict with keys:
          - "tokens": number of generated tokens (output length minus prompt length)
          - "time_s": wall-clock seconds from worker start to end of generation
          - "latency_ms_per_token": milliseconds per generated token (NaN if none)
          - "throughput_tps": generated tokens per second (NaN if time is zero)
          - "inf_peak_cpu_pct": highest sampled process CPU percentage
          - "inf_peak_ram_mb": highest sampled RSS minus the RSS before generation, in MB
          - "samples": list of {"time", "cpu", "mem"} dicts, one per sample

    Raises:
        Whatever exception ``model.generate`` raised inside the worker thread.
    """
    # Warm-up with the same mask/padding settings as the measured call, so it
    # exercises the same code path and does not emit missing-mask warnings.
    with torch.no_grad():
        model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=24,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    proc = psutil.Process()
    # Prime the CPU counter: with interval=None each later call reports usage
    # since the previous call.
    proc.cpu_percent(None)
    mem0 = proc.memory_info().rss / 1024**2

    results = {}

    def gen():
        try:
            with torch.no_grad():
                results["outputs"] = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=MAX_NEW_TOKENS,
                    do_sample=False,
                    eos_token_id=None,
                    pad_token_id=tokenizer.eos_token_id,
                )
        except BaseException as exc:  # re-raised in the calling thread below
            results["error"] = exc
        finally:
            # Stamp the end time inside the worker so the reported duration
            # does not include the sampler's wake-up lag.
            results["t_end"] = time.perf_counter()

    thread = Thread(target=gen, daemon=True)
    t0 = time.perf_counter()
    thread.start()

    samples = []
    while True:
        # join(timeout) is the sampling sleep; it returns early as soon as the
        # generation finishes, so the loop never overshoots by a full interval.
        thread.join(timeout=SAMPLE_INTERVAL)
        samples.append({
            "time": time.perf_counter() - t0,
            "cpu": proc.cpu_percent(None),
            "mem": proc.memory_info().rss / 1024**2,
        })
        # Sampling before the liveness check guarantees at least one sample,
        # even when generation ends before the first interval elapses.
        if not thread.is_alive():
            break

    if "error" in results:
        raise results["error"]

    outputs = results["outputs"]
    total_tokens = outputs.shape[-1] - input_ids.shape[-1]
    elapsed = results["t_end"] - t0

    cpus = [s["cpu"] for s in samples]
    mems = [s["mem"] for s in samples]

    return {
        "tokens": total_tokens,
        "time_s": elapsed,
        "latency_ms_per_token": elapsed / total_tokens * 1e3 if total_tokens > 0 else float("nan"),
        "throughput_tps": total_tokens / elapsed if elapsed > 0 else float("nan"),
        "inf_peak_cpu_pct": max(cpus),
        "inf_peak_ram_mb": max(mems) - mem0,
        "samples": samples,
    }

## Generate Benchmark

In [6]:
# Benchmark loop: for every model, measure the load once and then run
# NUM_REPLICAS measured inferences, collecting CPU/RAM/time metrics.

load_results = []
inf_results  = []

for model_name in MODEL_NAMES:
    print(f"\n==== Modelo: {model_name} ====")

    # Drop references to the previously benchmarked model before loading the
    # next one; otherwise its weights stay resident and distort the RAM
    # baseline (and may exhaust memory) for every model after the first.
    # The last model stays bound after the loop for later inspection.
    loaded_model = tokenizer = model = None
    gc.collect()

    # Load the model and tokenizer
    print("Carregando modelo...")
    loaded_model = load_model(model_name, sample_interval=SAMPLE_INTERVAL)
    tokenizer, model = loaded_model["tokenizer"], loaded_model["model"]

    # Record the load metrics
    load_results.append({
        "model": model_name,
        "load_time_s": loaded_model["load_time_s"],
        "load_peak_cpu_pct": loaded_model["load_peak_cpu_pct"],
        "load_peak_ram_mb": loaded_model["load_peak_ram_mb"],
    })

    # Tokenize the prompt. No padding is requested: a single sequence never
    # needs it, and asking for it fails on tokenizers without a pad token.
    print("Tokenizando prompt...")
    inputs         = tokenizer(PROMPT, return_tensors="pt", truncation=True)
    input_ids      = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Run the measured inferences for this model
    print("Executando teste...")
    for run in range(NUM_REPLICAS):
        print(f" Run {run}/{NUM_REPLICAS-1}...")
        res = run_test(model, tokenizer, input_ids, attention_mask, MAX_NEW_TOKENS, SAMPLE_INTERVAL)
        res.update({"model": model_name, "run": run})
        inf_results.append(res)


==== Modelo: TinyLlama/TinyLlama-1.1B-Chat-v1.0 ====
Carregando modelo...


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Tokenizando prompt...
Executando teste...
 Run 0/4...
 Run 1/4...
 Run 2/4...
 Run 3/4...
 Run 4/4...


## Analyse Results

In [7]:
# Turn the collected metrics into DataFrames and build per-model summaries.

df_load = pd.DataFrame(load_results)
df_inf  = pd.DataFrame(inf_results)

# Output column -> (source column, aggregation) for the load metrics.
load_aggregations = {
    "runs": ("model", "count"),
    "avg_load_time_s": ("load_time_s", "mean"),
    "std_load_time_s": ("load_time_s", "std"),
    "avg_peak_cpu_pct": ("load_peak_cpu_pct", "mean"),
    "avg_peak_ram_mb": ("load_peak_ram_mb", "mean"),
}
load_by_model = df_load.groupby("model")
summary_load = load_by_model.agg(**load_aggregations).reset_index()

# Output column -> (source column, aggregation) for the inference metrics.
inference_aggregations = {
    "runs": ("run", "count"),
    "avg_latency_ms_tok": ("latency_ms_per_token", "mean"),
    "std_latency_ms_tok": ("latency_ms_per_token", "std"),
    "avg_tps": ("throughput_tps", "mean"),
    "std_tps": ("throughput_tps", "std"),
    "avg_inf_peak_cpu": ("inf_peak_cpu_pct", "mean"),
    "avg_inf_peak_ram_mb": ("inf_peak_ram_mb", "mean"),
}
inference_by_model = df_inf.groupby("model")
summary_inf = inference_by_model.agg(**inference_aggregations).reset_index()

In [None]:
# Export the results to CSV.
# The output directory is created first so a missing folder cannot discard
# the benchmark results at the very last step.

BENCHMARK_DIR = "../../data/raw/benchmarks"
os.makedirs(BENCHMARK_DIR, exist_ok=True)

df_load.to_csv(f"{BENCHMARK_DIR}/benchmark_carregamento.csv", index=False)
df_inf.to_csv(f"{BENCHMARK_DIR}/benchmark_inferencia.csv", index=False)
summary_inf.to_csv(f"{BENCHMARK_DIR}/benchmark_inferencia_resumo.csv", index=False)