# Notebook Description

- O objetivo desse notebook é testar o uso de recursos para carregamento e inferência em LLMs locais.
- Busca-se coletar métricas de CPU e RAM para modelos de diferentes famílias (Llama, TinyLlama, Qwen, Gemma e Mistral).
- Espera-se que algumas versões dos modelos dessas famílias retornem erro devido à limitação de RAM.
- A máquina utilizada para o teste possui 32 GB de RAM e uma CPU Intel Core i5-13600K (14 núcleos/20 threads).
- As métricas são salvas em arquivos CSV, possibilitando análise posterior.

**Observações**
- É coletado o uso absoluto do sistema, ou seja, não é retirado o que já estava em uso no sistema antes do teste
- Há um overhead causado pelo ResourceMonitor (o que também ocorrerá na ferramenta final)

## Load Libraries

In [1]:
import os, gc, time, threading, psutil, torch, dotenv
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, logging
from huggingface_hub.utils import disable_progress_bars
from typing import List, Dict, Any

## Global Settings

In [None]:
# Configure the execution environment: CPU only, fixed thread count.

CPU_THREADS = 12  # Threads used for CPU parallelism

os.environ["OMP_NUM_THREADS"]      = str(CPU_THREADS)  # Thread count for OpenMP
os.environ["MKL_NUM_THREADS"]      = str(CPU_THREADS)  # Thread count for MKL
os.environ["CUDA_VISIBLE_DEVICES"] = ""                # Ensures the processes do not use a GPU

# torch has already been imported by the time this cell runs, and the OpenMP/MKL
# runtimes typically read the variables above only when they are first loaded,
# so the assignments alone may have no effect on this process. Setting the limit
# through the PyTorch API applies it regardless of import order.
torch.set_num_threads(CPU_THREADS)

In [None]:
# Silence verbose output from Transformers, PyTorch and Hugging Face.

# Hugging Face Hub: turn the progress bars off, both through the environment
# variable and through the library call
os.environ.update(HF_HUB_DISABLE_PROGRESS_BARS="1")
disable_progress_bars()

# Transformers: no progress bar, and only error-level log messages
logging.disable_progress_bar()
logging.set_verbosity_error()

In [4]:
# Hugging Face token setup.

from huggingface_hub import HfFolder

# Load the environment variables defined in the .env file
dotenv.load_dotenv("../../env/.env")

# The token must be present and non-empty, otherwise nothing below can work
access_token = os.environ.get("HF_TOKEN")
if access_token is None or access_token == "":
    raise RuntimeError("HF_TOKEN não definido!")

# Store the token in the Hugging Face cache
HfFolder.save_token(access_token)

In [5]:
# Prompt and benchmark parameters.

# The two halves are joined with a single space to form the full prompt
PROMPT: str = " ".join([
    "Liste e descreva brevemente as principais vantagens e desvantagens",
    "de usar arquitetura de microsserviços em aplicações nativas na nuvem.",
])

# Maximum number of tokens to generate
MAX_NEW_TOKENS: int = 200
# Number of repetitions per model
NUM_REPLICAS: int = 5
# Seconds between CPU/RAM samples
SAMPLE_INTERVAL: float = 0.1

In [None]:
# Models under test, grouped by family.
# Commented-out entries kill the kernel on the specified machine.

_MODELS_BY_FAMILY = {
    "Llama": [
        "meta-llama/Llama-3.2-1B",
        # "meta-llama/Llama-3.2-3B",
        "meta-llama/Llama-3.1-8B",
        "meta-llama/Llama-2-7b-hf",
        "meta-llama/Llama-2-13b-hf",
    ],
    "TinyLlama": [
        "TinyLlama/TinyLlama_v1.1",
    ],
    "Qwen": [
        "Qwen/Qwen3-0.6B",
        "Qwen/Qwen3-1.7B",
        # "Qwen/Qwen3-4B",
        "Qwen/Qwen3-8B",
        "Qwen/Qwen2.5-0.5B",
        # "Qwen/Qwen2.5-3B",
        "Qwen/Qwen2.5-7B",
        "Qwen/Qwen2.5-14B",
    ],
    "Gemma": [
        "google/gemma-3-1b-it",
        # "google/gemma-3-4b-it",
        "google/gemma-3-12b-it",
    ],
    "Mistral": [
        "mistralai/Mistral-7B-Instruct-v0.3",
    ],
}

# Flat list in family order; this is what the benchmark loop iterates over
MODEL_NAMES = [
    name
    for family_models in _MODELS_BY_FAMILY.values()
    for name in family_models
]

## Resource Monitoring Module

In [None]:
class ResourceMonitor:
    """Sample CPU and RAM usage in a background thread.

    Every sample is a dict with the keys ``time`` (a ``time.perf_counter()``
    reading), ``cpu_proc_pct`` and ``mem_proc_mb`` (current process) and
    ``cpu_sys_pct`` and ``mem_sys_mb`` (whole system). Memory values are
    bytes divided by 1024**2.

    Typical use: ``start()``, run the workload, then ``stop()``. The samples
    are returned by ``stop()`` and are also available in ``samples``.

    Args:
        interval: Seconds between two consecutive samples.
    """

    def __init__(self, interval: float = 0.1):
        self.interval = interval
        self._stop_event = threading.Event()
        # The worker thread is created by start(); None means "never started".
        self._thread = None
        self.samples: List[Dict[str, Any]] = []

    def _run(self):
        """Worker loop: take one sample per interval plus a final one on stop."""
        proc = psutil.Process()

        # Prime the CPU counters here, on the same Process instance and in the
        # same thread that takes the samples. Process.cpu_percent(None) keeps
        # its reference point on the instance (and recent psutil versions keep
        # the system-wide one per calling thread), so priming anywhere else
        # would leave the first reading meaningless.
        proc.cpu_percent(None)
        psutil.cpu_percent(None)

        stopping = False
        while not stopping:
            # Event.wait() returns as soon as stop() is called, so stop() does
            # not have to sit out the rest of a sleep. It returns True in that
            # case, and one last sample is still taken so that even a very
            # short measurement yields at least one sample.
            stopping = self._stop_event.wait(self.interval)

            # Per process
            cpu_proc = proc.cpu_percent(None)
            mem_proc = proc.memory_info().rss / 1024**2
            # Whole system
            cpu_sys = psutil.cpu_percent(None)
            mem_sys = psutil.virtual_memory().used / 1024**2

            self.samples.append({
                "time": time.perf_counter(),
                "cpu_proc_pct": cpu_proc,
                "mem_proc_mb": mem_proc,
                "cpu_sys_pct": cpu_sys,
                "mem_sys_mb": mem_sys,
            })

    def start(self):
        """Discard previous samples and start sampling in a new daemon thread."""
        # Restarting a running monitor must not leave the old worker behind.
        if self._thread is not None and self._thread.is_alive():
            self.stop()
        self.samples = []
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling, wait for the worker to finish and return the samples.

        Calling this on a monitor that was never started is allowed and
        returns the (empty) sample list.
        """
        self._stop_event.set()
        if self._thread is not None and self._thread.is_alive():
            self._thread.join()
        return self.samples

## Auxiliary Functions

In [8]:

def load_model(model_name: str, sample_interval: float = 0.05) -> Dict[str, Any]:
    """Load a tokenizer and a causal LM on CPU while recording resource usage.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        sample_interval: Seconds between resource samples.

    Returns:
        A dict with the ``tokenizer``, the ``model``, ``load_time_s``, the peak
        values ``load_peak_cpu_proc_pct``, ``load_peak_ram_proc_mb``,
        ``load_peak_cpu_sys_pct`` and ``load_peak_ram_sys_mb``, and the raw
        ``load_samples``. Peaks are NaN if no sample was collected.

    Raises:
        Whatever ``from_pretrained`` raises; the monitor is stopped first.
    """
    # Start resource monitoring
    monitor = ResourceMonitor(interval=sample_interval)
    t0 = time.perf_counter()
    monitor.start()

    try:
        # Load the tokenizer. force_download=True is passed, so the measured
        # time presumably includes the download and not only the load.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            force_download=True,
            token=access_token,
        )

        # Load the model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            force_download=True,
            low_cpu_mem_usage=True,
            token=access_token,
            device_map="cpu"
        )

        # Read the clock before stopping the monitor so that the time spent
        # shutting the sampler down is not counted as load time.
        elapsed = time.perf_counter() - t0
    finally:
        # Always stop the sampler. A failed load would otherwise leave the
        # monitor thread running, where it keeps sampling and adds overhead
        # to every measurement taken afterwards.
        monitor.stop()

    # Add a padding token if the tokenizer has none
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))

    # Set a truncation length if none (or an implausibly large one) is defined
    if tokenizer.model_max_length is None or tokenizer.model_max_length > 10000:
        tokenizer.model_max_length = 512

    # Reduce the samples to their peaks; default= avoids a ValueError from
    # max() in the unlikely case that no sample was recorded.
    samples = monitor.samples
    nan = float("nan")
    cpu_proc_max = max((s["cpu_proc_pct"] for s in samples), default=nan)
    mem_proc_max = max((s["mem_proc_mb"]  for s in samples), default=nan)
    cpu_sys_max  = max((s["cpu_sys_pct"]  for s in samples), default=nan)
    mem_sys_max  = max((s["mem_sys_mb"]   for s in samples), default=nan)

    return {
        "tokenizer": tokenizer,
        "model": model,
        "load_time_s": elapsed,
        "load_peak_cpu_proc_pct": cpu_proc_max,
        "load_peak_ram_proc_mb": mem_proc_max,
        "load_peak_cpu_sys_pct": cpu_sys_max,
        "load_peak_ram_sys_mb": mem_sys_max,
        "load_samples": samples,
    }


def run_test(model, tokenizer, input_ids, attention_mask,
             max_new_tokens: int, sample_interval: float) -> Dict[str, Any]:
    """Run one monitored text generation and return timing and resource metrics.

    A short, unmonitored warm-up generation (24 new tokens) is executed on
    every call before the measured one.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer; only its ``pad_token_id`` is read here.
        input_ids: Prompt token ids passed to ``generate``.
        attention_mask: Attention mask passed to ``generate``.
        max_new_tokens: Generation limit for the measured run.
        sample_interval: Seconds between resource samples.

    Returns:
        A dict with ``tokens``, ``time_s``, ``latency_ms_per_token``,
        ``throughput_tps``, the peak values ``inf_peak_cpu_proc_pct``,
        ``inf_peak_ram_proc_mb``, ``inf_peak_cpu_sys_pct`` and
        ``inf_peak_ram_sys_mb``, and the raw ``inf_samples``. Latency is NaN
        when no token was generated; peaks are NaN if no sample was collected.
    """
    # Warm-up
    with torch.no_grad():
        model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=24,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Start resource monitoring
    monitor = ResourceMonitor(interval=sample_interval)
    t0 = time.perf_counter()
    monitor.start()

    try:
        # Run the measured inference
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Read the clock before stopping the monitor so that the time spent
        # shutting the sampler down is not counted as inference time.
        elapsed = time.perf_counter() - t0
    finally:
        # Stop the sampler even if generation fails, so no monitor thread is
        # left running in the background.
        monitor.stop()

    # Reduce the samples to their peaks; default= avoids a ValueError from
    # max() in the unlikely case that no sample was recorded.
    samples = monitor.samples
    nan = float("nan")
    cpu_proc_max = max((s["cpu_proc_pct"] for s in samples), default=nan)
    mem_proc_max = max((s["mem_proc_mb"]  for s in samples), default=nan)
    cpu_sys_max  = max((s["cpu_sys_pct"]  for s in samples), default=nan)
    mem_sys_max  = max((s["mem_sys_mb"]   for s in samples), default=nan)

    # Newly generated tokens = output length minus prompt length
    total_tokens = outputs.shape[-1] - input_ids.shape[-1]

    # Sampling can end immediately, giving zero new tokens; report NaN rather
    # than raising ZeroDivisionError and losing the whole run.
    latency_ms = elapsed / total_tokens * 1e3 if total_tokens > 0 else nan
    throughput = total_tokens / elapsed if elapsed > 0 else nan

    return {
        "tokens": total_tokens,
        "time_s": elapsed,
        "latency_ms_per_token": latency_ms,
        "throughput_tps": throughput,
        "inf_peak_cpu_proc_pct": cpu_proc_max,
        "inf_peak_ram_proc_mb": mem_proc_max,
        "inf_peak_cpu_sys_pct": cpu_sys_max,
        "inf_peak_ram_sys_mb": mem_sys_max,
        "inf_samples": samples,
    }

## Generate Benchmark

In [None]:
# Benchmark loop: measure CPU and RAM usage for every model.
# Errors while loading some models are expected and must not interrupt the loop.

load_results = []
inf_results  = []

# Keys copied from load_model()'s result into the load metrics table
LOAD_METRIC_KEYS = (
    "load_time_s",
    "load_peak_cpu_proc_pct",
    "load_peak_ram_proc_mb",
    "load_peak_cpu_sys_pct",
    "load_peak_ram_sys_mb",
    "load_samples",
)

for model_name in MODEL_NAMES:
    print(f"\n==== Modelo: {model_name} ====")

    # Try to load the model; on failure, skip to the next one.
    # MemoryError is handled as well, since running out of RAM is the
    # expected failure mode here.
    loaded = None
    try:
        print("Carregando modelo...")
        loaded = load_model(model_name, sample_interval=SAMPLE_INTERVAL)
    except (OSError, RuntimeError, MemoryError) as e:
        print(f"Erro ao carregar {model_name}: {e}\n")

    if loaded is None:
        # Collect only after leaving the except block: while the exception is
        # still bound, its traceback keeps the failed load's objects alive.
        # Freeing them now stops one failed load from eating into the memory
        # available to the next model.
        gc.collect()
        continue

    # Unpack tokenizer and model (the model loaded without errors)
    tokenizer, model = loaded["tokenizer"], loaded["model"]

    # Store the load metrics
    load_results.append({
        "model": model_name,
        **{key: loaded[key] for key in LOAD_METRIC_KEYS},
    })

    # Tokenize the prompt
    print("Tokenizando prompt...")
    inputs         = tokenizer(PROMPT, return_tensors="pt", padding=True, truncation=True)
    input_ids      = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Run the replicas and store the metrics of each one
    print("Executando teste...")
    for run in range(NUM_REPLICAS):
        print(f" Run {run}/{NUM_REPLICAS-1}...")

        # A failing replica is reported and skipped; the others still run.
        try:
            res = run_test(
                model, tokenizer,
                input_ids, attention_mask,
                MAX_NEW_TOKENS, SAMPLE_INTERVAL
            )
        except Exception as e:
            print(f"Erro durante a inferência: {e}")
            continue

        res.update({"model": model_name, "run": run})
        inf_results.append(res)

    # Clean up before the next model
    del model, tokenizer, loaded
    gc.collect()
    print("Limpeza de memória concluída. Iniciando próximo modelo...\n")


==== Modelo: meta-llama/Llama-3.2-1B ====
Carregando modelo...
Tokenizando prompt...
Executando teste...
 Run 0/4...
 Run 1/4...
 Run 2/4...
 Run 3/4...
 Run 4/4...
Limpeza de memória concluída. Iniciando próximo modelo...


==== Modelo: meta-llama/Llama-3.1-8B ====
Carregando modelo...
Erro ao carregar meta-llama/Llama-3.1-8B: [enforce fail at alloc_cpu.cpp:119] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 32121044992 bytes. Error code 12 (Cannot allocate memory)


==== Modelo: meta-llama/Llama-2-7b-hf ====
Carregando modelo...
Erro ao carregar meta-llama/Llama-2-7b-hf: [enforce fail at alloc_cpu.cpp:119] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 26953662464 bytes. Error code 12 (Cannot allocate memory)


==== Modelo: meta-llama/Llama-2-13b-hf ====
Carregando modelo...
Erro ao carregar meta-llama/Llama-2-13b-hf: [enforce fail at alloc_cpu.cpp:119] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to alloca

## Analyse Results

In [10]:
# Turn the collected results into DataFrames and aggregate them per model.

df_load = pd.DataFrame(load_results)
df_inf  = pd.DataFrame(inf_results)

# Load summary: number of rows per model plus the mean of every load metric
_load_metrics = [
    "load_time_s",
    "load_peak_cpu_proc_pct",
    "load_peak_ram_proc_mb",
    "load_peak_cpu_sys_pct",
    "load_peak_ram_sys_mb",
]
_load_spec = {"runs": ("model", "count")}
for _metric in _load_metrics:
    _load_spec[f"avg_{_metric}"] = (_metric, "mean")

_load_grouped = df_load.groupby("model")
summary_load = _load_grouped.agg(**_load_spec).reset_index()

# Inference summary: number of runs per model plus mean and standard
# deviation of every inference metric (avg column first, then std)
_inf_metrics = [
    "latency_ms_per_token",
    "throughput_tps",
    "inf_peak_cpu_proc_pct",
    "inf_peak_ram_proc_mb",
    "inf_peak_cpu_sys_pct",
    "inf_peak_ram_sys_mb",
]
_inf_spec = {"runs": ("run", "count")}
for _metric in _inf_metrics:
    _inf_spec[f"avg_{_metric}"] = (_metric, "mean")
    _inf_spec[f"std_{_metric}"] = (_metric, "std")

_inf_grouped = df_inf.groupby("model")
summary_inf = _inf_grouped.agg(**_inf_spec).reset_index()

In [11]:
# Export the results to CSV.

BENCHMARK_DIR = "../../data/raw/benchmarks"

# Create the output directory if needed: without it to_csv() fails, and the
# results of the whole benchmark run would not be saved.
os.makedirs(BENCHMARK_DIR, exist_ok=True)

# (DataFrame, file name) pairs, written in this order
_exports = (
    (df_load,      "benchmark_carregamento.csv"),
    (df_inf,       "benchmark_inferencia.csv"),
    (summary_inf,  "benchmark_inferencia_resumo.csv"),
    (summary_load, "benchmark_carregamento_resumo.csv"),
)

for _frame, _filename in _exports:
    _frame.to_csv(f"{BENCHMARK_DIR}/{_filename}", index=False)