# Notebook Description

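- Benchmarks CPU-only LoRA fine-tuning of causal language models on the AESLC e-mail dataset, recording training time, peak process CPU usage and peak process RAM for each model.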

## Load Libraries

In [None]:
import os, time, threading, psutil, gc, dotenv
from typing import List, Dict, Any

import pandas as pd

from huggingface_hub import HfFolder
from datasets import load_dataset

from peft import LoraConfig, get_peft_model, TaskType

## Global Settings

In [None]:
# Configure the environment for CPU-only execution.
# NOTE: these variables are assigned before `torch` is imported below;
# presumably so the native libraries read them at initialisation — keep this
# ordering when editing the cell.

os.environ["OMP_NUM_THREADS"] = "18"  # Number of threads for OpenMP
os.environ["MKL_NUM_THREADS"] = "18"  # Number of threads for MKL
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide every GPU so no process uses one

import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    logging,
)

# Reuse the OpenMP setting so torch's thread count matches it.
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))

# Print whether torch can see a CUDA device after the settings above.
print("CUDA disponível?", torch.cuda.is_available())

In [None]:
# Suprime verbose do Transformers, PyTorch e Hugging Face

# Desabilita as barras de progresso do Hugging Face
# os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
# logging.set_verbosity_error()
# logging.disable_progress_bar()

In [None]:
# Hugging Face authentication.

# Read the variables declared in the project's .env file into the environment.
dotenv.load_dotenv("../../env/.env")

# Fail fast when the token is missing or empty.
access_token = os.environ.get("HF_TOKEN")
if access_token is None or access_token == "":
    raise RuntimeError("HF_TOKEN não definido!")

# Store the token in the Hugging Face cache.
HfFolder.save_token(access_token)

In [None]:
# Fine-tuning configuration.

# Base models to fine-tune, one benchmark run per entry.
MODEL_NAMES = ["meta-llama/Llama-3.2-1B"]

# Fine-tuning parameters.
BATCH_SIZE = 8  # per-device training batch size
EPOCHS = 1  # number of training epochs
BLOCK_SIZE = 128  # tokens per training block (also the truncation length)
SAMPLE_INTERVAL = 0.1  # seconds between resource-monitor samples

## Resource Monitoring Module

In [None]:
class ResourceMonitor:
    """Sample CPU and memory usage of the current process on a background thread.

    Call ``start()`` to begin sampling and ``stop()`` to end it and obtain the
    collected samples. Each sample is a dict with the keys:

    * ``"time"``: ``time.perf_counter()`` value when the sample was taken.
    * ``"cpu_proc_pct"``: value returned by ``Process.cpu_percent(None)``.
    * ``"mem_proc_mb"``: process RSS in MiB minus the RSS measured by
      ``start()``, clamped at ``0.0``.

    Args:
        interval: Seconds to wait between two consecutive samples.
    """

    def __init__(self, interval: float = 0.1):
        self.interval = interval
        self._stop_event = threading.Event()
        # No thread exists until start() is called, so stop() can tell
        # "never started" apart from "running" and does not try to join a
        # thread that was never started (which raises RuntimeError).
        self._thread = None
        # psutil handle shared by start() and _run(); created in start().
        self._proc = None
        self.samples: List[Dict[str, Any]] = []
        self._baseline_mem_mb = 0.0

    def _run(self):
        """Sampling loop executed on the background thread until stop() is called."""
        # Use the handle primed by start(): cpu_percent(None) measures against
        # the previous call on the *same* Process object.
        proc = self._proc if self._proc is not None else psutil.Process()
        while not self._stop_event.is_set():
            # Per-process figures
            raw_mem = proc.memory_info().rss / 1024**2
            mem_proc = max(raw_mem - self._baseline_mem_mb, 0.0)
            cpu_proc = proc.cpu_percent(None)
            timestamp = time.perf_counter()

            self.samples.append(
                {
                    "time": timestamp,
                    "cpu_proc_pct": cpu_proc,
                    "mem_proc_mb": mem_proc,
                }
            )
            # Wait on the event rather than sleeping so stop() wakes the
            # thread immediately instead of waiting out a full interval.
            self._stop_event.wait(self.interval)

    def start(self):
        """Reset the collected samples and start sampling in a daemon thread.

        If a previous sampling thread is still running it is stopped first,
        so two samplers never append to the same list.
        """
        if self._thread is not None:
            self.stop()
        self.samples = []
        self._stop_event.clear()
        self._proc = psutil.Process()
        # Prime the CPU counter so the first sample has a reference point.
        self._proc.cpu_percent(None)
        self._baseline_mem_mb = self._proc.memory_info().rss / 1024**2
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling, wait for the thread to finish and return the samples.

        Safe to call before ``start()`` or more than once; in those cases the
        current (possibly empty) sample list is returned.

        Returns:
            The list of sample dicts collected since the last ``start()``.
        """
        self._stop_event.set()
        thread = self._thread
        if thread is not None:
            thread.join()
            self._thread = None
        return self.samples

## Data Preparation

In [None]:
# Load the training split of the dataset and expose the e-mail body under
# the "text" column name.

raw_datasets = load_dataset("yale-lily/aeslc", split="train").rename_column(
    "email_body", "text"
)

In [None]:
# Funções de pré-processamento dos dados

def tokenize_function(examples, tokenizer, block_size=None):
    """Tokenize the ``"text"`` entries of a batch, truncating each one.

    Args:
        examples: Mapping with a ``"text"`` entry holding the batch to tokenize.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=...)``.
        block_size: Maximum length passed as ``max_length``. When ``None``
            (the default) the module-level ``BLOCK_SIZE`` is used, which keeps
            existing two-argument calls unchanged.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    max_length = BLOCK_SIZE if block_size is None else block_size
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


def group_texts(examples, block_size=None):
    """Concatenate the token ids of a batch and split them into equal blocks.

    All sequences in ``examples["input_ids"]`` are joined in order and cut
    into consecutive chunks of ``block_size`` tokens. Trailing tokens that do
    not fill a whole block are dropped.

    Args:
        examples: Mapping with an ``"input_ids"`` entry: an iterable of token
            id sequences.
        block_size: Number of tokens per block. When ``None`` (the default)
            the module-level ``BLOCK_SIZE`` is used, which keeps existing
            one-argument calls unchanged.

    Returns:
        Dict with ``"input_ids"`` (list of blocks) and ``"attention_mask"``
        (one all-ones list of length ``block_size`` per block).

    Raises:
        ValueError: If the effective block size is not positive.
    """
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError("block_size deve ser um inteiro positivo!")

    concatenated = [token for ids in examples["input_ids"] for token in ids]
    # Largest multiple of block_size that fits; the remainder is discarded.
    total_length = (len(concatenated) // block_size) * block_size
    blocks = [
        concatenated[i : i + block_size] for i in range(0, total_length, block_size)
    ]
    return {
        "input_ids": blocks,
        "attention_mask": [[1] * block_size for _ in blocks],
    }

## Fine-Tuning

In [None]:
def _build_lm_dataset(dataset, tokenizer, block_size: int):
    """Tokenize ``dataset["text"]`` and regroup the tokens into ``block_size`` blocks."""
    if block_size <= 0:
        raise ValueError("block_size deve ser um inteiro positivo!")

    # Closures so that the caller's block_size (not a module-level constant)
    # drives both the truncation length and the block length.
    def _tokenize(examples):
        return tokenizer(examples["text"], truncation=True, max_length=block_size)

    def _group(examples):
        flat = [token for ids in examples["input_ids"] for token in ids]
        usable = (len(flat) // block_size) * block_size
        blocks = [flat[i : i + block_size] for i in range(0, usable, block_size)]
        return {
            "input_ids": blocks,
            "attention_mask": [[1] * block_size for _ in blocks],
        }

    tokenized = dataset.map(_tokenize, batched=True, remove_columns=["text"])

    if len(tokenized) == 0:
        raise ValueError("Dataset vazio após tokenização!")

    # A single batch covering the whole dataset, so tokens are concatenated
    # across every example before being cut into blocks.
    return tokenized.map(
        _group,
        batched=True,
        batch_size=len(tokenized),
        remove_columns=tokenized.column_names,
    )


def fine_tune_model(
    model_name: str,
    dataset,
    epochs: int = EPOCHS,
    batch_size: int = BATCH_SIZE,
    block_size: int = BLOCK_SIZE,
    sample_interval: float = SAMPLE_INTERVAL,
) -> dict:
    """Fine-tune ``model_name`` with LoRA on ``dataset`` and measure resource usage.

    The model is loaded on CPU, wrapped with a LoRA adapter on ``q_proj`` and
    ``v_proj``, and trained with ``Trainer`` while a ``ResourceMonitor``
    samples the process in the background.

    Args:
        model_name: Hub identifier passed to ``from_pretrained``.
        dataset: Dataset object exposing ``map`` and a ``"text"`` column.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        block_size: Truncation length and number of tokens per training block.
        sample_interval: Seconds between resource-monitor samples.

    Returns:
        Dict with the keys ``"model"``, ``"train_time_s"``,
        ``"peak_cpu_proc_pct"``, ``"peak_ram_proc_mb"`` and
        ``"train_metrics"`` (the ``metrics`` of the training result).

    Raises:
        ValueError: If ``block_size`` is not positive, the tokenized dataset
            is empty, or no complete block of ``block_size`` tokens could be
            built.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, token=access_token, padding_side="right", truncation_side="right"
    )

    # Fall back to EOS when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): float16 weights on CPU combined with fp16=True below may be
    # rejected or unsupported depending on the installed transformers/torch
    # versions — confirm this configuration runs on the target environment.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=False,
        token=access_token,
        torch_dtype=torch.float16,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    model = get_peft_model(model, lora_config)

    # Tokenize and group the dataset, honouring the block_size argument.
    lm_datasets = _build_lm_dataset(dataset, tokenizer, block_size)

    # Without this guard the preview prints below fail with an IndexError.
    if len(lm_datasets) == 0:
        raise ValueError(
            f"Nenhum bloco de {block_size} tokens foi gerado a partir do dataset!"
        )

    print(f"Total de amostras: {len(lm_datasets)}")
    print(f"Amostra de dados: {lm_datasets[0]}")
    print(f"Exemplo de tokenização: {tokenizer.decode(lm_datasets[0]['input_ids'])}")

    # Data collator for causal LM (mlm=False)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training arguments
    output_dir = f"output/{model_name.replace('/', '_')}_finetuned_enron"
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=10_000,
        save_total_limit=1,
        logging_steps=500,
        logging_dir=f"logs/{model_name.replace('/', '_')}_enron",
        no_cuda=True,
        fp16=True,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=lm_datasets,
    )

    # Start the monitor and the timer
    monitor = ResourceMonitor(interval=sample_interval)
    monitor.start()
    t0 = time.perf_counter()

    try:
        # Run the training
        train_result = trainer.train()
    finally:
        # Always stop the sampler, even when training raises, so the
        # background thread does not keep running after a failed run.
        samples = monitor.stop()
        total_time = time.perf_counter() - t0

    cpu_peak = max((s["cpu_proc_pct"] for s in samples), default=0.0)
    mem_peak = max((s["mem_proc_mb"] for s in samples), default=0.0)

    # Release memory before the next model is benchmarked
    del model, tokenizer, trainer
    torch.cuda.empty_cache()
    gc.collect()

    return {
        "model": model_name,
        "train_time_s": total_time,
        "peak_cpu_proc_pct": cpu_peak,
        "peak_ram_proc_mb": mem_peak,
        "train_metrics": train_result.metrics,
    }

## Generate Benchmark

In [None]:
# Collected metrics: one dict per model that finished training.
results = []

# GPU diagnostics, kept disabled because this notebook runs on CPU:
# print(torch.cuda.is_available())
# print(torch.cuda.current_device())    # index of the GPU in use
# print(torch.cuda.memory_allocated())  # allocated memory in bytes
print(raw_datasets)

for model_name in MODEL_NAMES:
    print(f"\n---- Fine-tuning Enron: {model_name} ----")

    try:
        metrics = fine_tune_model(
            model_name=model_name,
            dataset=raw_datasets,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            block_size=BLOCK_SIZE,
            sample_interval=SAMPLE_INTERVAL,
        )
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Erro ao treinar {model_name}: {e}")
    else:
        results.append(metrics)

## Analyse Results

In [None]:
# Make sure the output folder exists, then write one CSV row per benchmarked model.
os.makedirs("../../data/raw/ft_models", exist_ok=True)
df_results = pd.DataFrame(data=results)
df_results.to_csv(
    path_or_buf="../../data/raw/ft_models/fine_tuning_enron_metrics.csv",
    index=False,
)