In [14]:
# Cross-validation fold read by the cells below that use FOLD without setting it.
# NOTE: the training loops (`for FOLD in [3, 4]`) and the cell that sets `FOLD = 1`
# rebind this same notebook-level name, so its value depends on execution order.
FOLD = 4

# Treinamento

## Etapa I (Bert -> Matéria)

In [None]:
import pandas as pd
import torch
import numpy as np

import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import compute_class_weight

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


class QuestoesDataset(torch.utils.data.Dataset):
    """In-memory dataset of pre-tokenized questions for the Hugging Face ``Trainer``.

    Args:
        df: DataFrame with the columns ``input_ids`` and ``attention_mask``
            (one integer sequence per row; they must all have the same length
            so they can be stacked into a single tensor) and ``label``
            (integer class id).
    """

    def __init__(self, df):
        # Explicit int64 keeps this class consistent with the other dataset
        # definitions in this notebook and guarantees that labels are valid
        # cross-entropy targets even if the parquet columns are stored as int32.
        self.input_ids = torch.tensor(df["input_ids"].tolist(), dtype=torch.long)
        self.attention_mask = torch.tensor(
            df["attention_mask"].tolist(), dtype=torch.long
        )
        self.labels = torch.tensor(df["label"].tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        """Return the example at ``idx`` using the key names the model's forward() expects."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

def compute_metrics(eval_pred):
    """Compute accuracy and macro/weighted precision, recall and F1 for ``Trainer``.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the argmax
            over the last axis of ``logits``.

    Returns:
        dict with the keys ``accuracy``, ``macro-precision``, ``macro-recall``,
        ``macro-f1`` and ``weighted-f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "macro-precision": precision_score(
            labels, predictions, average="macro", zero_division=0
        ),
        "macro-recall": recall_score(
            labels, predictions, average="macro", zero_division=0
        ),
        "macro-f1": f1_score(labels, predictions, average="macro"),
        "weighted-f1": f1_score(labels, predictions, average="weighted"),
    }


# NOTE: the loop variable is the notebook-level FOLD, so FOLD keeps the last
# value of this list after the cell finishes.
for FOLD in [3, 4]:
    train_df = pd.read_parquet(f"../data/estrato/train_fold_{FOLD}.parquet", engine="fastparquet")
    test_df = pd.read_parquet(f"../data/estrato/test_fold_{FOLD}.parquet", engine="fastparquet")

    # Encode BOTH splits with the category -> code mapping of the training split.
    # Encoding the test split on its own would silently shift the ids whenever
    # its set of subjects differs from the training one.
    materia_dtype = pd.CategoricalDtype(
        train_df["materia"].astype("category").cat.categories
    )
    train_df["label"] = train_df["materia"].astype(materia_dtype).cat.codes
    test_df["label"] = test_df["materia"].astype(materia_dtype).cat.codes

    # Values absent from the training categories are encoded as -1.
    unseen_mask = test_df["label"] < 0
    if unseen_mask.any():
        unseen = test_df.loc[unseen_mask, "materia"].astype(str).unique().tolist()
        raise ValueError(
            f"Matérias do teste ausentes no treino (fold {FOLD}): {unseen}"
        )

    id2label = dict(enumerate(materia_dtype.categories))
    label2id = {name: idx for idx, name in id2label.items()}

    train_dataset = QuestoesDataset(train_df)
    test_dataset = QuestoesDataset(test_df)

    model = BertForSequenceClassification.from_pretrained(
        "neuralmind/bert-base-portuguese-cased",
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

    # The custom tokenizer may have a different vocabulary size than the
    # checkpoint, so the embedding matrix is resized to match it.
    tokenizer = BertTokenizer.from_pretrained("./tokenizer-custom")
    model.resize_token_embeddings(len(tokenizer))

    # "balanced" class weights computed from the training labels.
    class_weights = compute_class_weight(
        class_weight="balanced", classes=np.unique(train_df["label"]), y=train_df["label"]
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float)

    def weighted_ce_loss(outputs, labels, num_items_in_batch=None):
        """Cross-entropy weighted by ``class_weights``.

        ``num_items_in_batch`` is accepted because ``Trainer`` passes it to
        ``compute_loss_func``; it is not used here.
        """
        logits = outputs.logits
        weight = class_weights.to(logits.device)  # type: ignore
        return F.cross_entropy(logits, labels, weight=weight)

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_threshold=0.005, early_stopping_patience=3
    )

    training_args = TrainingArguments(
        output_dir="./modelos-treinamento",  # Checkpoint directory
        eval_strategy="epoch",  # Evaluate at the end of every epoch
        save_strategy="epoch",  # Save a checkpoint at the end of every epoch
        save_total_limit=2,  # Limit on the number of checkpoints kept on disk
        num_train_epochs=10,  # Maximum number of epochs
        per_device_train_batch_size=12,  # Training batch size; lower it on out-of-memory errors (e.g. 4)
        per_device_eval_batch_size=24,  # Evaluation batch size
        warmup_ratio=0.1,  # Fraction of the steps used for learning-rate warm-up
        weight_decay=0.01,  # Regularization
        learning_rate=4e-5,  # Learning rate
        logging_dir="./logs",  # Log directory
        logging_steps=100,  # Log every 100 steps
        load_best_model_at_end=True,  # Reload the best checkpoint when training ends
        metric_for_best_model="macro-f1",  # Metric that defines the "best" checkpoint
    )

    # NOTE(review): the test split is also used for early stopping and
    # checkpoint selection, so the scores printed below are not an unbiased
    # estimate — confirm this is intended.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        compute_loss_func=weighted_ce_loss,
        callbacks=[early_stopping_callback],
    )

    trainer.train()

    scores = trainer.evaluate()

    for score_name, score_value in scores.items():
        print(f"{score_name}: {score_value}")

    trainer.save_model(f"./modelos/bert/materias_fold_{FOLD}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Epoch,Training Loss,Validation Loss,Accuracy,Macro-precision,Macro-recall,Macro-f1,Weighted-f1
1,0.4571,0.416449,0.881259,0.834534,0.831361,0.828828,0.880699
2,0.3834,0.406829,0.899403,0.847582,0.860904,0.853,0.90028
3,0.3114,0.490961,0.897336,0.849825,0.835965,0.841553,0.896961
4,0.1752,0.515963,0.903767,0.87007,0.835652,0.848262,0.901156
5,0.1947,0.535155,0.903537,0.850867,0.864284,0.856696,0.903799


eval_loss: 0.5351553559303284
eval_accuracy: 0.9035369774919614
eval_macro-precision: 0.8508674107792381
eval_macro-recall: 0.8642840751060729
eval_macro-f1: 0.8566960360163511
eval_weighted-f1: 0.9037991243161487
eval_runtime: 34.5712
eval_samples_per_second: 125.943
eval_steps_per_second: 5.265
epoch: 5.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro-precision,Macro-recall,Macro-f1,Weighted-f1
1,0.4994,0.445913,0.882637,0.825764,0.830311,0.821135,0.884236
2,0.3876,0.400065,0.901011,0.860869,0.819026,0.829589,0.897308
3,0.2801,0.385713,0.914102,0.868376,0.869521,0.867951,0.914702
4,0.2641,0.46296,0.907901,0.853693,0.870202,0.860574,0.909059
5,0.1112,0.498715,0.918466,0.877205,0.872349,0.874045,0.918695
6,0.1317,0.52808,0.90813,0.86161,0.859343,0.856836,0.908628
7,0.1145,0.540322,0.914561,0.871257,0.869565,0.869657,0.914736
8,0.0776,0.550821,0.916399,0.877455,0.849887,0.861296,0.915232


eval_loss: 0.49871519207954407
eval_accuracy: 0.9184657785943959
eval_macro-precision: 0.8772050782796538
eval_macro-recall: 0.8723488670329181
eval_macro-f1: 0.8740451324078595
eval_weighted-f1: 0.9186945698805227
eval_runtime: 34.6397
eval_samples_per_second: 125.694
eval_steps_per_second: 5.254
epoch: 8.0


## Etapa II (Matéria -> Tópico)

### Clássicos

In [None]:
import pickle
import pandas as pd

from concurrent.futures import ProcessPoolExecutor
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline


# Load the training split of the current fold (FOLD must already be defined in the kernel)
train_df = pd.read_parquet(f"../data/estrato/train_fold_{FOLD}.parquet", engine="fastparquet")

# Candidate classifiers, keyed by the short name used in the saved file names
models = {
    "svc": LinearSVC(C=0.316, class_weight="balanced", random_state=42),
    "lr": LogisticRegression(
        C=10.0, class_weight="balanced", solver="lbfgs", random_state=42, max_iter=1000
    ),
    "rf": RandomForestClassifier(
        n_estimators=800,
        class_weight="balanced_subsample",
        max_depth=30,
        random_state=42,
    ),
    "nb": MultinomialNB(alpha=0.01),
}

# Vectorization (TF-IDF over 1- to 3-grams) and feature selection
vectorizer = TfidfVectorizer(
    max_features=None,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.8,
)

# Keep the 30% of features with the highest chi-squared score
selector = SelectPercentile(chi2, percentile=30)

# Base pipeline; the "clf" step is a placeholder filled in per model
base_pipeline = Pipeline(
    [
        ("tfidf", vectorizer),
        ("chi2", selector),
        ("clf", None),
    ]
)

# One group of models is trained per distinct subject
materias = train_df["materia"].unique()


# Helper that trains and saves one model
def treinar_e_salvar(name, model, materia):
    """Fit the TF-IDF -> chi2 -> classifier pipeline on one subject and pickle it.

    Reads the notebook globals ``train_df``, ``base_pipeline`` and ``FOLD``.

    Args:
        name: Short model key used in the output file name (e.g. ``"svc"``).
        model: Unfitted scikit-learn classifier to place in the ``clf`` step.
        materia: Value of the ``materia`` column selecting the training rows.

    Returns:
        Path (str) of the pickle file that was written.
    """
    import os

    from sklearn.base import clone

    # Filter once and take both columns from the same subset.
    subset = train_df[train_df["materia"] == materia]
    x = subset["texto_lem"]
    y = subset["topico"]

    # Pipeline.set_params() mutates and returns the SAME object, so without the
    # clones every task would refit the shared `base_pipeline` and the
    # estimator passed in by the caller.
    pipeline = clone(base_pipeline).set_params(clf=clone(model))
    pipeline.fit(x, y)

    filename = f"modelos/classicos/{name}_{materia}_fold_{FOLD}.pkl"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as f:
        pickle.dump(pipeline, f)
    return filename


# Fan out one training job per (model, subject) pair across worker processes
with ProcessPoolExecutor() as executor:
    tasks = [
        executor.submit(treinar_e_salvar, name, model, materia)
        for name, model in models.items()
        for materia in materias
    ]

    # Collect in submission order; result() re-raises any exception from the worker
    for future in tasks:
        print("Salvo:", future.result())

Salvo: modelos/classicos/svc_matematica_fold_4.pkl
Salvo: modelos/classicos/svc_biologia_fold_4.pkl
Salvo: modelos/classicos/svc_quimica_fold_4.pkl
Salvo: modelos/classicos/svc_fisica_fold_4.pkl
Salvo: modelos/classicos/svc_historia_fold_4.pkl
Salvo: modelos/classicos/svc_portugues_fold_4.pkl
Salvo: modelos/classicos/svc_literatura_fold_4.pkl
Salvo: modelos/classicos/svc_artes_fold_4.pkl
Salvo: modelos/classicos/svc_idiomas_fold_4.pkl
Salvo: modelos/classicos/svc_filosofia_fold_4.pkl
Salvo: modelos/classicos/svc_sociologia_fold_4.pkl
Salvo: modelos/classicos/svc_geografia_fold_4.pkl
Salvo: modelos/classicos/lr_matematica_fold_4.pkl
Salvo: modelos/classicos/lr_biologia_fold_4.pkl
Salvo: modelos/classicos/lr_quimica_fold_4.pkl
Salvo: modelos/classicos/lr_fisica_fold_4.pkl
Salvo: modelos/classicos/lr_historia_fold_4.pkl
Salvo: modelos/classicos/lr_portugues_fold_4.pkl
Salvo: modelos/classicos/lr_literatura_fold_4.pkl
Salvo: modelos/classicos/lr_artes_fold_4.pkl
Salvo: modelos/classicos/lr

### Transformers

In [None]:
import os
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    BertForSequenceClassification,
    BertTokenizer,
)

# ========= CONFIGURATION ========= #
BASE_MODEL = "adalbertojunior/distilbert-portuguese-cased"  # alternative: neuralmind/bert-base-portuguese-cased
TOKENIZER_PATH = "./tokenizer-custom"
OUTPUT_DIR = "./modelos/distilbert"
LOG_DIR = "./logs"

# Training hyper-parameters
NUM_EPOCHS = 20
BATCH_TRAIN = 12
BATCH_EVAL = 24
WARMUP_RATIO = 0.1
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
# Early stopping: patience (in evaluations) and minimum improvement threshold
EARLY_STOP_PATIENCE = 4
EARLY_STOP_DELTA = 0.005

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ========= DATASET ========= #
class QuestoesDataset(torch.utils.data.Dataset):
    """Simple in-memory dataset for training and evaluation.

    Expects a DataFrame with the columns ``input_ids``, ``attention_mask``
    (same-length integer sequences) and ``label`` (integer class id).
    """

    def __init__(self, df):
        self.input_ids = torch.tensor(df["input_ids"].tolist(), dtype=torch.long)
        self.attention_mask = torch.tensor(
            df["attention_mask"].tolist(), dtype=torch.long
        )
        self.labels = torch.tensor(df["label"].tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

    def __len__(self):
        return len(self.labels)


# ========= METRICS ========= #
def compute_metrics(eval_pred):
    """Accuracy plus macro/weighted precision, recall and F1 from ``(logits, labels)``."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "macro-precision": precision_score(
            labels, preds, average="macro", zero_division=0
        ),
        "macro-recall": recall_score(labels, preds, average="macro", zero_division=0),
        "macro-f1": f1_score(labels, preds, average="macro"),
        "weighted-f1": f1_score(labels, preds, average="weighted"),
    }


# ========= TRAINING FUNCTION ========= #
def treinar_e_salvar_modelo(materia: str):
    """Train, evaluate and save the topic classifier of a single subject.

    Reads the notebook globals set by the loops below: ``train_df``,
    ``test_df``, ``tokenizer``, ``training_args``, ``MODEL``, ``OUTPUT_DIR``
    and ``FOLD``.

    Args:
        materia: Value of the ``materia`` column selecting the rows to use.

    Returns:
        Directory where the trained model was saved.

    Raises:
        ValueError: If the test rows of this subject contain a topic that does
            not occur in its training rows.
    """
    print(f"\n=== Treinando modelo para a matéria: {materia} ===")

    # Restrict both splits to the subject.
    train_mat = train_df[train_df["materia"] == materia].copy()
    test_mat = test_df[test_df["materia"] == materia].copy()

    # One topic -> id mapping (from the training rows) for BOTH splits, so the
    # evaluation labels agree with the classifier head and with id2label.
    topico_dtype = pd.CategoricalDtype(
        train_mat["topico"].astype("category").cat.categories
    )
    train_mat["label"] = train_mat["topico"].astype(topico_dtype).cat.codes
    test_mat["label"] = test_mat["topico"].astype(topico_dtype).cat.codes

    # Values absent from the training categories are encoded as -1.
    unseen_mask = test_mat["label"] < 0
    if unseen_mask.any():
        unseen = test_mat.loc[unseen_mask, "topico"].astype(str).unique().tolist()
        raise ValueError(
            f"Tópicos do teste ausentes no treino ({materia}, fold {FOLD}): {unseen}"
        )

    id2label = dict(enumerate(topico_dtype.categories))
    label2id = {name: idx for idx, name in id2label.items()}

    train_dataset = QuestoesDataset(train_mat)
    test_dataset = QuestoesDataset(test_mat)

    # Class weights specific to this subject.
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(train_mat["label"]),
        y=train_mat["label"],
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float)

    def weighted_ce_loss(outputs, labels, num_items_in_batch=None):
        """Cross-entropy weighted by the per-class weights computed above."""
        logits = outputs.logits
        weights = class_weights.to(logits.device)
        return F.cross_entropy(logits, labels, weight=weights)

    # Fresh model per subject, loaded from the checkpoint selected by the
    # enclosing loop (MODEL) so it always matches OUTPUT_DIR.
    model = BertForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))

    # A new callback per trainer: EarlyStoppingCallback keeps its patience
    # counter on the instance, so it must not be shared between runs.
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_threshold=EARLY_STOP_DELTA,
        early_stopping_patience=EARLY_STOP_PATIENCE,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        compute_loss_func=weighted_ce_loss,
        callbacks=[early_stopping_callback],
    )

    trainer.train()
    scores = trainer.evaluate()
    print(f"Resultados ({materia}): {scores}")

    save_path = os.path.join(OUTPUT_DIR, f"{materia}_fold_{FOLD}")
    os.makedirs(save_path, exist_ok=True)
    trainer.save_model(save_path)
    print(f"✅ Modelo salvo em: {save_path}")

    return save_path


# NOTE: FOLD and OUTPUT_DIR are notebook-level names; both loops rebind them
# and they keep their last value after this cell finishes.
for FOLD in [3, 4]:
    print(f"\n\n================ FOLD {FOLD} ================\n\n")
    for MODEL, OUTPUT_DIR in [
        ("adalbertojunior/distilbert-portuguese-cased", "./modelos/distilbert"),
    ]:
        print(f"\n\n================ MODEL {MODEL} ================\n\n")

        # ========= DATA ========= #
        train_df = pd.read_parquet(
            f"../data/estrato/train_fold_{FOLD}.parquet", engine="fastparquet"
        )
        test_df = pd.read_parquet(
            f"../data/estrato/test_fold_{FOLD}.parquet", engine="fastparquet"
        )
        materias = train_df["materia"].unique()

        # ========= TOKENIZER ========= #
        tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

        # ========= TRAINING SETTINGS ========= #
        training_args = TrainingArguments(
            output_dir="./modelos-treinamento",
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=BATCH_TRAIN,
            per_device_eval_batch_size=BATCH_EVAL,
            warmup_ratio=WARMUP_RATIO,
            weight_decay=WEIGHT_DECAY,
            # Previously LEARNING_RATE was defined but never passed, so the
            # library default was silently used instead.
            learning_rate=LEARNING_RATE,
            logging_dir=LOG_DIR,
            logging_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="macro-f1",
        )

        # ========= TRAINING LOOP ========= #
        for materia in materias:
            treinar_e_salvar_modelo(materia)

# Inferência

## Etapa I (Bert -> Matéria)

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoConfig, AutoModelForSequenceClassification

# Inference batch size, target device and stage-I checkpoint of the current fold
BATCH = 24
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CHECKPOINT_PATH = f"./modelos/bert/materias_fold_{FOLD}"

# ======== Load data ========
test_df = pd.read_parquet(f"../data/estrato/test_fold_{FOLD}.parquet", engine="fastparquet")


# ======== Dataset ========
class QuestoesDataset(Dataset):
    """Label-free dataset of pre-tokenized questions used for inference.

    Built from a DataFrame with the columns ``input_ids`` and
    ``attention_mask``; both are materialized as int64 tensors.
    """

    def __init__(self, df):
        def column_as_long(column):
            return torch.tensor(df[column].tolist(), dtype=torch.long)

        self.input_ids = column_as_long("input_ids")
        self.attention_mask = column_as_long("attention_mask")

    def __getitem__(self, idx):
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
        )

    def __len__(self):
        return self.input_ids.shape[0]


# ======== Carregar modelo ========
def load_model(checkpoint_path: str):
    """Load a sequence-classification checkpoint ready for inference.

    The model is moved to ``DEVICE`` and switched to eval mode.

    Returns:
        ``(model, id2label)`` where ``id2label`` is the config mapping with its
        keys converted to ``int`` (empty if the config has no such attribute).
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path, config=AutoConfig.from_pretrained(checkpoint_path)
    )
    model.to(DEVICE)
    model.eval()

    id2label = {}
    for raw_id, label in getattr(model.config, "id2label", {}).items():
        id2label[int(raw_id)] = label
    return model, id2label


model, id2label = load_model(CHECKPOINT_PATH)

# ======== Inference ========
test_dataset = QuestoesDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=BATCH, shuffle=False)

preds = []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(**batch).logits
        preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

y_pred = np.array(preds)
y_pred_labels = [id2label[p] for p in preds]

# ======== Save results ========
results_path = f"./results/predictions_{FOLD}.csv"
if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
else:
    results_df = test_df[["id", "materia", "topico"]].copy()

# Write predictions keyed by `id` rather than by position: an existing CSV may
# have a different row order than test_df.
pred_by_id = dict(zip(test_df["id"].tolist(), y_pred_labels))
missing_ids = set(pred_by_id) - set(results_df["id"].tolist())
if missing_ids:
    raise ValueError(
        f"{len(missing_ids)} ids do teste não existem em {results_path}"
    )

new_preds = results_df["id"].map(pred_by_id)
if "bert_materia_pred" in results_df.columns:
    # Rows whose id is not part of this run keep their previous value.
    new_preds = new_preds.fillna(results_df["bert_materia_pred"])
results_df["bert_materia_pred"] = new_preds

os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)

print(f"✅ Predições salvas em {results_path}")

✅ Predições salvas em ./results/predictions_4.csv


In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoConfig, AutoModelForSequenceClassification

# NOTE: this rebinds the notebook-level FOLD; cells executed afterwards that
# read FOLD will see 1 unless it is set again.
FOLD = 1
BATCH = 24
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CHECKPOINT_PATH = f"./modelos/distilbert/diretao_fold_{FOLD}"

# ======== Load data ========
test_df = pd.read_parquet(f"../data/estrato/test_fold_{FOLD}.parquet", engine="fastparquet")


# ======== Dataset ========
class QuestoesDataset(Dataset):
    """Inference-only dataset: exposes ``input_ids`` and ``attention_mask`` as int64 tensors."""

    def __init__(self, df):
        ids = df["input_ids"].tolist()
        mask = df["attention_mask"].tolist()
        self.input_ids = torch.tensor(ids, dtype=torch.long)
        self.attention_mask = torch.tensor(mask, dtype=torch.long)

    def __getitem__(self, idx):
        example = {"input_ids": self.input_ids[idx]}
        example["attention_mask"] = self.attention_mask[idx]
        return example

    def __len__(self):
        return self.input_ids.shape[0]


# ======== Carregar modelo ========
def load_model(checkpoint_path: str):
    """Return ``(model, id2label)`` for the checkpoint at ``checkpoint_path``.

    The model is placed on ``DEVICE`` in eval mode; ``id2label`` comes from the
    model config with its keys cast to ``int``.
    """
    cfg = AutoConfig.from_pretrained(checkpoint_path)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path, config=cfg
    )
    classifier.to(DEVICE)
    classifier.eval()

    label_map = getattr(classifier.config, "id2label", {})
    return classifier, {int(key): name for key, name in label_map.items()}


model, id2label = load_model(CHECKPOINT_PATH)

# ======== Inference ========
test_dataset = QuestoesDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=BATCH, shuffle=False)

preds = []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(**batch).logits
        preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

y_pred = np.array(preds)
y_pred_labels = [id2label[p] for p in preds]

# ======== Save results ========
results_path = f"./results/predictions_diretao_{FOLD}.csv"
if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
else:
    results_df = test_df[["id", "materia", "topico"]].copy()

# Write predictions keyed by `id` rather than by position: an existing CSV may
# have a different row order than test_df.
pred_by_id = dict(zip(test_df["id"].tolist(), y_pred_labels))
missing_ids = set(pred_by_id) - set(results_df["id"].tolist())
if missing_ids:
    raise ValueError(
        f"{len(missing_ids)} ids do teste não existem em {results_path}"
    )

new_preds = results_df["id"].map(pred_by_id)
if "distilbert_pred" in results_df.columns:
    # Rows whose id is not part of this run keep their previous value.
    new_preds = new_preds.fillna(results_df["distilbert_pred"])
results_df["distilbert_pred"] = new_preds

os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)

print(f"✅ Predições salvas em {results_path}")

✅ Predições salvas em ./results/predictions_diretao_1.csv


## Etapa II

### Clássicos

In [15]:
import os
import pickle
from pathlib import Path
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

# === Settings ===
ORACLE = False  # If True, route by the true subject instead of the one predicted by BERT
DATA_DIR = Path("./data")
MODEL_DIR = Path("./modelos/classicos")
RESULTS_DIR = Path("./results")

# === Load data ===
test_path = DATA_DIR / f"test_fold_{FOLD}.parquet"
test_df = pd.read_parquet(test_path, engine="fastparquet")

if not ORACLE:
    # NOTE(review): the stage-I predictions are read from results/oracle/,
    # while the stage-I inference cell writes ./results/predictions_{FOLD}.csv —
    # confirm which file is the intended source.
    df_results = pd.read_csv(RESULTS_DIR / ("oracle") / f"predictions_{FOLD}.csv")

    # Join on `id`: a plain column assignment aligns on the row index and
    # silently mismatches rows when the CSV order differs from the parquet.
    materia_by_id = df_results.drop_duplicates("id").set_index("id")["bert_materia_pred"]
    test_df["bert_materia_pred"] = test_df["id"].map(materia_by_id)

    n_missing = int(test_df["bert_materia_pred"].isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} questões do teste sem 'bert_materia_pred' (fold {FOLD})"
        )

# === Models to run ===
model_names = ["svc", "lr", "rf", "nb"]


# === Função auxiliar para rodar cada modelo ===
def run_model(
    model_name: str,
    fold: int,
    model_dir: Path,
    test_df: pd.DataFrame,
    oracle: bool = False,
):
    """Predict the topic of every test row with the subject-specific models.

    Each row is routed to the pickled pipeline of its subject: the true subject
    (``materia``) when ``oracle`` is True, otherwise the predicted one
    (``bert_materia_pred``).

    Args:
        model_name: Model family prefix of the pickle files (e.g. ``"svc"``).
        fold: Fold number used in the pickle file names.
        model_dir: Directory containing ``{model_name}_{materia}_fold_{fold}.pkl``.
        test_df: Rows to predict; needs ``texto_lem`` and the routing column.
        oracle: Route by the true subject instead of the predicted one.

    Returns:
        ``(model_name, preds)`` where ``preds`` is a list aligned with the row
        order of ``test_df``.

    Raises:
        FileNotFoundError: If no pickle exists for a subject that rows are routed to.
    """
    print(f"=== Iniciando processamento do modelo: {model_name} ===")

    route_col = "materia" if oracle else "bert_materia_pred"
    texts = test_df["texto_lem"].tolist()

    # Group row positions by the subject that will handle them, so each
    # sub-model is loaded once and predicts its rows in a single batch.
    positions_by_materia = {}
    for position, materia in enumerate(test_df[route_col].tolist()):
        positions_by_materia.setdefault(materia, []).append(position)

    preds = [None] * len(texts)
    for materia, positions in positions_by_materia.items():
        # Models are looked up by the ROUTING subject; loading only the true
        # subjects would fail for a predicted subject absent from them.
        model_path = model_dir / f"{model_name}_{materia}_fold_{fold}.pkl"
        # SECURITY: pickle.load executes arbitrary code; only load files
        # produced by the training cell of this project.
        with open(model_path, "rb") as f:
            model = pickle.load(f)

        batch_preds = model.predict([texts[p] for p in positions])
        for position, pred in zip(positions, batch_preds):
            preds[position] = pred

    print(f"✓ Modelo {model_name} finalizado.")
    return model_name, preds


# === Load or initialize the results DataFrame ===
if ORACLE:
    results_path = f"./results/oracle/predictions_{FOLD}.csv"
else:
    results_path = f"./results/real/predictions_{FOLD}.csv"

if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
else:
    results_df = test_df[["id", "materia", "topico"]].copy()

test_ids = test_df["id"].tolist()
missing_ids = set(test_ids) - set(results_df["id"].tolist())
if missing_ids:
    raise ValueError(
        f"{len(missing_ids)} ids do teste não existem em {results_path}"
    )

# === Parallel execution ===
with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = {
        # ORACLE must be forwarded: run_model defaults to oracle=False and
        # would otherwise always route by the predicted subject.
        executor.submit(
            run_model, model_name, FOLD, MODEL_DIR, test_df, ORACLE
        ): model_name
        for model_name in model_names
    }

    for future in as_completed(futures):
        model_name, preds = future.result()
        pred_col = f"{model_name}_topico_pred"

        # preds follow the row order of test_df; key them by `id` so an
        # existing CSV with a different row order is still filled correctly.
        new_preds = results_df["id"].map(dict(zip(test_ids, preds)))
        if pred_col in results_df.columns:
            new_preds = new_preds.fillna(results_df[pred_col])
        results_df[pred_col] = new_preds

# === Save predictions ===
# Create the actual target directory (results/oracle or results/real), not
# just its parent.
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)

print(f"\n✅ Resultados salvos em: {results_path}")

=== Iniciando processamento do modelo: svc ===
=== Iniciando processamento do modelo: lr ===
=== Iniciando processamento do modelo: rf ===
=== Iniciando processamento do modelo: nb ===
✓ Modelo svc finalizado.
✓ Modelo lr finalizado.
✓ Modelo nb finalizado.
✓ Modelo rf finalizado.

✅ Resultados salvos em: ./results/real/predictions_4.csv


### Transformers

In [16]:
import os
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoConfig

# ========= CONFIG =========
ORACLE = False  # If True, route by the true subject instead of the one predicted by BERT
BATCH_SIZE = 24
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODELS = ["bert", "distilbert"]

# ========= LOAD DATA =========
test_df = pd.read_parquet(f"./data/test_fold_{FOLD}.parquet", engine="fastparquet")

if not ORACLE:
    # Path is spelled out here so this cell does not depend on RESULTS_DIR
    # having been defined by another cell.
    # NOTE(review): the stage-I predictions are read from results/oracle/ even
    # though ORACLE is False — confirm this is the intended source file.
    stage1_path = os.path.join("./results", "oracle", f"predictions_{FOLD}.csv")
    df_results = pd.read_csv(stage1_path)

    # Join on `id`; plain column assignment would align on the row index.
    materia_by_id = df_results.drop_duplicates("id").set_index("id")["bert_materia_pred"]
    test_df["bert_materia_pred"] = test_df["id"].map(materia_by_id)

    n_missing = int(test_df["bert_materia_pred"].isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} questões do teste sem 'bert_materia_pred' (fold {FOLD})"
        )

# Subjects that need a checkpoint: the true ones plus, when routing by
# prediction, any predicted subject not among them.
materias_alvo = test_df["materia"].unique().tolist()
if not ORACLE:
    materias_alvo += [
        m for m in test_df["bert_materia_pred"].unique().tolist()
        if m not in materias_alvo
    ]

# Map the checkpoints
transformer_checkpoints = {}
for model in MODELS:
    transformer_checkpoints[model] = {}
    for materia in materias_alvo:
        path = f"./modelos/{model}/{materia}_fold_{FOLD}"
        if not os.path.exists(path):
            raise ValueError(f"Checkpoint não encontrado: {path}")
        transformer_checkpoints[model][materia] = path

# Prepare the results file
if ORACLE:
    results_path = f"./results/oracle/predictions_{FOLD}.csv"
else:
    results_path = f"./results/real/predictions_{FOLD}.csv"

if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
else:
    base_cols = ["id", "materia"]
    if "topico" in test_df.columns:
        base_cols.append("topico")
    results_df = test_df[base_cols].copy()


# ========= DATASET =========
class QuestoesDataset(Dataset):
    """Minimal inference dataset over already-tokenized, fixed-length inputs."""

    def __init__(self, df):
        token_ids, masks = (
            df[column].tolist() for column in ("input_ids", "attention_mask")
        )
        self.input_ids = torch.tensor(token_ids, dtype=torch.long)
        self.attention_mask = torch.tensor(masks, dtype=torch.long)

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        ids_row = self.input_ids[idx]
        mask_row = self.attention_mask[idx]
        return {"input_ids": ids_row, "attention_mask": mask_row}


# ========= FUNÇÕES AUXILIARES =========
def load_model(checkpoint_path: str):
    """Load a checkpoint for inference and return it with its ``id2label`` map.

    The model is moved to ``DEVICE`` and put in eval mode; the mapping is read
    from the model config with integer keys (empty dict if absent).
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path,
        config=AutoConfig.from_pretrained(checkpoint_path),
    )
    model.to(DEVICE)
    model.eval()

    raw_mapping = getattr(model.config, "id2label", {})
    id2label = dict((int(key), label) for key, label in raw_mapping.items())
    return model, id2label


@torch.no_grad()
def infer_model(model, dataset: Dataset) -> np.ndarray:
    """Run batched inference and return the predicted class id of every example.

    Batches of ``BATCH_SIZE`` examples are moved to ``DEVICE`` and the argmax
    over the last logits axis is taken.

    Returns:
        1-D array of class ids in dataset order; an empty int64 array when the
        dataset has no examples.
    """
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    preds = []
    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(**batch).logits
        preds.append(torch.argmax(logits, dim=-1).cpu().numpy())

    # np.concatenate raises on an empty list, which happens when no row was
    # routed to this model.
    if not preds:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(preds)


# ========= MAIN LOOP =========
# Column of test_df that decides which subject-specific checkpoint handles a row.
route_col = "materia" if ORACLE else "bert_materia_pred"

for model_name, materia_ckpts in transformer_checkpoints.items():
    print(f"\n=== Processando modelo Transformer: {model_name} ===")

    # Make sure the prediction column exists. The guard must test the column
    # name itself (not `model_name`), otherwise existing predictions are wiped
    # on every run.
    pred_col = f"{model_name}_topico_pred"
    if pred_col not in results_df.columns:
        results_df[pred_col] = None
    else:
        # A column read back from CSV may be float (all NaN); labels are strings.
        results_df[pred_col] = results_df[pred_col].astype(object)

    # Iterate over the subjects rows are actually routed to, so no empty
    # subset is ever sent to the model.
    materias_no_teste = test_df[route_col].dropna().unique().tolist()

    # Check that every routed subject has a checkpoint
    faltantes = [m for m in materias_no_teste if m not in materia_ckpts]
    if faltantes:
        raise ValueError(
            f"Faltam checkpoints para as matérias {faltantes} no modelo '{model_name}'."
        )

    for materia in materias_no_teste:
        print(f"  - Matéria: {materia}")
        df_mat = test_df[test_df[route_col] == materia].copy()

        # Load the subject-specific model
        checkpoint = materia_ckpts[materia]
        model, id2label = load_model(checkpoint)

        # Inference
        ds = QuestoesDataset(df_mat)
        y_pred_ids = infer_model(model, ds)

        # Convert ids to labels when the config provides a mapping
        if id2label:
            y_pred_labels = [id2label.get(int(i), str(int(i))) for i in y_pred_ids]
        else:
            y_pred_labels = y_pred_ids.tolist()

        # Write predictions keyed by `id`: positional assignment through the
        # mask would misplace them if results_df is ordered differently.
        pred_by_id = dict(zip(df_mat["id"].tolist(), y_pred_labels))
        mask = results_df["id"].isin(df_mat["id"])
        if int(mask.sum()) < len(pred_by_id):
            raise ValueError(
                f"Ids da matéria '{materia}' ausentes em {results_path}"
            )
        results_df.loc[mask, pred_col] = results_df.loc[mask, "id"].map(pred_by_id)

# ========= SAVE =========
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)
print(f"\n✅ Previsões salvas em: {results_path}")


=== Processando modelo Transformer: bert ===
  - Matéria: matematica


  - Matéria: biologia
  - Matéria: quimica
  - Matéria: fisica
  - Matéria: historia
  - Matéria: portugues
  - Matéria: literatura
  - Matéria: artes
  - Matéria: idiomas
  - Matéria: filosofia
  - Matéria: sociologia
  - Matéria: geografia

=== Processando modelo Transformer: distilbert ===
  - Matéria: matematica
  - Matéria: biologia
  - Matéria: quimica
  - Matéria: fisica
  - Matéria: historia
  - Matéria: portugues
  - Matéria: literatura
  - Matéria: artes
  - Matéria: idiomas
  - Matéria: filosofia
  - Matéria: sociologia
  - Matéria: geografia

✅ Previsões salvas em: ./results/real/predictions_4.csv
