# Desafio Kaggle — Classificação de Emoções (Modelos Manuais)

> Notebook organizado para apresentação do trabalho da UC14, com modelos construídos manualmente (RNN/LSTM) em PyTorch + Hugging Face Trainer.

## Objetivo
- Treinar e avaliar um classificador de emoções sem usar backbone pré-treinado de linguagem.
- Comparar arquiteturas manuais recorrentes (RNN e LSTM).
- Gerar submissão Kaggle no formato esperado.

## Estrutura
1. Setup e imports
2. Definição do modelo manual (RNN/LSTM)
3. Pipeline de dados
4. Treinamento e métricas
5. Avaliação, inferência e submissão

In [None]:
%pip install -q evaluate

In [None]:
import torch
import torch.nn as nn
from transformers import (
    PretrainedConfig,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    AutoTokenizer,
)
from datasets import load_dataset
from pathlib import Path
import numpy as np
import evaluate
import csv

def get_best_available_device():
    """Pick the torch device to run on, preferring CUDA, then MPS, then CPU.

    Returns:
        torch.device: ``cuda`` when CUDA reports availability, otherwise ``mps``
        when the MPS backend reports availability, otherwise ``cpu``.
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)

# Resolve the device once at notebook level; the training and inference cells
# read this `device` variable.
device = get_best_available_device()
print(f"Device selecionado: {device}")

## 1) Definição do Modelo Manual (RNN ou LSTM)

Escolha a arquitetura no parâmetro `MODEL_TYPE` (`"rnn"` ou `"lstm"`).

In [None]:
# @title Arquiteturas manuais compatíveis com Hugging Face

# Architecture switch read by the model-building, checkpoint-loading and
# submission cells.
MODEL_TYPE = "lstm"  # options: "rnn" or "lstm"

class RNNConfig(PretrainedConfig):
    """Hyperparameter container for the hand-written vanilla-RNN classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the recurrent hidden state.
        n_classes: Number of output classes of the classifier head.
        dropout: Dropout probability applied before the classifier head.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "custom_raw_rnn"

    def __init__(
        self,
        vocab_size=30522,
        embedding_dim=64,
        hidden_dim=128,
        n_classes=6,
        dropout=0.2,
        **kwargs,
    ):
        # Model-specific fields are stored first (same order as before); the base
        # class then receives only the remaining keyword arguments.
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.hidden_dim, self.n_classes = hidden_dim, n_classes
        self.dropout = dropout
        super().__init__(**kwargs)

class LSTMConfig(PretrainedConfig):
    """Hyperparameter container for the hand-written LSTM classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden and cell states.
        n_classes: Number of output classes of the classifier head.
        dropout: Dropout probability applied before the classifier head.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "custom_raw_lstm"

    def __init__(
        self,
        vocab_size=30522,
        embedding_dim=64,
        hidden_dim=128,
        n_classes=6,
        dropout=0.2,
        **kwargs,
    ):
        # Model-specific fields are stored first (same order as before); the base
        # class then receives only the remaining keyword arguments.
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.hidden_dim, self.n_classes = hidden_dim, n_classes
        self.dropout = dropout
        super().__init__(**kwargs)

class VanillaRNNLayer(nn.Module):
    """Single recurrent layer unrolled step by step over the time axis.

    Each step computes ``h_t = tanh(i2h(x_t) + h2h(h_{t-1}))`` starting from an
    all-zero hidden state.

    Args:
        input_size: Size of the last dimension of the input tensor.
        hidden_size: Size of the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, x, attention_mask=None):
        """Run the recurrence and return the hidden state at every position.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.
            attention_mask: Optional ``(batch, seq_len)`` tensor. Where it is 1 the
                new hidden state is taken; where it is 0 the previous hidden
                state is carried forward unchanged.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``. An input with
            ``seq_len == 0`` yields an empty ``(batch, 0, hidden_size)`` tensor.
        """
        batch_size, seq_len, _ = x.size()
        if seq_len == 0:
            # torch.stack() rejects an empty list, so answer the empty case directly.
            return x.new_zeros((batch_size, 0, self.hidden_size))

        h_t = x.new_zeros((batch_size, self.hidden_size))

        # The input projection does not depend on the recurrent state, so it is
        # computed for all timesteps in one call instead of once per step.
        projected = self.i2h(x)
        keep = None
        if attention_mask is not None:
            keep = attention_mask.unsqueeze(-1).type_as(projected)

        outputs = []
        for t in range(seq_len):
            next_h = self.activation(projected[:, t, :] + self.h2h(h_t))
            if keep is not None:
                mask_t = keep[:, t, :]
                h_t = mask_t * next_h + (1.0 - mask_t) * h_t
            else:
                h_t = next_h
            outputs.append(h_t)

        return torch.stack(outputs, dim=1)

class LSTMLayer(nn.Module):
    """Single LSTM layer unrolled step by step over the time axis.

    The four gates are produced by one fused projection of width
    ``4 * hidden_size`` and split in the order input, forget, candidate, output.
    Hidden and cell states start at zero.

    Args:
        input_size: Size of the last dimension of the input tensor.
        hidden_size: Size of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.x2h = nn.Linear(input_size, 4 * hidden_size)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size)

    def forward(self, x, attention_mask=None):
        """Run the recurrence and return the hidden state at every position.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.
            attention_mask: Optional ``(batch, seq_len)`` tensor. Where it is 1 the
                new hidden/cell states are taken; where it is 0 the previous
                states are carried forward unchanged.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``. An input with
            ``seq_len == 0`` yields an empty ``(batch, 0, hidden_size)`` tensor.
        """
        batch_size, seq_len, _ = x.size()
        if seq_len == 0:
            # torch.stack() rejects an empty list, so answer the empty case directly.
            return x.new_zeros((batch_size, 0, self.hidden_size))

        h_t = x.new_zeros((batch_size, self.hidden_size))
        c_t = x.new_zeros((batch_size, self.hidden_size))

        # The input contribution to the gates does not depend on the recurrent
        # state, so it is computed for all timesteps in one call.
        input_gates = self.x2h(x)
        keep = None
        if attention_mask is not None:
            keep = attention_mask.unsqueeze(-1).type_as(input_gates)

        outputs = []
        for t in range(seq_len):
            gates = input_gates[:, t, :] + self.h2h(h_t)
            i_gate, f_gate, g_gate, o_gate = gates.chunk(4, dim=1)

            i_gate = torch.sigmoid(i_gate)
            f_gate = torch.sigmoid(f_gate)
            g_gate = torch.tanh(g_gate)
            o_gate = torch.sigmoid(o_gate)

            next_c = f_gate * c_t + i_gate * g_gate
            next_h = o_gate * torch.tanh(next_c)

            if keep is not None:
                mask_t = keep[:, t, :]
                h_t = mask_t * next_h + (1.0 - mask_t) * h_t
                c_t = mask_t * next_c + (1.0 - mask_t) * c_t
            else:
                h_t = next_h
                c_t = next_c
            outputs.append(h_t)

        return torch.stack(outputs, dim=1)

class TextClassificationRawRNN(PreTrainedModel):
    """Text classifier: embedding -> VanillaRNNLayer -> dropout -> linear head.

    The forward pass returns a dict with ``"loss"`` and ``"logits"``.
    """

    config_class = RNNConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Index 0 is declared as the padding row of the embedding table.
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim, padding_idx=0)
        self.rnn_block = VanillaRNNLayer(config.embedding_dim, config.hidden_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_dim, config.n_classes)
        self.loss_fn = nn.CrossEntropyLoss()
        # NOTE(review): both attributes are set to empty lists; presumably they are
        # read by transformers' weight-tying machinery - confirm against the
        # installed transformers version.
        self.all_tied_weights_keys = []
        self._tied_weights_keys = []
        self.post_init()

    @property
    def dummy_inputs(self):
        """Minimal one-row, two-token batch of ids with an all-ones mask."""
        return {"input_ids": torch.tensor([[0, 1]]), "attention_mask": torch.tensor([[1, 1]])}

    def _check_and_adjust_experts_implementation(self, experts_implementation):
        # Returns its argument untouched. NOTE(review): presumably overrides a
        # transformers hook that does not apply to this model - confirm.
        return experts_implementation

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Classify a batch of token-id sequences.

        Args:
            input_ids: Token ids fed to the embedding layer.
            attention_mask: Optional mask; when given, the hidden state at index
                ``mask.sum(dim=1) - 1`` of each row is used, otherwise the state
                at the last position.
            labels: Optional class ids; when given, cross-entropy loss is returned.
            **kwargs: Accepted and ignored.

        Returns:
            dict with ``"loss"`` (``None`` without labels) and ``"logits"``.
        """
        embedded = self.embedding(input_ids)
        hidden_seq = self.rnn_block(embedded, attention_mask=attention_mask)

        if attention_mask is None:
            pooled = hidden_seq[:, -1, :]
        else:
            # assumes the mask is a prefix of ones (right padding) - TODO confirm
            row_index = torch.arange(input_ids.shape[0], device=embedded.device)
            last_index = attention_mask.sum(dim=1) - 1
            pooled = hidden_seq[row_index, last_index]

        logits = self.classifier(self.dropout(pooled))
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

class TextClassificationRawLSTM(PreTrainedModel):
    """Text classifier: embedding -> LSTMLayer -> dropout -> linear head.

    The forward pass returns a dict with ``"loss"`` and ``"logits"``.
    """

    config_class = LSTMConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Index 0 is declared as the padding row of the embedding table.
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim, padding_idx=0)
        self.lstm_block = LSTMLayer(config.embedding_dim, config.hidden_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_dim, config.n_classes)
        self.loss_fn = nn.CrossEntropyLoss()
        # NOTE(review): both attributes are set to empty lists; presumably they are
        # read by transformers' weight-tying machinery - confirm against the
        # installed transformers version.
        self.all_tied_weights_keys = []
        self._tied_weights_keys = []
        self.post_init()

    @property
    def dummy_inputs(self):
        """Minimal one-row, two-token batch of ids with an all-ones mask."""
        return {"input_ids": torch.tensor([[0, 1]]), "attention_mask": torch.tensor([[1, 1]])}

    def _check_and_adjust_experts_implementation(self, experts_implementation):
        # Returns its argument untouched. NOTE(review): presumably overrides a
        # transformers hook that does not apply to this model - confirm.
        return experts_implementation

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Classify a batch of token-id sequences.

        Args:
            input_ids: Token ids fed to the embedding layer.
            attention_mask: Optional mask; when given, the hidden state at index
                ``mask.sum(dim=1) - 1`` of each row is used, otherwise the state
                at the last position.
            labels: Optional class ids; when given, cross-entropy loss is returned.
            **kwargs: Accepted and ignored.

        Returns:
            dict with ``"loss"`` (``None`` without labels) and ``"logits"``.
        """
        embedded = self.embedding(input_ids)
        hidden_seq = self.lstm_block(embedded, attention_mask=attention_mask)

        if attention_mask is None:
            pooled = hidden_seq[:, -1, :]
        else:
            # assumes the mask is a prefix of ones (right padding) - TODO confirm
            row_index = torch.arange(input_ids.shape[0], device=embedded.device)
            last_index = attention_mask.sum(dim=1) - 1
            pooled = hidden_seq[row_index, last_index]

        logits = self.classifier(self.dropout(pooled))
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

def build_manual_model(model_type, vocab_size, n_classes, embedding_dim=64, hidden_dim=128, dropout=0.2):
    """Instantiate the hand-written classifier selected by ``model_type``.

    Args:
        model_type: ``"rnn"`` or ``"lstm"`` (case-insensitive).
        vocab_size: Embedding table size passed to the config.
        n_classes: Number of output classes passed to the config.
        embedding_dim: Embedding width passed to the config.
        hidden_dim: Recurrent state width passed to the config.
        dropout: Dropout probability passed to the config.

    Returns:
        A ``TextClassificationRawRNN`` or ``TextClassificationRawLSTM`` instance.

    Raises:
        ValueError: If ``model_type`` is neither ``"rnn"`` nor ``"lstm"``.
    """
    hyperparams = {
        "vocab_size": vocab_size,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "n_classes": n_classes,
        "dropout": dropout,
    }
    kind = model_type.lower()

    if kind == "rnn":
        return TextClassificationRawRNN(RNNConfig(**hyperparams))
    if kind == "lstm":
        return TextClassificationRawLSTM(LSTMConfig(**hyperparams))

    raise ValueError("MODEL_TYPE deve ser 'rnn' ou 'lstm'.")

# Echo the architecture currently selected through MODEL_TYPE.
print(f"Arquitetura selecionada: {MODEL_TYPE.upper()}")

## 2) Pipeline de Dados

Utilizamos o dataset do desafio e um tokenizer apenas para mapeamento de vocabulário e padding dinâmico.

In [None]:
# @title Data Pipeline (Emotion CSV)

# Tokenizer loaded from the "distilbert-base-uncased" checkpoint; the collator
# built on it is the one handed to both Trainer instances below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def resolve_train_csv(root: Path) -> Path:
    """Locate ``train_large.csv``.

    The fixed Kaggle input path is tried first, then
    ``root/data/kaggle_emotion_classification/`` and finally ``root`` itself.

    Args:
        root: Project root used for the two local candidates.

    Returns:
        The first candidate path that exists.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    search_order = (
        Path("/kaggle/input/datasets/pablohenriquelemes/emotion-classification-uc14/train_large.csv"),
        root / "data" / "kaggle_emotion_classification" / "train_large.csv",
        root / "train_large.csv",
    )
    found = next((path for path in search_order if path.exists()), None)
    if found is None:
        raise FileNotFoundError("N√£o foi poss√≠vel localizar train_large.csv.")
    return found

# Use the current working directory as project root, or its parent when only
# the parent contains a "data" directory.
project_root = Path.cwd()
if not (project_root / "data").exists() and (project_root.parent / "data").exists():
    project_root = project_root.parent

train_csv = resolve_train_csv(project_root)
print(f"Arquivo de treino: {train_csv}")

# Load the CSV as a single "train" split, encode the "label" column as a class
# column, then hold out 20% (seed 42, stratified on "label"). The held-out part
# is addressed as dataset["test"] and serves as the validation set below.
raw_dataset = load_dataset("csv", data_files={"train": str(train_csv)})["train"]
raw_dataset = raw_dataset.class_encode_column("label")
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")

num_labels = raw_dataset.features["label"].num_classes
print(f"Total de classes: {num_labels}")

# Per-class example counts for both splits; minlength keeps one slot per class
# even when a class is absent from a split.
train_labels_np = np.array(dataset["train"]["label"])
val_labels_np = np.array(dataset["test"]["label"])
train_counts = np.bincount(train_labels_np, minlength=num_labels)
val_counts = np.bincount(val_labels_np, minlength=num_labels)
print("Distribui√ß√£o de classes (train):", train_counts.tolist())
print("Distribui√ß√£o de classes (val):  ", val_counts.tolist())

def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncated to 128 tokens, unpadded.

    Args:
        examples: Mapping with a "text" entry, as passed by ``dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding=False,
        truncation=True,
    )

# Drop every original column except "label", so each split keeps only the label
# plus whatever tokenize_function returns.
columns_to_remove = [col for col in dataset["train"].column_names if col != "label"]
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=columns_to_remove)

## 3) Treinamento e Métricas

Métricas monitoradas por época: Training Loss, Validation Loss, Accuracy, Log Loss, Precision, Recall e F1 (weighted e macro).

In [None]:
# @title Training Setup (Manual RNN/LSTM)

# Backend flags used for the pin-memory setting and the summary print below.
is_cuda = torch.cuda.is_available()
is_mps = torch.backends.mps.is_available()
train_batch_size = 64

# Build the model selected by MODEL_TYPE, sized to the tokenizer vocabulary and
# to the number of classes found in the dataset.
model = build_manual_model(
    model_type=MODEL_TYPE,
    vocab_size=tokenizer.vocab_size,
    n_classes=num_labels,
    embedding_dim=64,
    hidden_dim=128,
    dropout=0.2,
)

# Inverse-frequency class weights: total / count per class (count floored at 1
# to avoid division by zero), then rescaled so the weights average to 1.
train_counts = np.bincount(np.array(tokenized_datasets["train"]["label"]), minlength=num_labels)
class_weights_np = train_counts.sum() / np.maximum(train_counts, 1)
class_weights_np = class_weights_np / class_weights_np.mean()
class_weights = torch.tensor(class_weights_np, dtype=torch.float)
print("Class weights:", class_weights_np.round(4).tolist())

def multiclass_log_loss(logits, labels, eps=1e-15):
    """Mean negative log-probability of the true class, computed from logits.

    Args:
        logits: 2-D NumPy array, one row of class scores per example.
        labels: Integer class index per example.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the log.

    Returns:
        The log loss as a Python float.
    """
    # Softmax with the row maximum subtracted first for numerical stability.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = np.clip(exp_scores / exp_scores.sum(axis=1, keepdims=True), eps, 1.0 - eps)

    # Pick the log-probability of each row's true class; accumulate in float64.
    row_index = np.arange(probs.shape[0])
    true_class_log_prob = np.log(probs)[row_index, labels].astype(np.float64)
    return float(-true_class_log_prob.mean())

# Metric objects are loaded once here and reused by compute_metrics on every
# evaluation pass.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the evaluation metrics reported for each evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        dict with ``accuracy``, ``log_loss`` and precision/recall/F1 under both
        ``weighted`` and ``macro`` averaging.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "log_loss": multiclass_log_loss(logits, labels),
    }

    # Same three metrics under each averaging mode; keys come out as
    # "<metric>_<average>", weighted first and macro second.
    averaged_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for average in ("weighted", "macro"):
        for key, metric in averaged_metrics:
            result = metric.compute(predictions=preds, references=labels, average=average)
            scores[f"{key}_{average}"] = result[key]

    return scores

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        *args: Forwarded to ``Trainer.__init__``.
        class_weights: Optional tensor with one weight per class. ``None`` gives
            plain, unweighted cross-entropy.
        **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        # Copy of class_weights on the device of the logits, refreshed only
        # when that device changes.
        self._cached_cw = None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and compute (optionally class-weighted) cross-entropy.

        Args:
            model: The model to call; it may be a wrapper around the real model.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.class_weights is not None:
            if self._cached_cw is None or self._cached_cw.device != logits.device:
                self._cached_cw = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=self._cached_cw)
        else:
            loss_fct = nn.CrossEntropyLoss()

        # Read the class count from the logits instead of `model.config`: the
        # model handed in here may be wrapped (e.g. nn.DataParallel on multi-GPU
        # machines), and such wrappers do not expose `.config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Checkpoints go to a directory named after the selected architecture.
output_dir = f"./manual_{MODEL_TYPE}_emotion"
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-4,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=max(32, train_batch_size),
    num_train_epochs=5,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints, and
    # reload the one with the lowest eval_log_loss when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_log_loss",
    greater_is_better=False,
    lr_scheduler_type="linear",
    warmup_steps=200,
    weight_decay=0.01,
    max_grad_norm=1.0,
    logging_steps=50,
    dataloader_num_workers=2,
    # Pinned memory is switched off only when the MPS backend is available.
    dataloader_pin_memory=False if is_mps else True,
    remove_unused_columns=True,
    report_to="none",
)

# The held-out "test" split is used as the validation set; training stops early
# after two evaluations without improvement of the monitored metric.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    class_weights=class_weights,
)

print(f"Device profile -> CUDA: {is_cuda} | MPS: {is_mps}")
print(f"‚úÖ Modelo manual criado: {MODEL_TYPE.upper()}")

In [None]:
# @title Start training

# Move the model to the device chosen at the top of the notebook.
# NOTE(review): Trainer presumably also places the model on its own device -
# confirm this explicit move is still needed.
trainer.model.to(device)
print(f"Treinando em: {device}")

# On MPS, release cached memory before training starts.
if device.type == "mps" and hasattr(torch, "mps"):
    torch.mps.empty_cache()

trainer.train()

In [None]:
# @title Save trained model

# Save model and tokenizer side by side so the checkpoint directory can be
# reloaded on its own by the evaluation cells.
save_directory = f"./manual_{MODEL_TYPE}_emotion/final_checkpoint"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)
print(f"Checkpoint salvo em: {save_directory}")

## 4) Avaliação, Inferência e Submissão Kaggle

In [None]:
# @title Load checkpoint treinado

# Re-derive the project root the same way as the data cell: the working
# directory, or its parent when only the parent contains a "data" directory.
project_root = Path.cwd()
if not (project_root / "data").exists() and (project_root.parent / "data").exists():
    project_root = project_root.parent

# Places where the final checkpoint may live, in lookup order.
candidate_dirs = [
    project_root / f"manual_{MODEL_TYPE}_emotion" / "final_checkpoint",
    project_root / "notebooks" / f"manual_{MODEL_TYPE}_emotion" / "final_checkpoint",
]

# Take the first candidate directory that exists.
save_directory = None
for c in candidate_dirs:
    if c.exists():
        save_directory = str(c)
        break

if save_directory is None:
    raise FileNotFoundError("Nenhum checkpoint final encontrado para o modelo manual selecionado.")

print(f"Carregando checkpoint de: {save_directory}")
# Any MODEL_TYPE other than "rnn" falls through to the LSTM class.
if MODEL_TYPE.lower() == "rnn":
    loaded_model = TextClassificationRawRNN.from_pretrained(save_directory)
else:
    loaded_model = TextClassificationRawLSTM.from_pretrained(save_directory)
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [None]:
# @title Relatório completo de validação

# Evaluation-only trainer around the reloaded checkpoint. It is the stock
# Trainer rather than WeightedTrainer.
# NOTE(review): eval_loss here is therefore presumably the model's own
# unweighted cross-entropy, not the class-weighted training loss - confirm.
eval_trainer = Trainer(
    model=loaded_model,
    args=TrainingArguments(output_dir="./eval_output_manual", report_to="none"),
    eval_dataset=tokenized_datasets["test"],
    processing_class=loaded_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

metrics = eval_trainer.evaluate()

# Keys are the compute_metrics names with the "eval_" prefix.
print("\nüìä RELAT√ìRIO DE VALIDA√á√ÉO (MODELO MANUAL)")
print(f"Validation Loss        : {metrics['eval_loss']:.5f}")
print(f"Validation Log Loss    : {metrics['eval_log_loss']:.5f}")
print(f"Validation Accuracy    : {metrics['eval_accuracy']:.2%}")
print(f"Precision (weighted)   : {metrics['eval_precision_weighted']:.2%}")
print(f"Recall (weighted)      : {metrics['eval_recall_weighted']:.2%}")
print(f"F1-score (weighted)    : {metrics['eval_f1_weighted']:.2%}")
print(f"Precision (macro)      : {metrics['eval_precision_macro']:.2%}")
print(f"Recall (macro)         : {metrics['eval_recall_macro']:.2%}")
print(f"F1-score (macro)       : {metrics['eval_f1_macro']:.2%}")

In [None]:
# @title Inferência em texto novo

# Human-readable names for the six class ids.
# NOTE(review): this assumes the ids produced by class_encode_column in the data
# cell follow this exact order; that depends on the label values stored in the
# CSV - confirm before trusting the printed names.
label_map = {0: "Sadness", 1: "Joy", 2: "Love", 3: "Anger", 4: "Fear", 5: "Surprise"}

# Put the reloaded model on the selected device and switch it to eval mode.
loaded_model = loaded_model.to(device)
loaded_model.eval()

def predict_emotion(text):
    """Return the predicted emotion name for a single text.

    Args:
        text: One raw input string.

    Returns:
        The ``label_map`` entry for the class with the highest logit.
    """
    # Truncate to the same 128-token limit used when tokenizing the training data
    # and the submission batches, so single-text inference never feeds the model
    # longer sequences than it was trained on.
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=False, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = loaded_model(**inputs)
        predicted_class_id = torch.argmax(outputs["logits"], dim=-1).item()

    return label_map[predicted_class_id]

# Quick smoke test of predict_emotion on one hand-written sentence.
sample_text = "I am very proud of this IA project result."
print("Texto:", sample_text)
print("Emo√ß√£o prevista:", predict_emotion(sample_text))

In [None]:
# @title Gerar submissão Kaggle

def _resolve_test_csv(root: Path) -> Path:
    kaggle_path = Path("/kaggle/input/datasets/pablohenriquelemes/emotion-classification-uc14/test.csv")
    local_candidates = [
        root / "data" / "kaggle_emotion_classification" / "test.csv",
        root / "test.csv",
    ]
    if kaggle_path.exists():
        return kaggle_path
    for candidate in local_candidates:
        if candidate.exists():
            return candidate
    raise FileNotFoundError("Arquivo de teste n√£o encontrado (test.csv).")

def _load_test_rows(csv_path: Path):
    with csv_path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            raise ValueError("CSV de teste sem cabe√ßalho.")

        fieldnames = [name.strip() for name in reader.fieldnames]
        id_field = "id" if "id" in fieldnames else None
        text_field = "text" if "text" in fieldnames else None

        if text_field is None:
            non_id_fields = [name for name in fieldnames if name != "id"]
            if not non_id_fields:
                raise ValueError("CSV de teste precisa de uma coluna de texto.")
            text_field = non_id_fields[0]

        ids, texts = [], []
        for idx, row in enumerate(reader):
            row_id = row.get(id_field) if id_field else None
            ids.append(row_id if row_id is not None else str(idx))
            texts.append(row.get(text_field, ""))

    return ids, texts

def _batched(items, batch_size):
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# Locate and read the test set.
test_csv = _resolve_test_csv(project_root)
ids, texts = _load_test_rows(test_csv)

loaded_model = loaded_model.to(device)
loaded_model.eval()

all_probs = []
batch_size = 128

# Batched inference without gradients: tokenize each batch (padded within the
# batch, truncated to 128 tokens) and collect the softmax probabilities row by row.
with torch.no_grad():
    for batch_texts in _batched(texts, batch_size):
        inputs = loaded_tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = loaded_model(**inputs)
        probs = torch.softmax(outputs["logits"], dim=-1).cpu().numpy()
        all_probs.extend(probs)

# Sanity checks before writing: no NaN/Inf and every row sums to 1 (atol 1e-6).
all_probs = np.asarray(all_probs, dtype=np.float64)
if np.isnan(all_probs).any() or np.isinf(all_probs).any():
    raise ValueError("Foram encontrados NaN/Inf nas probabilidades.")

row_sums = all_probs.sum(axis=1)
if not np.allclose(row_sums, 1.0, atol=1e-6):
    raise ValueError("As probabilidades n√£o somam 1.0 em todas as linhas.")

# One "prob_<i>" column per class of the loaded model.
expected_num_classes = loaded_model.config.n_classes
submission_class_columns = [f"prob_{i}" for i in range(expected_num_classes)]

# Write "id" plus the probability columns, each value with 8 decimal places.
submission_path = project_root / f"submission_manual_{MODEL_TYPE}.csv"
with submission_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id"] + submission_class_columns)
    for row_id, prob_row in zip(ids, all_probs):
        writer.writerow([row_id] + [f"{p:.8f}" for p in prob_row])

print(f"Arquivo de submiss√£o gerado: {submission_path}")
print(f"Total de linhas: {len(all_probs)}")