In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

In [None]:
# Training length and the pretrained checkpoint to fine-tune.
EPOCHS = 1
MODEL_NAME = "microsoft/deberta-v3-small"

# The competition CSVs are looked up in a folder under the current working directory.
ROOT_PATH = os.getcwd()

TRAIN_PATH, TEST_PATH = (
    os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Read the training rows and the test rows from their CSV files.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [None]:
# Replace missing misconceptions with the literal "NA" so they can be concatenated into the target.
train_df["Misconception"] = train_df["Misconception"].fillna("NA")

In [None]:
# Prediction target: "<Category>:<Misconception>".
train_df["predict"] = train_df["Category"].add(":").add(train_df["Misconception"])

In [None]:
# True when Category contains "True" (case-insensitive).
category_text = train_df["Category"].str
train_df["is_mc_answer_correct"] = category_text.contains("True", case=False)

In [None]:
# True when Category contains "Correct" (case-insensitive).
# NOTE(review): this flag is read off Category, which is also part of the
# prediction target, so it cannot be computed for rows that have no Category.
category_text = train_df["Category"].str
train_df["is_student_explanation_correct"] = category_text.contains("Correct", case=False)

In [None]:
# Learn an integer id for every distinct "Category:Misconception" string,
# then store the encoded target next to the original columns.
le = LabelEncoder().fit(train_df["predict"])
train_df["label"] = le.transform(train_df["predict"])

n_classes = len(le.classes_)
print(f"Train shape: {train_df.shape} with {n_classes} predict classes")

In [None]:
# Preview the first rows, including the derived columns added above.
train_df.head()

In [None]:
def stringify_input(row):
    """Render one row as the text prompt fed to the classifier.

    Args:
        row: Mapping-like row (for example a DataFrame row) providing the keys
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation`` and
            ``is_mc_answer_correct``.

    Returns:
        A newline-delimited string containing the question, the chosen answer,
        the student's explanation and whether the chosen answer is correct.

    Note:
        ``is_student_explanation_correct`` is deliberately not part of the
        prompt. It is derived from ``Category``, which is itself part of the
        label being predicted, so including it leaks the target into the input
        and the value is not available for unlabelled rows.
    """
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Student Explanation: {row['StudentExplanation']}\n\n"
        f"Is the student's answer correct? {row['is_mc_answer_correct']}\n"
    )

# Build the prompt text for every training row.
stringified_rows = train_df.apply(stringify_input, axis="columns")
train_df["stringified_input"] = stringified_rows

# Show the first five prompts as a quick sanity check.
train_df["stringified_input"].to_numpy()[:5]

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state makes the split repeatable.
model_train_df, model_val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
# Only the prompt text and the encoded target are handed to the datasets.
COLUMNS = ["stringified_input", "label"]

train_ds, val_ds = (
    Dataset.from_pandas(split_df[COLUMNS])
    for split_df in (model_train_df, model_val_df)
)

In [None]:
# Load the tokenizer for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the same checkpoint for sequence classification with one output per encoded label class.
seq_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=n_classes)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of prompts with fixed-length settings.

    Args:
        examples: Batch mapping that provides the prompt texts under the
            ``"stringified_input"`` key.

    Returns:
        The tokenizer's output for those texts, called with truncation enabled,
        ``padding="max_length"`` and ``max_length=256``.
    """
    prompts = examples["stringified_input"]
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(prompts, **encoding_options)

In [None]:
# Columns exposed as torch tensors once a split has been tokenized.
columns = ['input_ids', 'attention_mask', 'label']

# Training split: tokenize in batches, then switch the output format.
train_ds = train_ds.map(tokenize_function, batched=True)
train_ds.set_format(type='torch', columns=columns)

# Validation split: same treatment.
val_ds = val_ds.map(tokenize_function, batched=True)
val_ds.set_format(type='torch', columns=columns)

In [None]:
def compute_map3(eval_pred):
    """Compute MAP@3 for single-label classification.

    Each example scores 1, 1/2 or 1/3 when its true label is ranked first,
    second or third by the model, and 0 otherwise; the metric is the mean of
    those scores.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` is a 2-D array of
            per-class scores (one row per example) and ``labels`` is a 1-D
            array of true class indices.

    Returns:
        ``{"map@3": value}`` with ``value`` as a plain Python float. Empty
        input yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    n_examples = labels.shape[0]
    if n_examples == 0:
        return {"map@3": 0.0}

    # Softmax is monotonic, so ranking the raw logits gives the same top-3.
    top3 = np.argsort(-logits, axis=1)[:, :3]

    # argsort returns distinct class indices per row, so at most one of the
    # (up to) three positions can match the true label.
    hits = top3 == labels[:, None]
    reciprocal_ranks = 1.0 / np.arange(1, top3.shape[1] + 1)

    return {"map@3": float((hits * reciprocal_ranks).sum() / n_examples)}

In [None]:
# Choose the mixed-precision mode from the available hardware instead of
# hard-coding bf16: bf16 on CUDA GPUs that support it, fp16 on other CUDA GPUs
# (such as a T4), and full precision when no CUDA device is present.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir="./output",
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint on the same step schedule so the best checkpoint
    # (by MAP@3) can be reloaded when training ends.
    eval_strategy="steps",
    save_strategy="steps",  # "no" disables saving
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16 * 2,
    per_device_eval_batch_size=32 * 2,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=1,
    metric_for_best_model="map@3",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="none",
    # use_mps_device=True,  # Use MPS for Apple Silicon
    bf16=use_bf16,
    fp16=use_fp16,
)

In [None]:
# Wire the model, training configuration, tokenized splits and the MAP@3
# metric into a single Trainer.
# NOTE(review): newer transformers releases may prefer `processing_class` over
# `tokenizer` here — confirm against the installed version.
trainer = Trainer(
    model=seq_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_map3,
)

In [None]:
# Fine-tune the model using the arguments configured in `training_args`.
trainer.train()

In [None]:
# Preview the raw test rows before any derived columns are attached.
test_df.head()

In [None]:
# Attach the flags that `stringify_input` reads to each test row, looked up by
# (QuestionText, MC_Answer) from the training data.
merge_keys = ["QuestionText", "MC_Answer"]
flag_columns = ["is_mc_answer_correct", "is_student_explanation_correct"]

# Collapse train_df to ONE row per (question, answer) pair before merging.
# Merging the raw frame would repeat a test row once for every training row
# sharing its pair, producing duplicate row_ids in the submission. Each flag
# takes the most frequent value observed for the pair.
# NOTE(review): is_student_explanation_correct is derived from the training
# label, so for test rows this per-pair majority is only a proxy.
flag_lookup = (
    train_df.groupby(merge_keys, as_index=False)[flag_columns]
    .agg(lambda flags: flags.mode(dropna=False).iloc[0])
)

n_test_rows = len(test_df)
test_df = (
    test_df
    # Dropping stale flag columns keeps the cell re-runnable (no _x/_y suffixes).
    .drop(columns=flag_columns, errors="ignore")
    # validate= makes pandas raise if the lookup is not unique per key pair.
    .merge(flag_lookup, how="left", on=merge_keys, validate="many_to_one")
)
assert len(test_df) == n_test_rows, "merge must not add or drop test rows"

# Pairs that never occur in train_df keep NaN flags after the left merge.
test_df["stringified_input"] = test_df.apply(stringify_input, axis=1)
test_df.head()

In [None]:
# Wrap the test prompts in a dataset and tokenize them with the same function used for training.
test_prompts = test_df[["stringified_input"]]
test_ds = Dataset.from_pandas(test_prompts).map(tokenize_function, batched=True)

In [None]:
# Run inference on the test set, then turn the raw class scores into probabilities.
predictions = trainer.predict(test_ds)

test_logits = torch.as_tensor(predictions.predictions)
probs = torch.softmax(test_logits, dim=1).numpy()

In [None]:
# Keep the three highest-probability class indices per row, best first.
top3 = np.argsort(-probs, axis=1)[:, :3]

# Turn class indices back into their original string labels. The encoder works
# on 1-D input, so flatten first and restore top3's shape afterwards.
flat_top3 = top3.reshape(-1)
decoded_labels = le.inverse_transform(flat_top3)
top3_labels = decoded_labels.reshape(top3.shape)

# One space-separated string of labels per test row.
joined_preds = list(map(" ".join, top3_labels))

# Assemble and write the submission file.
submission_columns = {
    "row_id": test_df["row_id"].to_numpy(),
    "Category:Misconception": joined_preds,
}
sub = pd.DataFrame(submission_columns)
sub.to_csv("submission.csv", index=False)
sub.head()