In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

2025-07-22 04:23:06.538486: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753158186.721547      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753158186.771108      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
EPOCHS = 25
MODEL_NAME = "microsoft/deberta-v3-small"

# ROOT_PATH = os.getcwd()
ROOT_PATH = "/kaggle/input"

TRAIN_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "train.csv")
TEST_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "test.csv")

In [3]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [4]:
train_df.Misconception = train_df.Misconception.fillna("NA")

In [5]:
train_df["predict"] = train_df.Category + ":" + train_df.Misconception

In [6]:
train_df["is_mc_answer_correct"] = train_df.Category.str.contains("True", case=False)

In [7]:
train_df["is_student_explanation_correct"] = train_df.Category.str.contains("Correct", case=False)

In [8]:
le = LabelEncoder()

train_df["label"] = le.fit_transform(train_df["predict"])
n_classes = len(le.classes_)
print(f"Train shape: {train_df.shape} with {n_classes} predict classes")

Train shape: (36696, 11) with 65 predict classes


In [9]:
train_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,predict,is_mc_answer_correct,is_student_explanation_correct,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA,True,True,37
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA,True,True,37
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA,True,False,64
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA,True,False,64
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA,True,True,37


In [10]:
def stringify_input(row):
    output = (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Student Explanation: {row['StudentExplanation']}\n\n"
    )
    if "is_mc_answer_correct" in row:
        output += f"Is the student's answer correct? {row['is_mc_answer_correct']}\n"
    if "is_student_explanation_correct" in row:
        output += f"Is the student's explanation correct? {row['is_student_explanation_correct']}\n"
    return output.strip()


train_df["stringified_input"] = train_df.apply(stringify_input, axis=1)

train_df.stringified_input.values[:5]

array(["Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]\nAnswer: \\( \\frac{1}{3} \\)\nStudent Explanation: 0ne third is equal to tree nineth\n\nIs the student's answer correct? True\nIs the student's explanation correct? True",
       "Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]\nAnswer: \\( \\frac{1}{3} \\)\nStudent Explanation: 1 / 3 because 6 over 9 is 2 thirds and 1 third is not shaded.\n\nIs the student's answer correct? True\nIs the student's explanation correct? True",
       "Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]\nAnswer: \\( \\frac{1}{3} \\)\nStudent Explanation: 1 3rd is half of 3 6th, so it is simp

In [11]:
model_train_df, model_val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [12]:
COLUMNS = ["stringified_input", "label"]

train_ds = Dataset.from_pandas(model_train_df[COLUMNS])
val_ds = Dataset.from_pandas(model_val_df[COLUMNS])

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

seq_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=n_classes)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def tokenize_function(examples):
    return tokenizer(examples["stringified_input"], truncation=True, padding="max_length", max_length=256)

In [15]:
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)

columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Map:   0%|          | 0/7340 [00:00<?, ? examples/s]

In [16]:
def compute_map3(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

    top3 = np.argsort(-probs, axis=1)[:, :3]  # Top 3 predictions
    match = (top3 == labels[:, None])

    # Compute MAP@3 manually
    map3 = 0
    for i in range(len(labels)):
        if match[i, 0]:
            map3 += 1.0
        elif match[i, 1]:
            map3 += 1.0 / 2
        elif match[i, 2]:
            map3 += 1.0 / 3
    return {"map@3": map3 / len(labels)}

In [17]:
training_args = TrainingArguments(
    output_dir = "./output",
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    save_strategy="steps", #no for no saving
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=1,
    metric_for_best_model="map@3",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="none",
    # use_mps_device=True,  # Use MPS for Apple Silicon
    bf16=True, # TRAIN WITH BF16 IF LOCAL GPU IS NEWER GPU
    # fp16=True, # INFER WITH FP16 BECAUSE KAGGLE IS T4 GPU
)

In [18]:
trainer = Trainer(
    model=seq_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_map3,
)

  trainer = Trainer(


In [19]:
trainer.train()

Step,Training Loss,Validation Loss,Map@3
200,0.9701,0.916413,0.79723
400,0.571,0.543048,0.885014
600,0.4459,0.42593,0.913601
800,0.3633,0.352295,0.933856
1000,0.3318,0.314398,0.936807
1200,0.2897,0.295605,0.94337
1400,0.2918,0.294979,0.945095
1600,0.253,0.269279,0.950295
1800,0.2611,0.271849,0.947956
2000,0.2119,0.264307,0.951067


TrainOutput(global_step=11475, training_loss=0.1420377886606977, metrics={'train_runtime': 15574.0664, 'train_samples_per_second': 47.123, 'train_steps_per_second': 0.737, 'total_flos': 4.86652562932224e+16, 'train_loss': 0.1420377886606977, 'epoch': 25.0})

In [20]:
test_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation
0,36696,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"I think that 1/3 is the answer, as it's the si..."
1,36697,31772,What fraction of the shape is not shaded? Give...,\( \frac{3}{6} \),i think this answer is because 3 triangles are...
2,36698,32835,Which number is the greatest?,\( 6.2 \),because the 2 makes it higher than the others.


In [21]:
test_df["stringified_input"] = test_df.apply(stringify_input, axis=1)
test_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,stringified_input
0,36696,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"I think that 1/3 is the answer, as it's the si...",Question: What fraction of the shape is not sh...
1,36697,31772,What fraction of the shape is not shaded? Give...,\( \frac{3}{6} \),i think this answer is because 3 triangles are...,Question: What fraction of the shape is not sh...
2,36698,32835,Which number is the greatest?,\( 6.2 \),because the 2 makes it higher than the others.,Question: Which number is the greatest?\nAnswe...


In [22]:
test_ds = Dataset.from_pandas(test_df[["stringified_input"]])
test_ds = test_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [23]:
predictions = trainer.predict(test_ds)
probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

In [24]:
top3 = np.argsort(-probs, axis=1)[:, :3]

# Decode numeric class indices to original string labels
flat_top3 = top3.flatten()
decoded_labels = le.inverse_transform(flat_top3)
top3_labels = decoded_labels.reshape(top3.shape)

# Join 3 labels per row with space
joined_preds = [" ".join(row) for row in top3_labels]

# Save submission
sub = pd.DataFrame({
    "row_id": test_df.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,row_id,Category:Misconception
0,36696,False_Neither:NA False_Misconception:Incomplet...
1,36697,False_Misconception:WNB False_Neither:NA False...
2,36698,False_Neither:NA False_Misconception:Longer_is...
