In [1]:
import os
import sys

ROOT_PATH = os.getcwd()
if "/kaggle" in ROOT_PATH:
    ROOT_PATH = "/kaggle/input"
    sys.path.append(os.path.join(ROOT_PATH, "map-utilities"))

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset

from utils import (
    stringify_input,
    get_model_name,
    get_sequence_classifier,
    get_tokenizer,
    get_training_arguments,
    get_trainer,
)

In [3]:
EPOCHS = 10
MODEL_NAME = get_model_name("/kaggle" in ROOT_PATH, ROOT_PATH)

TRAIN_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "train.csv")
TEST_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "test.csv")

In [4]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [5]:
print("Training Shape:", train_df.shape)
print("Testing Shape:", test_df.shape)

Training Shape: (36696, 7)
Testing Shape: (3, 5)


In [6]:
train_df.Misconception = train_df.Misconception.fillna("NA")

In [7]:
train_df["predict"] = train_df.Category + ":" + train_df.Misconception

In [8]:
# train_df["is_mc_answer_correct"] = train_df.Category.str.contains("True", case=False)

In [9]:
idx = train_df.Category.str.contains("True", case=False) == True
tmp = train_df.loc[idx].copy()
tmp['c'] = tmp.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
tmp = tmp.sort_values('c',ascending=False)
tmp = tmp.drop_duplicates(['QuestionId'])
tmp = tmp[['QuestionId','MC_Answer']]
tmp['is_mc_answer_correct'] = True

train_df = train_df.merge(tmp, on=['QuestionId','MC_Answer'], how='left')
train_df.is_mc_answer_correct = train_df.is_mc_answer_correct.fillna(False)

test_df = test_df.merge(tmp, on=['QuestionId','MC_Answer'], how='left')
test_df.is_mc_answer_correct = test_df.is_mc_answer_correct.fillna(False)

  train_df.is_mc_answer_correct = train_df.is_mc_answer_correct.fillna(False)
  test_df.is_mc_answer_correct = test_df.is_mc_answer_correct.fillna(False)


In [10]:
train_df["is_student_explanation_correct"] = train_df.Category.str.contains("Correct", case=False)

In [11]:
le = LabelEncoder()

train_df["label"] = le.fit_transform(train_df["predict"])
n_classes = len(le.classes_)
print(f"Train shape: {train_df.shape} with {n_classes} predict classes")

Train shape: (36696, 11) with 65 predict classes


In [12]:
joblib.dump(le, "label_encoder.joblib")

['label_encoder.joblib']

In [13]:
train_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,predict,is_mc_answer_correct,is_student_explanation_correct,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA,True,True,37
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA,True,True,37
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA,True,False,64
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA,True,False,64
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA,True,True,37


In [14]:
model_train_df, model_val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [15]:
model_train_df["stringified_input"] = model_train_df.apply(stringify_input, axis=1)

model_train_df.stringified_input.values[:5]

array(["Math Question: Dots have been arranged in these patterns: [Image: Pattern 1 consists of 6 dots, Pattern 2 consists of 10 dots, Pattern 3 consists of 14 dots and Pattern 4 consists of 18 dots] How many dots would there be in Pattern \\( 6 \\) ?\nStudent's Answer: \\( 26 \\)\nStudent's Explanation: apart from the first it add's 4 every time\nAnswer Correctness: Correct\nExplanation Correctness: Correct\n\nTask: Identify the student's misconception category and specific misconception.",
       "Math Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]\nStudent's Answer: \\( \\frac{3}{9} \\)\nStudent's Explanation: There are 9 triangles and 3 arent shaded so this means it would be 9/3.\nAnswer Correctness: Incorrect\nExplanation Correctness: Incorrect\n\nTask: Identify the student's misconception category and specific misconception.",
       "Math Question: Dots hav

In [16]:
model_val_df.drop(columns=["is_mc_answer_correct", "is_student_explanation_correct"], inplace=True)
model_val_df["stringified_input"] = model_val_df.apply(stringify_input, axis=1)

model_val_df.stringified_input.values[:5]

array(['Math Question: The probability of an event occurring is \\( 0.9 \\).\n\nWhich of the following most accurately describes the likelihood of the event occurring?\nStudent\'s Answer: Likely\nStudent\'s Explanation: i think it\'s b because if it\'s certain, you\'re saying that it will happen 100%;however, the number is 0.9 and it isn\'t "certain" so therefore it would be likely\n\nTask: Identify the student\'s misconception category and specific misconception.',
       "Math Question: Sally has \\( \\frac{2}{3} \\) of a whole cake in the fridge. Robert eats \\( \\frac{1}{3} \\) of this piece. What fraction of the whole cake has Robert eaten?\nChoose the number sentence that would solve the word problem.\nStudent's Answer: \\( \\frac{2}{3}-\\frac{1}{3} \\)\nStudent's Explanation: roberts takes half of the 2/3 so it is -\n\nTask: Identify the student's misconception category and specific misconception.",
       "Math Question: Calculate \\( \\frac{2}{3} \\times 5 \\)\nStudent's Answe

In [17]:
COLUMNS = ["stringified_input", "label"]

train_ds = Dataset.from_pandas(model_train_df[COLUMNS])
val_ds = Dataset.from_pandas(model_val_df[COLUMNS])

In [18]:
seq_model = get_sequence_classifier(MODEL_NAME, n_classes)
tokenizer = get_tokenizer(MODEL_NAME)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def tokenize_function(examples):
    return tokenizer(examples["stringified_input"], truncation=True, padding="max_length", max_length=256)

In [20]:
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)

columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7340 [00:00<?, ? examples/s]

In [21]:
training_args = get_training_arguments()
trainer = get_trainer(
    seq_model,
    tokenizer,
    training_args,
    train_ds,
    val_ds,
)

  return Trainer(


In [None]:
trainer.train()



Step,Training Loss,Validation Loss


In [None]:
test_df.head()

In [None]:
test_df["stringified_input"] = test_df.apply(stringify_input, axis=1)
test_df.head()

In [None]:
test_ds = Dataset.from_pandas(test_df[["stringified_input"]])
test_ds = test_ds.map(tokenize_function, batched=True)

In [None]:
predictions = trainer.predict(test_ds)
probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

In [None]:
top3 = np.argsort(-probs, axis=1)[:, :3]

# Decode numeric class indices to original string labels
flat_top3 = top3.flatten()
decoded_labels = le.inverse_transform(flat_top3)
top3_labels = decoded_labels.reshape(top3.shape)

# Join 3 labels per row with space
joined_preds = [" ".join(row) for row in top3_labels]

# Save submission
sub = pd.DataFrame({
    "row_id": test_df.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission.csv", index=False)
sub.head()