# MAP - Charting Student Math Misunderstandings

### Install Requirements

## Imports and Data Loading

In [1]:
# Import basic libraries
import pandas as pd  
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as plt
import io
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score, precision_score, recall_score, auc, roc_curve, roc_auc_score, precision_recall_fscore_support
import joblib


# Import tensorflow, and keras modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout, Dense, LSTM, Input, Masking, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

# Import transformers and pytorch modules
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, TrainingArguments, Trainer,  DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorForLanguageModeling
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, Trainer, TrainingArguments
from datasets import Dataset
from transformers import RobertaForSequenceClassification,DataCollatorForLanguageModeling
from transformers import DistilBertForSequenceClassification, DataCollatorForLanguageModeling
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from datasets import load_dataset, Dataset as HFDataset, DatasetDict
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, Trainer, TrainingArguments
from datasets import Dataset
from transformers import RobertaForSequenceClassification,DataCollatorForLanguageModeling
from transformers import DistilBertForSequenceClassification, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
from tqdm import tqdm
import os

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

2025-09-04 08:48:24.991097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756975705.226918      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756975705.296545      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the competition training set only; test.csv is read later, in the inference cell.
data = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')

In [3]:
# Count rows flagged as duplicates in the raw training frame; the cell displays the total.
data.duplicated().sum()

0

In [4]:
# Replace missing `Misconception` values with the literal string 'NA' so that every row
# yields a well-formed "Category:Misconception" label when the two columns are combined.
data['Misconception'] = data['Misconception'].fillna('NA')

In [5]:
# Build the combined "Category:Misconception" target used by the rest of the notebook.
data['combined_label'] = data['Category'].astype(str) + ':' + data['Misconception'].astype(str)

# Integer-encode every combined label present in the raw data (no rare-class filtering yet).
cat = pd.Categorical(data['combined_label'])
y = cat.codes
num_classes = len(cat.categories)
label_to_original = dict(enumerate(cat.categories))

print(f" Number of classes: {num_classes}")
print(f" Label range: 0 to {num_classes - 1}")

# Save mapping
# NOTE: `y`, `num_classes`, `label_to_original` and this pickle are all recomputed and
# overwritten further down, once labels with a single sample have been dropped.
joblib.dump(label_to_original, "label_to_original.pkl")

 Number of classes: 65
 Label range: 0 to 64


['label_to_original.pkl']

In [6]:
# Keep only combined labels that occur at least twice; labels seen a single time are dropped.
# (Presumably so a stratified split stays possible -- see the commented-out split cell; confirm.)
label_counts = data['combined_label'].value_counts()
classes_to_keep = label_counts[label_counts >= 2].index
train = data[data['combined_label'].isin(classes_to_keep)]
train.head(2)

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,combined_label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA


In [7]:
# Print column dtypes and non-null counts of the filtered training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36691 entries, 0 to 36695
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              36691 non-null  int64 
 1   QuestionId          36691 non-null  int64 
 2   QuestionText        36691 non-null  object
 3   MC_Answer           36691 non-null  object
 4   StudentExplanation  36691 non-null  object
 5   Category            36691 non-null  object
 6   Misconception       36691 non-null  object
 7   combined_label      36691 non-null  object
dtypes: int64(2), object(6)
memory usage: 2.5+ MB


In [8]:
# Integer-encode the combined labels of the filtered training frame
le = LabelEncoder().fit(train['combined_label'])
y = le.transform(train['combined_label'])
num_classes = le.classes_.shape[0]

# Persist the id -> label lookup; the inference cell reloads it from this file
label_to_original = {class_id: label for class_id, label in enumerate(le.classes_)}
joblib.dump(label_to_original, "label_to_original.pkl")

print(f"Number of misconception classes: {num_classes}")
print(f"Label range: 0 to {num_classes-1}")

Number of misconception classes: 60
Label range: 0 to 59


In [9]:
# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [10]:
# Work on an independent copy so later column additions do not touch the filtered `train` slice.
train_data = train.copy()

In [11]:
# # Pretrain the scibert transformer

# # Create pretraining corpus 
# def create_pretrain_text(row):
#     return (
#         # f"[CLS] [question id] {row['QuestionId']} "
#         f"[CLC] [question] {row['QuestionText']}"
#         f"[answer] {row['MC_Answer']} [SEP]"
#         f"[explanation] {row['StudentExplanation']} [SEP]"
#         f"[category] {row['Category']} [SEP]"
#         f"[misconception] {row['Misconception']} [SEP]"
#     )

# pretrain_corpus = train_data.apply(create_pretrain_text, axis=1).tolist()

# # Create a Hugging Face Dataset
# dataset = Dataset.from_dict({'text': pretrain_corpus}) 

# # Load tokenizer and model
# scibert_base = '/kaggle/input/scibert-base-offline/transformers/default/1/scibert-base-offline'
# # scibert_base = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(scibert_base)
# model = AutoModelForMaskedLM.from_pretrained(scibert_base).to(device)

# # Tokenize the dataset (fixed length: every example is padded/truncated to max_length=256)
# def tokenize_function(examples):
#     return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)

# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
# tokenized_dataset.set_format('torch')  

# # Data collator for MLM 
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=True,
#     mlm_probability=0.20 
# )

# # Training arguments with optimizations
# training_args = TrainingArguments(
#     output_dir='./scibert-pretrain',
#     overwrite_output_dir=True,
#     num_train_epochs=5,
#     per_device_train_batch_size=16,
#     gradient_accumulation_steps=2,
#     save_steps=5_000,
#     save_total_limit=2,
#     logging_dir='./logs',
#     logging_steps=500,  
#     report_to=[],
#     learning_rate=2e-5,
#     warmup_steps=500,
#     fp16=True,
# )

# # Trainer with optimized settings
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     data_collator=data_collator,
# )

# # Start pretraining
# trainer.train()

# # Save the pretrained model
# model.save_pretrained('./scibert-pretrained')
# tokenizer.save_pretrained('./scibert-pretrained')

In [12]:
# Create finetuning corpus
def create_finetune_text(row):
    """Render one record as the single input string fed to the classifier.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'QuestionText', 'MC_Answer' and 'StudentExplanation'.

    Returns:
        str: the question, answer and explanation segments concatenated in
        that order, each introduced by its bracketed tag.
    """
    # A "[question id]" segment built from row['QuestionId'] used to precede
    # the question; it is intentionally left out of the current format.
    question_part = f"[CLS] [question] {row['QuestionText']}"
    answer_part = f"[answer] {row['MC_Answer']} [SEP]"
    explanation_part = f"[explanation] {row['StudentExplanation']} [SEP]"

    # NOTE(review): segments are joined with no separator, so the question text
    # runs straight into "[answer]", and "[CLS]" is written literally here.
    # Presumably the saved checkpoint was trained on exactly this format, so it
    # must not be altered without retraining -- confirm.
    return question_part + answer_part + explanation_part

# train_data['train_text'] = train_data.apply(create_finetune_text, axis=1)

In [13]:
# train_data.head(2)

In [14]:
# Train-test split
# train_texts, val_texts, train_labels, val_labels = train_test_split(train_data['text'].values, y, test_size=0.3, random_state=42, stratify=y)

## Finetune the SciBERT Transformer Model

In [15]:
# # Set device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f"Using device: {device}")
# if device.type == 'cuda':
#     print(f"GPU Name: {torch.cuda.get_device_name(0)}")

In [16]:
# # Load the tokenizer saved with the pretrained SciBERT checkpoint
# tokenizer = AutoTokenizer.from_pretrained("./scibert-pretrained")
# MAX_LEN = 256

In [17]:
# # Define custom dataset class
# class MathMisconceptionDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_len):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_len = max_len
        
#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, idx):
#         text = str(self.texts[idx])
#         label = self.labels[idx]
        
#         # Tokenize the text 
#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,      
#             max_length=self.max_len,      
#             return_token_type_ids=False,  
#             padding='max_length',         
#             truncation=True,              
#             return_attention_mask=True,   
#             return_tensors='pt',         
#         )
        
#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'label': torch.tensor(label, dtype=torch.long)
#         }

# # Create dataset instances for training and validation
# train_dataset = MathMisconceptionDataset(train_texts, train_labels, tokenizer, MAX_LEN)
# val_dataset = MathMisconceptionDataset(val_texts, val_labels, tokenizer, MAX_LEN)

In [18]:
# # Data Loading

# BATCH_SIZE = 16 

# # Create data loaders (train set)
# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=True,      
#     num_workers=2,     
#     pin_memory=True)

# # Create data loaders (validation set)
# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=False,     
#     num_workers=2,
#     pin_memory=True)


In [19]:
# # Configure model
# model = AutoModelForSequenceClassification.from_pretrained(
#     "./scibert-pretrained",
#     num_labels=num_classes,  
#     output_attentions=False, 
#     output_hidden_states=False)

# model = model.to(device)

# # Configure optimizer
# optimizer = optim.AdamW(
#     model.parameters(),
#     lr=2e-5,          
#     eps=1e-8,
#     weight_decay=0.01,
# )

# # Define training parameters
# EPOCHS = 4            
# WARMUP_STEPS = 0      

# # Calculate total training steps 
# total_steps = len(train_dataloader) * EPOCHS

# # Create learning rate scheduler
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=WARMUP_STEPS,
#     num_training_steps=total_steps)

In [20]:
# # Define map3 metric
# def map_at_3(predictions, true_labels):
    
#     # Get top-3 predicted class indices for each sample in descending order
#     top_3_pred = np.argsort(predictions, axis=1)[:, -3:][:, ::-1]

#     # List average precisions 
#     aps = []  
#     for i in range(len(true_labels)):
#         true_label = true_labels[i]
#         preds = top_3_pred[i]
#         hits = (preds == true_label)
#         if not np.any(hits):
#             aps.append(0.0)  
#         else:
#             rank = np.where(hits)[0][0] + 1
#             precision_at_k = 1.0 / rank
#             aps.append(precision_at_k)
            
#     return np.mean(aps)

In [21]:
# # Define training function
# def train_epoch(model, dataloader, optimizer, scheduler, device):
#     model.train()
#     total_loss = 0
#     all_preds = []
#     all_labels = []

#     # Process batches with progress bar
#     for batch in tqdm(dataloader, desc="Training"):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)
        
#         # Clear previous gradients
#         model.zero_grad()
        
#         # Forward pass 
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels
#         )
        
#         loss = outputs.loss
#         logits = outputs.logits
        
#         # Backward pass 
#         loss.backward()
        
#         # Clip gradients to prevent exploding gradients
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
#         # Update model parameters
#         optimizer.step()
        
#         # Update learning rate
#         scheduler.step()
        
#         total_loss += loss.item()
        
#         # Store predictions and labels for metric calculation
#         logits = logits.detach().cpu().numpy()
#         label_ids = labels.cpu().numpy()
        
#         all_preds.extend(logits)
#         all_labels.extend(label_ids)
    
#     # Calculate metrics
#     avg_loss = total_loss / len(dataloader)
#     train_map3 = map_at_3(np.array(all_preds), np.array(all_labels))
    
#     return avg_loss, train_map3

In [22]:
# # Define evaluation function
# def eval_model(model, dataloader, device):
#     model.eval()  
#     total_loss = 0
#     all_preds = []
#     all_labels = []
    
#     # Disable gradient calculation 
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['label'].to(device)
            
#             # Forward pass
#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask,
#                 labels=labels
#             )
            
#             loss = outputs.loss
#             logits = outputs.logits
            
#             total_loss += loss.item()
            
#             # Store predictions and labels for metric calculation
#             logits = logits.detach().cpu().numpy()
#             label_ids = labels.cpu().numpy()
            
#             all_preds.extend(logits)
#             all_labels.extend(label_ids)
    
#     # Calculate metrics
#     avg_loss = total_loss / len(dataloader)
#     val_map3 = map_at_3(np.array(all_preds), np.array(all_labels))
    
#     return avg_loss, val_map3

In [23]:
# # Train model
# from tqdm import tqdm
# best_val_map3 = 0
# for epoch in range(EPOCHS):
#     print(f"\nEpoch {epoch + 1}/{EPOCHS}")
#     print('-' * 30)
    
#     train_loss, train_map3 = train_epoch(
#         model,
#         train_dataloader,
#         optimizer,
#         scheduler,
#         device)
    
#     print(f"Train Loss: {train_loss:.4f} | Train MAP@3: {train_map3:.4f}")
    
#     # Evaluate model on validation set
#     val_loss, val_map3 = eval_model(
#         model,
#         val_dataloader,
#         device)
    
#     print(f"Validation Loss: {val_loss:.4f} | Validation MAP@3: {val_map3:.4f}")
    
#     # Save the best model based on validation MAP@3
#     if val_map3 > best_val_map3:
#         best_val_map3 = val_map3
#         # Save the model and tokenizer
#         model.save_pretrained("./scibert-finetuned")
#         tokenizer.save_pretrained("./scibert-finetuned")
#         print(f"Saved improved model with Validation MAP@3: {val_map3:.4f}")

# print("\nFine-tuning completed.")
# print(f"Best Validation MAP@3 achieved: {best_val_map3:.4f}")

In [24]:
# # Zip the fine-tuned SciBERT model directory for download
# import zipfile

# def zip_model_directory():
#     zip_name = "sci_bert_misconception.zip"
#     model_dir = "./sci_bert_misconception"
    
#     # Create zip file
#     with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
#         for root, _, files in os.walk(model_dir):
#             for file in files:
#                 file_path = os.path.join(root, file)
#                 arcname = os.path.relpath(file_path, model_dir)
#                 zipf.write(file_path, arcname)
    
#     print(f"\nModel successfully zipped as '{zip_name}'")
#     print(f"Total size: {os.path.getsize(zip_name) / (1024 * 1024):.2f} MB")
    
#     return zip_name

# # Create and display download link 
# try:
#     from IPython.display import FileLink, display
#     zip_file = zip_model_directory()
#     print("\nClick the link below to download the fine-tuned model:")
#     display(FileLink(zip_file))
# except:
#     zip_file = zip_model_directory()

## Make Predictions on Test Set Using Finetuned Model

In [25]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import joblib

# Set device for computation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the fine-tuned model and tokenizer (both are stored in the same checkpoint directory)
MODEL_DIR = '/kaggle/input/scibert-pre-finetuned-0.9350/transformers/default/1/scibert-pre-finetuned 0.9350'
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
model.eval()  # inference mode: disables dropout
print("Fine-tuned sciBERT model loaded successfully")

# Load label mapping
label_to_original = joblib.load("label_to_original.pkl")
num_classes = len(label_to_original)
print(f"Number of classes: {num_classes}")

# The id -> label mapping is regenerated from train.csv on every run, whereas the
# checkpoint was trained separately. If the two disagree on the number of classes,
# predictions would be decoded to the wrong labels (or fail later with a KeyError),
# so stop here with an explicit message instead.
if model.config.num_labels != num_classes:
    raise ValueError(
        f"Checkpoint has {model.config.num_labels} output labels but "
        f"label_to_original.pkl maps {num_classes} classes; "
        "the label mapping does not match the fine-tuned model."
    )

# Pre-process test dataset: build the same text format the model is fed elsewhere in the notebook
test_df = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/test.csv")
test_df['text_test'] = test_df.apply(create_finetune_text, axis=1)
texts = test_df['text_test'].astype(str).tolist()

# Define dataset class for inference
class InferenceDataset(Dataset):
    """Map-style dataset that tokenizes one text per item (no labels).

    Args:
        texts: Iterable of inputs; every element is converted with ``str()``.
        tokenizer: Callable tokenizer invoked once per item; its result must
            expose 'input_ids' and 'attention_mask' entries supporting ``.flatten()``.
        max_len: Length passed to the tokenizer as ``max_length``; items are
            padded and truncated to it.
    """

    def __init__(self, texts, tokenizer, max_len):
        # Coerce up front so non-string entries never reach the tokenizer
        self.texts = list(map(str, texts))
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask', each flattened to 1-D.

        Raises:
            IndexError: if ``idx`` is negative or not smaller than the dataset length.
        """
        if not (0 <= idx < len(self.texts)):
            raise IndexError(f"Index {idx} is out of range for dataset with length {len(self.texts)}")

        # Tokenize; return_tensors='pt' yields a leading batch axis of size 1
        tokenized = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Drop the batch axis so the DataLoader can stack items itself
        return {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

# Parameters
MAX_LEN = 256     # token length every text is padded/truncated to
BATCH_SIZE = 16   # number of texts per forward pass

# Create dataset and dataloader
inference_dataset = InferenceDataset(
    texts=texts,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

inference_dataloader = DataLoader(
    inference_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,      # keep row order so predictions line up with test_df['row_id']
    num_workers=0,
    # Pinned (page-locked) host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only session is useless and makes PyTorch emit a warning, so enable it
    # only when CUDA is actually available.
    pin_memory=torch.cuda.is_available()
)

# Prediction function
def predict_with_scibert(model, dataloader, device):
    """Run the classifier over every batch and collect softmax probabilities.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute; must
            also provide ``eval()``.
        dataloader: Iterable of dicts holding 'input_ids' and 'attention_mask' tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        np.ndarray: one row of class probabilities per input example, in
        iteration order (an empty array when the dataloader yields nothing).
    """
    model.eval()
    batch_probs = []

    # Gradients are not needed for inference
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Softmax across the class axis, then bring the batch back to host memory
            batch_probs.append(F.softmax(outputs.logits, dim=1).cpu().numpy())

    # Flatten the per-batch arrays into one row per example
    return np.array([row for chunk in batch_probs for row in chunk])

# Run inference
pred_proba = predict_with_scibert(model, inference_dataloader, device)

# Decode predictions: for each row take the three most probable class ids
# (most probable first) and join their original labels with single spaces
submission_strings = [
    " ".join(label_to_original[class_id] for class_id in np.argsort(class_probs)[-3:][::-1])
    for class_probs in pred_proba
]

# Create submission file
submission_df = pd.DataFrame(
    {'row_id': test_df['row_id'], 'Category:Misconception': submission_strings}
)

# Save to CSV
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved: submission.csv")

Using device: cpu
Fine-tuned sciBERT model loaded successfully
Number of classes: 60
Submission file saved: submission.csv


In [26]:
# Preview the first rows of the submission: one row_id with its three space-separated labels.
submission_df.head()

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Correct:NA
1,36697,False_Misconception:WNB False_Neither:NA False...
2,36698,True_Neither:NA True_Correct:NA True_Misconcep...
