<a href="https://colab.research.google.com/github/marty916/AI-Training-Colab-Notebooks/blob/main/Eedi_Mining_Misconceptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers sentence-transformers datasets

# Import libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from sentence_transformers import SentencesDataset
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import logging
import re

# Set up logging
logging.basicConfig(level=logging.INFO)


In [None]:
# Load data: read the four competition CSV files from the working directory
train_df, test_df, misconception_df, sample_submission_df = [
    pd.read_csv(csv_name)
    for csv_name in (
        'train.csv',
        'test.csv',
        'misconception_mapping.csv',
        'sample_submission.csv',
    )
]


In [None]:
# Data cleaning and preprocessing
def preprocess_text(text):
    """
    Preprocess the text by replacing mathematical symbols and handling special characters.

    Parameters:
    text: The text to preprocess. Missing values (None/NaN) yield an empty
        string. Non-string scalars (for example a numeric answer that pandas
        parsed as a number) are converted with str() before cleaning instead
        of raising a TypeError.

    Returns:
    str: The preprocessed text.
    """
    if pd.isna(text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Replace mathematical symbols with words (plain literal replacement,
    # no regular expression needed for single characters)
    symbol_words = (
        ('÷', ' division '),
        ('×', ' multiplication '),
        ('−', ' minus '),
        ('√', ' square root '),
        ('π', ' pi '),
        ('∠', ' angle '),
    )
    for symbol, word in symbol_words:
        text = text.replace(symbol, word)

    # Remove special characters.
    # NOTE(review): this strips every non-word, non-space character, which
    # includes decimal points, the ASCII '-' sign, '+', '=' and LaTeX
    # delimiters ("0.5" becomes "05"). Kept as-is to preserve the existing
    # text format; confirm this loss is acceptable for the maths content.
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

# Apply preprocessing to train and test data
# Columns that are cleaned with preprocess_text (which already maps NaN to '')
TEXT_COLUMNS = ['QuestionText'] + [f'Answer{option}Text' for option in ['A', 'B', 'C', 'D']] + ['ConstructName']

for df in [train_df, test_df]:
    # Fill missing values only where a string is required downstream.
    # A blanket df.fillna('') would also rewrite the numeric Misconception*Id
    # columns into mixed str/float columns, turning "no label" into ''.
    # SubjectName is concatenated into the model input later, so NaN there
    # would make the whole combined text NaN.
    df['SubjectName'] = df['SubjectName'].fillna('')
    for column in TEXT_COLUMNS:
        df[column] = df[column].apply(preprocess_text)


In [None]:
def get_distractors(df, is_train=True):
    """
    Build one record per incorrect answer option of every question.

    Parameters:
    df (pd.DataFrame): Question data with QuestionId, CorrectAnswer,
        Answer{A-D}Text, QuestionText, ConstructName and SubjectName columns.
    is_train (bool): When True, each record also carries 'MisconceptionId',
        read from the row's Misconception{option}Id entry (None if absent).

    Returns:
    pd.DataFrame: One row per (question, wrong option) pair.
    """
    shared_fields = ('QuestionText', 'ConstructName', 'SubjectName')
    records = []
    for _, question in df.iterrows():
        right_option = question['CorrectAnswer']
        wrong_options = [letter for letter in 'ABCD' if letter != right_option]
        for letter in wrong_options:
            record = {
                'QuestionId': question['QuestionId'],
                'AnswerOption': letter,
                'AnswerText': question[f'Answer{letter}Text'],
            }
            # Question-level fields are copied unchanged onto every distractor
            record.update({field: question[field] for field in shared_fields})
            if is_train:
                record['MisconceptionId'] = question.get(f'Misconception{letter}Id', None)
            records.append(record)
    return pd.DataFrame(records)

# Expand both splits into one row per incorrect answer option;
# only the training split carries the MisconceptionId label column
train_distractors = get_distractors(df=train_df, is_train=True)
test_distractors = get_distractors(df=test_df, is_train=False)


In [None]:
# Clean the misconception names with the same preprocessing as the question text
cleaned_misconception_names = misconception_df['MisconceptionName'].map(preprocess_text)
misconception_df['MisconceptionText'] = cleaned_misconception_names


In [None]:
# Load pre-trained sentence transformer model
# Consider using a domain-specific model if available
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')  # Set device to CPU

# Prepare training data for supervised fine-tuning
train_distractors['CombinedText'] = (
    train_distractors['QuestionText'] + ' Answer: ' +
    train_distractors['AnswerText'] + ' Construct: ' +
    train_distractors['ConstructName'] + ' Subject: ' +
    train_distractors['SubjectName']
)

# Build the id -> text lookup once instead of filtering the whole frame per example
misconception_text_by_id = dict(
    zip(misconception_df['MisconceptionId'], misconception_df['MisconceptionText'])
)

# Keep only distractors whose label is present and exists in the mapping.
# Coercing missing labels to 0 (as a plain fillna(0) would) turns every
# unlabelled distractor into a *positive* pair for misconception id 0.
misconception_ids = pd.to_numeric(train_distractors['MisconceptionId'], errors='coerce')
has_known_label = misconception_ids.isin(list(misconception_text_by_id))
logging.info(
    "Using %d of %d training distractors with a known misconception label",
    int(has_known_label.sum()), len(train_distractors)
)

train_texts = train_distractors.loc[has_known_label, 'CombinedText'].tolist()
train_labels = misconception_ids[has_known_label].astype(int).tolist()
assert train_texts, "No labelled training distractors available for fine-tuning."

# Create InputExamples for training (positive pairs: distractor text <-> its misconception)
train_examples = [
    InputExample(texts=[text, misconception_text_by_id[label]], label=1.0)
    for text, label in zip(train_texts, train_labels)
]

# Negative samples for contrastive learning
NEGATIVES_PER_POSITIVE = 3
rng = np.random.default_rng(42)  # seeded so a re-run builds the same training set
candidate_ids = misconception_df['MisconceptionId'].to_numpy()
candidate_texts = misconception_df['MisconceptionText'].tolist()
# Draw one spare candidate so the true misconception can be discarded
# without ending up short of negatives
draw_size = min(NEGATIVES_PER_POSITIVE + 1, len(candidate_texts))

negative_examples = []
for text, label in zip(train_texts, train_labels):
    drawn_positions = rng.choice(len(candidate_texts), size=draw_size, replace=False)
    negative_misconceptions = [
        candidate_texts[pos] for pos in drawn_positions if candidate_ids[pos] != label
    ][:NEGATIVES_PER_POSITIVE]
    for neg_mis in negative_misconceptions:
        negative_examples.append(InputExample(texts=[text, neg_mis], label=0.0))

# Combine positive and negative examples
train_examples.extend(negative_examples)

# DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Loss function
train_loss = losses.CosineSimilarityLoss(model)


In [None]:
# Fine-tune the model on training data
num_epochs = 1  # Adjust as needed

# One tenth of the total number of batches over all epochs is passed as warmup_steps
total_training_steps = len(train_dataloader) * num_epochs
fit_objectives = [(train_dataloader, train_loss)]

model.fit(
    train_objectives=fit_objectives,
    epochs=num_epochs,
    warmup_steps=int(total_training_steps * 0.1),
)


In [None]:
# Encode every cleaned misconception text with the fine-tuned model;
# the result is requested as a tensor for the similarity search
misconception_texts = list(misconception_df['MisconceptionText'])
misconception_embeddings = model.encode(
    misconception_texts,
    convert_to_tensor=True,
)


In [None]:
# Prepare distractor texts for test data: question text followed by the
# tagged answer, construct and subject, in the same layout as the training text
combined_text = test_distractors['QuestionText']
for tag, column in (
    (' Answer: ', 'AnswerText'),
    (' Construct: ', 'ConstructName'),
    (' Subject: ', 'SubjectName'),
):
    combined_text = combined_text + tag + test_distractors[column]
test_distractors['CombinedText'] = combined_text
test_texts = list(test_distractors['CombinedText'])

# Generate embeddings for distractors
distractor_embeddings = model.encode(
    test_texts,
    convert_to_tensor=True,
)


In [None]:
# Compute similarity scores in batches to manage memory
batch_size = 100
all_top_k_misconception_ids = []
top_k = 25

# Loop-invariant look-up: embedding row position -> MisconceptionId
misconception_ids_array = misconception_df['MisconceptionId'].values
# torch.topk raises when k exceeds the number of candidates, so cap it
effective_k = min(top_k, len(misconception_ids_array))

for i in range(0, len(distractor_embeddings), batch_size):
    batch_embeddings = distractor_embeddings[i:i+batch_size]
    # Rows are the distractors of this batch, columns are the misconceptions
    similarity_scores = util.cos_sim(batch_embeddings, misconception_embeddings)
    # Only the positions of the best matches are needed, not their scores
    top_k_indices = torch.topk(similarity_scores, k=effective_k, dim=1).indices
    batch_top_k_misconception_ids = misconception_ids_array[top_k_indices.cpu().numpy()]
    all_top_k_misconception_ids.extend(batch_top_k_misconception_ids)

    # Free memory
    del batch_embeddings, similarity_scores, top_k_indices

# Ensure that we have predictions for all distractors
assert len(all_top_k_misconception_ids) == len(test_distractors), "Mismatch in prediction lengths."


In [None]:
# Prepare QuestionId_Answer
test_distractors['QuestionId_Answer'] = test_distractors['QuestionId'].astype(str) + '_' + test_distractors['AnswerOption']

# Prepare MisconceptionId predictions (space-separated ids, best match first)
test_distractors['MisconceptionId'] = [
    ' '.join(map(str, ids)) for ids in all_top_k_misconception_ids
]

# Prepare submission DataFrame
submission_df = test_distractors[['QuestionId_Answer', 'MisconceptionId']]

# Ensure that submission_df has all required rows from sample_submission.csv
submission_df = sample_submission_df[['QuestionId_Answer']].merge(submission_df, on='QuestionId_Answer', how='left')

# Handle missing MisconceptionId predictions
if submission_df['MisconceptionId'].isnull().any():
    # Get most frequent MisconceptionIds from training data.
    # Clean the label column first: unlabelled distractors (NaN or '') must
    # not be counted, and ids must be integers so they are written as
    # "1672" rather than "1672.0".
    labelled_ids = (
        pd.to_numeric(train_distractors['MisconceptionId'], errors='coerce')
        .dropna()
        .astype(int)
    )
    most_common_misconceptions = labelled_ids.value_counts().index[:25].tolist()
    default_misconceptions = ' '.join(map(str, most_common_misconceptions))
    # Assign the result back: a chained fillna(..., inplace=True) on a column
    # selection does not update the frame under pandas Copy-on-Write.
    submission_df['MisconceptionId'] = submission_df['MisconceptionId'].fillna(default_misconceptions)

# Validate submission format
assert len(submission_df) == len(sample_submission_df), "Submission file has incorrect number of rows."
missing_ids = set(sample_submission_df['QuestionId_Answer']) - set(submission_df['QuestionId_Answer'])
assert len(missing_ids) == 0, f"Missing QuestionId_Answer entries: {missing_ids}"


In [None]:
# Save submission file: write the frame as CSV without the index column
submission_df.to_csv(path_or_buf='submission.csv', index=False)


Optional: only run the next cell if you need to download the model for use on Kaggle.