In [1]:
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import Dataset
import numpy as np

In [2]:
# Optimizing by using GPU if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
accelerator = Accelerator()
print(f"Using device: {device}")

Using device: mps


In [3]:
df = pd.read_csv("./data/b6_train_data.csv")
# turn into a Python list for tokenization
df["choices"] = df['choices'].apply(eval)
dataset = Dataset.from_pandas(df)


test_df = pd.read_csv("./data/b6_test_data.csv")
test_df["choices"] = test_df['choices'].apply(eval)
fpttest_data = Dataset.from_pandas(test_df)

In [4]:
def show_one(example):
    print(f"{example['question']}")
    for c in example['choices']:
        print(f" - {c}")
    print(f" Correct answer: {example['answer']}")

In [5]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name).to(device)

# Wrap with `accelerate`
model = accelerator.prepare(model)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:


letter_to_number = {'A': 0, 'B': 1, 'C': 2, 'D': 3}


def get_number(ans):
    try:
        last_word = ans.split()[-1]  # Get the last word
        # Return mapped value or -1 if not found
        return letter_to_number.get(last_word, -1)
    except Exception as e:
        print(f"Error processing answer '{ans}': {e}")
        return -1  # Fallback value


def preprocess(examples):
    examples["choices"] = [
        choice if len(choice) > 0 else ["Null"]  # Ensure it's a list
        for choice in examples["choices"]
    ]

    # Number of choices per question
    choice_lens = [len(inner_list) for inner_list in examples['choices']]

    # Expand questions to match the number of choices
    questions = [q for q_list in [[question] * n for question,
                                  n in zip(examples['question'], choice_lens)] for q in q_list]
    choices = sum(examples["choices"], [])  # Flatten choices

    # Convert labels
    labels = np.array([get_number(label) for label in examples['answer']])

    # Tokenize questions and choices as independent pairs
    tokenized_examples = tokenizer(
        list(zip(questions, choices)), truncation=True, padding="max_length")

    # print(tokenized_examples)

    # Reshape data: Group every `n` choices together (for each question)
    reshaped_dict = {k: [] for k in tokenized_examples.keys()}
    start = 0
    for n in choice_lens:
        for k in tokenized_examples.keys():
            reshaped_dict[k].append(tokenized_examples[k][start: start + n])
        start += n
    # Ensure labels match question structure
    reshaped_dict['labels'] = labels

    return reshaped_dict

In [7]:
idx = 0
tokenized_data = dataset.map(
    preprocess, batched=True, batch_size=8, load_from_cache_file=False)
tokenized_data = tokenized_data.remove_columns(
    ["task_id", "question", "choices", "answer"])  # Keep only tokenized features

Map:   0%|          | 0/3963 [00:00<?, ? examples/s]

Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer 'None': 'NoneType' object has no attribute 'split'
Error processing answer '

In [8]:
decoded_text = tokenizer.decode(
    tokenized_data[0]["input_ids"][0], skip_special_tokens=True)
print(decoded_text)

question : what will be output of the following code? # include < stdio. h > int main ( ) { printf ( " % d \ t ", sizeof ( 6. 5 ) ) ; printf ( " % d \ t ", sizeof ( 90000 ) ) ; printf ( " % d ", sizeof ( ' a ' ) ) ; return 0 ; } 8 4 2


In [None]:
print(tokenized_data[:3])

In [11]:
from transformers import DataCollatorForMultipleChoice

data_collator = DataCollatorForMultipleChoice(tokenizer)

In [17]:
import os
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [18]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
# Split your dataset into train and evaluation sets

train_test_split = tokenized_data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

torch.mps.empty_cache()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mcq_model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=10,  # Set higher than needed, early stopping will handle it
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,  # For accuracy, higher is better
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")

Epoch,Training Loss,Validation Loss
