## NLP Final Project

In [7]:
from huggingface_hub import login
import wandb
from dotenv import load_dotenv
import os

load_dotenv()

# Access the environment variables from the .env file
hf_token = os.environ.get('HF_TOKEN')
wandb_token = os.environ.get('WANDB_TOKEN')

wandb.login(key=wandb_token)
login(token=hf_token)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnickrwu[0m ([33mnick-wu[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/nrw9167/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/nrw9167/.cache/huggingface/token
Login successful


## 1: Load Dataset & Pre-Trained Model
We used [**MathQA**](https://huggingface.co/datasets/math_qa) to finetune our model

In [8]:
from datasets import load_dataset

# Testing with Smaller subset of the data
# mathqa = load_dataset("math_qa", split="train[:5000]")
# mathqa = mathqa.train_test_split(test_size=0.2)

# Initialize dataset and available models
mathqa = load_dataset("math_qa")
model_name = "LIAMF-USP/roberta-large-finetuned-race"

model_names = ["LIAMF-USP/roberta-large-finetuned-race", "microsoft/deberta-v3-large", "google/bigbird-roberta-large", "xlnet/xlnet-base-cased", "FacebookAI/xlm-roberta-base", "distilbert/distilbert-base-uncased"]

In [9]:
mathqa['train'][0]

{'Problem': "the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?",
 'Rationale': '"explanation : t = 3 years r = 10 % td = ( bg × 100 ) / tr = ( 36 × 100 ) / ( 3 × 10 ) = 12 × 10 = rs . 120 td = ( pw × tr ) / 100 ⇒ 120 = ( pw × 3 × 10 ) / 100 ⇒ 1200 = pw × 3 pw = 1200 / 3 = rs . 400 answer : option a"',
 'options': 'a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these',
 'correct': 'a',
 'annotated_formula': 'divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))',
 'linear_formula': 'multiply(n2,const_100)|multiply(n0,n1)|divide(#0,#1)|multiply(#2,const_100)|divide(#3,#1)|',
 'category': 'gain'}

## 2: Cleaning and Pre-Processing

In [4]:
print(mathqa)

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 29837
    })
    test: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 2985
    })
    validation: Dataset({
        features: ['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'],
        num_rows: 4475
    })
})


In [5]:
def split_options(example):
    example["options"] = example['options'].split(", ")
    return example

def filter_by_length(example):
    return len(example['options']) == 5

mathqa = mathqa.map(split_options)
mathqa = mathqa.filter(filter_by_length)

In [6]:
import re

def remove_answer_from_rationale(example):
    # More complex patterns to catch various ways answers are indicated
    patterns = [
        r'\banswer\s*[:.]\s*[a-e]\b',           # "answer: a" or "answer. a"
        r'\banswer\s*is\s*[a-e]\b',             # "answer is a"
        r'\banswer\s*[a-e]\b',                  # "answer a"
        r'\bcorrect\s*option\s*[:.]\s*[a-e]\b', # "correct option: a"
        r'\bans\s*[:.]\s*[a-e]\b',              # "ans: a"
        r'\bimo\s*[a-e]\b',                     # "imo a"
        r'\b[a-e]\)\b',                         # "a)"
        r'\b[a-e]\.\b',                         # "a."
        r'\b[a-e]\b\s*is\s*correct\b',          # "a is correct"
        r'\b[a-e]\b\s*is\s*the\s*answer\b',     # "a is the answer"
        r'\b[a-e]\b\s*-\s*',                    # "a -"
        r'\boption\s*[a-e]\b',                  # "option a"
        r'\bnone of these\b',                   # "none of these"
        r'\b[a-e]\b\s*is\s*right\b',            # "a is right"
        r'([a-eA-E])(?!.*[a-eA-E])',
    ]

    # Replace identified patterns with empty string
    for pattern in patterns:
        example["Rationale"] = re.sub(pattern, '', example["Rationale"], flags=re.IGNORECASE)

    # Clean up multiple spaces and newlines
    example["Rationale"] = re.sub(r'(.*=).*', r'\1', example["Rationale"])
    example["Rationale"] = re.sub(r'\s{2,}', ' ', example["Rationale"])
    example["Rationale"] = re.sub(r'\n+', '\n', example["Rationale"])
    
    example["Rationale"] = example["Rationale"].strip()

    return example

mathqa = mathqa.map(remove_answer_from_rationale)

In [7]:
mathqa['train'][0]

{'Problem': "the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?",
 'Rationale': '"explanation : t = 3 years r = 10 % td = ( bg × 100 ) / tr = ( 36 × 100 ) / ( 3 × 10 ) = 12 × 10 = rs . 120 td = ( pw × tr ) / 100 ⇒ 120 = ( pw × 3 × 10 ) / 100 ⇒ 1200 = pw × 3 pw = 1200 / 3 =',
 'options': ['a ) rs . 400 ',
  'b ) rs . 300 ',
  'c ) rs . 500 ',
  'd ) rs . 350 ',
  'e ) none of these'],
 'correct': 'a',
 'annotated_formula': 'divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))',
 'linear_formula': 'multiply(n2,const_100)|multiply(n0,n1)|divide(#0,#1)|multiply(#2,const_100)|divide(#3,#1)|',
 'category': 'gain'}

In [8]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
import torch
from accelerate import Accelerator

# Initialize Accelerator
accelerator = Accelerator()

In [9]:
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': precision_recall_fscore_support(p.label_ids, preds, average='macro')[2],
        'precision': precision_recall_fscore_support(p.label_ids, preds, average='macro')[0],
        'recall': precision_recall_fscore_support(p.label_ids, preds, average='macro')[1]
}

In [10]:
def preprocess_function(examples, tokenizer, rationale=False, formula=False):
    MAX_SEQ_LENGTH = tokenizer.model_max_length if tokenizer.model_max_length < 512 else 256
    
    labels_map = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    questions = examples["Problem"]
    contexts = examples["Rationale"]
    formulas = examples['annotated_formula']
    options_list = examples["options"]
    categories = examples["category"]
    labels = [labels_map[ans] for ans in examples["correct"]]

    batch_input_ids = []
    batch_attention_masks = []
    batch_labels = []
    batch_categories = []
    
    # Iterate over each example in the batch
    for question, category, context, options, formula label in zip(questions, categories, contexts, options_list, formulas, labels):
        choices_inputs = []

        for option in options:
            # Combined Question with each Option
            input_string = f'[CATEGORY] {category} [PROBLEM] {question} [CONTEXT] {option}'

            if rationale:
                re.
                input_string = f'{input_string}'

            if formula:
                input_string = f'{input_string} [FORMULA] {formula}'
            
            combined_text = f'[CATEGORY] {category} [PROBLEM] {question} [RATIONALE] {context} {option}'

            # Tokenize the context and the question-option pair
            inputs = tokenizer(
                combined_text,
                add_special_tokens=True,
                max_length=MAX_SEQ_LENGTH,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=False
            )
            
            choices_inputs.append(inputs)

        # Extract input ids and attention masks for all options
        input_ids = [x['input_ids'] for x in choices_inputs]
        attention_masks = [x['attention_mask'] for x in choices_inputs]
        
        batch_input_ids.append(input_ids)
        batch_attention_masks.append(attention_masks)
        batch_labels.append(label)

    # Return processed batch data as a dictionary
    return {
        "input_ids": batch_input_ids,
        "attention_mask": batch_attention_masks,
        "labels": torch.tensor(batch_labels, dtype=torch.long),
    }

## 3: Evaluating Base Models

In [11]:
# Initialize base models and tokenizers
models = { name: AutoModelForMultipleChoice.from_pretrained(name) for name in model_names }
tokenizers = { name: AutoTokenizer.from_pretrained(name) for name in model_names }

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BigBirdForMultipleChoice were not initialized from the model checkpoint at google/bigbird-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForMultipleChoice were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

In [12]:
tokenized_datasets = {name: mathqa['test'].map(preprocess_function, fn_kwargs={'tokenizer': tkn}, batched=True) for name, tkn in tokenizers.items()}

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

### 3.1: First Iteration

In [11]:
# 1st Iteration
results = {}
for name, model in models.items():
    trainer = Trainer(
        model=model,
        eval_dataset=tokenized_datasets[name],
        compute_metrics=compute_metrics
    )
    results[name] = trainer.evaluate()

print(results)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnickrwu[0m ([33mnick-wu[0m). Use [1m`wandb login --relogin`[0m to force relogin


Attention type 'block_sparse' is not possible if sequence_length: 256 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


{'LIAMF-USP/roberta-large-finetuned-race': {'eval_loss': 1.5375503301620483, 'eval_accuracy': 0.49210084033613444, 'eval_f1': 0.48690145392783163, 'eval_precision': 0.4895015998986854, 'eval_recall': 0.4858255651820508, 'eval_runtime': 200.4258, 'eval_samples_per_second': 14.843, 'eval_steps_per_second': 1.856}, 'microsoft/deberta-v3-large': {'eval_loss': 1.6093789339065552, 'eval_accuracy': 0.24, 'eval_f1': 0.23916774512522507, 'eval_precision': 0.254085226353791, 'eval_recall': 0.24635359483100033, 'eval_runtime': 258.2878, 'eval_samples_per_second': 11.518, 'eval_steps_per_second': 1.44}, 'google/bigbird-roberta-large': {'eval_loss': 1.606080412864685, 'eval_accuracy': 0.293109243697479, 'eval_f1': 0.2926189170937211, 'eval_precision': 0.29562990901530106, 'eval_recall': 0.2948272429784039, 'eval_runtime': 231.6373, 'eval_samples_per_second': 12.843, 'eval_steps_per_second': 1.606}, 'xlnet/xlnet-base-cased': {'eval_loss': 1.6289170980453491, 'eval_accuracy': 0.16, 'eval_f1': 0.15297

In [12]:
# 1st Iteration
for key in results.keys():
    print(f"{key}: {results[key]}\n")

LIAMF-USP/roberta-large-finetuned-race: {'eval_loss': 1.5375503301620483, 'eval_accuracy': 0.49210084033613444, 'eval_f1': 0.48690145392783163, 'eval_precision': 0.4895015998986854, 'eval_recall': 0.4858255651820508, 'eval_runtime': 200.4258, 'eval_samples_per_second': 14.843, 'eval_steps_per_second': 1.856}

microsoft/deberta-v3-large: {'eval_loss': 1.6093789339065552, 'eval_accuracy': 0.24, 'eval_f1': 0.23916774512522507, 'eval_precision': 0.254085226353791, 'eval_recall': 0.24635359483100033, 'eval_runtime': 258.2878, 'eval_samples_per_second': 11.518, 'eval_steps_per_second': 1.44}

google/bigbird-roberta-large: {'eval_loss': 1.606080412864685, 'eval_accuracy': 0.293109243697479, 'eval_f1': 0.2926189170937211, 'eval_precision': 0.29562990901530106, 'eval_recall': 0.2948272429784039, 'eval_runtime': 231.6373, 'eval_samples_per_second': 12.843, 'eval_steps_per_second': 1.606}

xlnet/xlnet-base-cased: {'eval_loss': 1.6289170980453491, 'eval_accuracy': 0.16, 'eval_f1': 0.15297957666690

### 3.2: Second Iteration

In [13]:
# 2nd Iteration
results = {}
for name, model in models.items():
    trainer = Trainer(
        model=model,
        eval_dataset=tokenized_datasets[name],
        compute_metrics=compute_metrics
    )
    results[name] = trainer.evaluate()

print(results)

Attention type 'block_sparse' is not possible if sequence_length: 256 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


{'LIAMF-USP/roberta-large-finetuned-race': {'eval_loss': 1.7576335668563843, 'eval_accuracy': 0.22453781512605042, 'eval_f1': 0.21967586975974313, 'eval_precision': 0.22253264398415445, 'eval_recall': 0.2213882836360634, 'eval_runtime': 212.6478, 'eval_samples_per_second': 13.99, 'eval_steps_per_second': 1.749}, 'microsoft/deberta-v3-large': {'eval_loss': 1.6093631982803345, 'eval_accuracy': 0.24873949579831933, 'eval_f1': 0.24674655202020634, 'eval_precision': 0.25300166132945645, 'eval_recall': 0.2481927322246705, 'eval_runtime': 271.6509, 'eval_samples_per_second': 10.952, 'eval_steps_per_second': 1.369}, 'google/bigbird-roberta-large': {'eval_loss': 1.6111717224121094, 'eval_accuracy': 0.20941176470588235, 'eval_f1': 0.20773049763904478, 'eval_precision': 0.21088324137587286, 'eval_recall': 0.20831432813874046, 'eval_runtime': 238.9903, 'eval_samples_per_second': 12.448, 'eval_steps_per_second': 1.557}, 'xlnet/xlnet-base-cased': {'eval_loss': 1.6096572875976562, 'eval_accuracy': 0.

In [14]:
# 2nd Iteration
for key in results.keys():
    print(f"{key}: {results[key]}\n")

LIAMF-USP/roberta-large-finetuned-race: {'eval_loss': 1.7576335668563843, 'eval_accuracy': 0.22453781512605042, 'eval_f1': 0.21967586975974313, 'eval_precision': 0.22253264398415445, 'eval_recall': 0.2213882836360634, 'eval_runtime': 212.6478, 'eval_samples_per_second': 13.99, 'eval_steps_per_second': 1.749}

microsoft/deberta-v3-large: {'eval_loss': 1.6093631982803345, 'eval_accuracy': 0.24873949579831933, 'eval_f1': 0.24674655202020634, 'eval_precision': 0.25300166132945645, 'eval_recall': 0.2481927322246705, 'eval_runtime': 271.6509, 'eval_samples_per_second': 10.952, 'eval_steps_per_second': 1.369}

google/bigbird-roberta-large: {'eval_loss': 1.6111717224121094, 'eval_accuracy': 0.20941176470588235, 'eval_f1': 0.20773049763904478, 'eval_precision': 0.21088324137587286, 'eval_recall': 0.20831432813874046, 'eval_runtime': 238.9903, 'eval_samples_per_second': 12.448, 'eval_steps_per_second': 1.557}

xlnet/xlnet-base-cased: {'eval_loss': 1.6096572875976562, 'eval_accuracy': 0.210084033

## 4: Fine-Tuned Model Preparation

In [11]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # Determine the label key in the features
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)

        # Find the maximum number of choices across all samples (to handle variable numbers safely)
        max_num_choices = max(len(feature["input_ids"]) for feature in features)

        # Flatten the features for padding, ensuring all have the same number of choices
        flattened_features = []
        for feature in features:
            feature_choices = []
            for i in range(max_num_choices):
                try:
                    # Extract each choice as a separate feature
                    choice_features = {k: v[i] for k, v in feature.items() if k != label_name and isinstance(v, list)}
                    feature_choices.append(choice_features)
                except IndexError:
                    # If some choices are missing, pad manually
                    # Use the structure of the first choice to create empty padding
                    empty_choice = {k: [] * len(v[0]) if isinstance(v[0], list) else v for k, v in feature.items() if k != label_name and isinstance(v, list)}
                    feature_choices.append(empty_choice)
            flattened_features.extend(feature_choices)

        # Pad the flattened features
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Reshape the padded features back into their original shape [batch_size, num_choices, sequence_length]
        batch = {k: v.view(batch_size, max_num_choices, -1) for k, v in batch.items() if v.dim() > 1}

        # Add back the labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)

        return batch


In [12]:
# Initialize fine-tuned model and tokenizer
model_name= "microsoft/deberta-v3-large"
finetuned_model = AutoModelForMultipleChoice.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
tokenized_mathqa = mathqa.map(preprocess_function, fn_kwargs={'tokenizer': tokenizer}, batched=True, remove_columns=mathqa["train"].column_names)

In [14]:
accepted_keys = ["input_ids", "attention_mask", "labels"]
features = [{k: v for k, v in tokenized_mathqa["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

idx = 0
[tokenizer.decode(batch["input_ids"][idx][i].tolist()) for i in range(5)]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['[CLS] [CATEGORY] gain [PROBLEM] the banker\'s gain of a certain sum due 3 years hence at 10 % per annum is rs. 36. what is the present worth? [RATIONALE] "explanation : t = 3 years r = 10 % td = ( bg × 100 ) / tr = ( 36 × 100 ) / ( 3 × 10 ) = 12 × 10 = rs. 120 td = ( pw × tr ) / 100 ⇒ 120 = ( pw × 3 × 10 ) / 100 ⇒ 1200 = pw × 3 pw = 1200 / 3 = a ) rs. 400[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

## 5: Fine-tuning Model

In [15]:
import gc

# Delete tensors
gc.collect()  # Garbage collect to free memory

torch.cuda.empty_cache()

In [25]:
# Iteration 2: RoBERTA


# batch_size = 1

# Define training arguments
# training_args = TrainingArguments(
#     output_dir=f"{model_name}-finetuned-mathqa",
#     evaluation_strategy = "epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     push_to_hub=True,
# )

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-mathqa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # Adjust batch size depending on the available GPU memory
    per_device_eval_batch_size=16,  # Evaluation batch size can be larger if evaluation is less frequent
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_mathqa["train"],
    eval_dataset=tokenized_mathqa["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3497,1.288583,0.465882,0.464044,0.476122,0.460934
2,1.2074,1.168439,0.518655,0.518212,0.52534,0.515323
3,1.0072,1.128198,0.547563,0.547022,0.552756,0.544449


TrainOutput(global_step=11136, training_loss=1.252042861848042, metrics={'train_runtime': 6197.7167, 'train_samples_per_second': 14.374, 'train_steps_per_second': 1.797, 'total_flos': 2.075520624780672e+17, 'train_loss': 1.252042861848042, 'epoch': 3.0})

In [26]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nickrwu/roberta-large-finetuned-race-finetuned-mathqa/commit/1b254161977e4aa443c91774a59af0d484e650e4', commit_message='End of training', commit_description='', oid='1b254161977e4aa443c91774a59af0d484e650e4', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
finetuned_eval_result = trainer.evaluate(tokenized_mathqa["test"])

print(f"{model_name}-finetuned-mathqa: {finetuned_eval_result}")

LIAMF-USP/roberta-large-finetuned-race-finetuned-mathqa: {'eval_loss': 1.128198266029358, 'eval_accuracy': 0.547563025210084, 'eval_f1': 0.5470219441640726, 'eval_precision': 0.5527563562833936, 'eval_recall': 0.5444486622799508, 'eval_runtime': 62.4166, 'eval_samples_per_second': 47.664, 'eval_steps_per_second': 2.98, 'epoch': 3.0}


In [None]:
# deBERTa


training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-mathqa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6, # Adjust batch size depending on the available GPU memory
    per_device_eval_batch_size=16,  # Evaluation batch size can be larger if evaluation is less frequent
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_mathqa["train"],
    eval_dataset=tokenized_mathqa["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6067,1.609375,0.204795,0.166807,0.192065,0.196832
2,1.6099,1.609375,0.201658,0.168353,0.186478,0.192136


In [None]:
trainer.push_to_hub()

In [None]:
finetuned_eval_result = trainer.evaluate(tokenized_mathqa["test"])

print(f"{model_name}-finetuned-mathqa: {finetuned_eval_result}")

In [18]:
# Iteration 1 RoBERTA


# batch_size = 1

# Define training arguments
# training_args = TrainingArguments(
#     output_dir=f"{model_name}-finetuned-mathqa",
#     evaluation_strategy = "epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     push_to_hub=True,
# )

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-mathqa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10, # Adjust batch size depending on the available GPU memory
    per_device_eval_batch_size=16,  # Evaluation batch size can be larger if evaluation is less frequent
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_mathqa["train"],
    eval_dataset=tokenized_mathqa["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2057,0.174363,0.956303,0.956123,0.955748,0.956546
2,0.1484,0.164226,0.958319,0.958263,0.958815,0.957779
3,0.1309,0.166103,0.964034,0.963642,0.963798,0.96351


TrainOutput(global_step=8910, training_loss=0.18729583658821522, metrics={'train_runtime': 5964.2442, 'train_samples_per_second': 14.937, 'train_steps_per_second': 1.494, 'total_flos': 2.075520624780672e+17, 'train_loss': 0.18729583658821522, 'epoch': 3.0})

In [19]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nickrwu/roberta-large-finetuned-race-finetuned-mathqa/commit/75e0925b987f6537b6cda1f391b778ad2806aeaf', commit_message='End of training', commit_description='', oid='75e0925b987f6537b6cda1f391b778ad2806aeaf', pr_url=None, pr_revision=None, pr_num=None)

In [35]:
finetuned_eval_result = trainer.evaluate(tokenized_mathqa["test"])

print(f"{model_name}-finetuned-mathqa: {finetuned_eval_result}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 6: Evaluation
Compare Base Model vs. Fine-Tuned Model

In [15]:
checkpoint_model_name = 'nickrwu/roberta-large-finetuned-race-finetuned-mathqa'
# 'nickrwu/distilbert-base-uncased-finetuned-mathqa'

saved_model = AutoModelForMultipleChoice.from_pretrained(checkpoint_model_name)
saved_tokenizer = AutoTokenizer.from_pretrained(checkpoint_model_name)

saved_tokenized_mathqa = mathqa.map(preprocess_function, fn_kwargs={'tokenizer': saved_tokenizer}, batched=True, remove_columns=mathqa["train"].column_names)

In [16]:
# Initialize Trainer
saved_trainer = Trainer(
    model=saved_model,
    train_dataset=saved_tokenized_mathqa["train"],
    eval_dataset=saved_tokenized_mathqa["test"],
    tokenizer=saved_tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=saved_tokenizer),
    compute_metrics=compute_metrics
)

In [17]:
saved_finetuned_eval = saved_trainer.evaluate(saved_tokenized_mathqa["test"])

print(f"{model_name}-finetuned-mathqa: {saved_finetuned_eval}")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


microsoft/deberta-v3-large-finetuned-mathqa: {'eval_loss': 1.128219485282898, 'eval_accuracy': 0.547563025210084, 'eval_f1': 0.5470297154863124, 'eval_precision': 0.5527901947183171, 'eval_recall': 0.5444486622799508, 'eval_runtime': 209.1467, 'eval_samples_per_second': 14.224, 'eval_steps_per_second': 1.779}


In [23]:
from sklearn.metrics import confusion_matrix, classification_report

def generate_report(trainer, test_dataset):
    # Predictions
    raw_pred, _, _ = trainer.predict(test_dataset)
    predicted_labels = np.argmax(raw_pred, axis=1)
    
    # Evaluate predictions
    true_labels = test_dataset['labels']
    accuracy = accuracy_score(true_labels, predicted_labels)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels)
    
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", report)

    return true_labels, predicted_labels

true_labels, predicted_labels = generate_report(saved_trainer, saved_tokenized_mathqa["test"])


Accuracy: 0.547563025210084
Confusion Matrix:
 [[349  79  91  50  42]
 [ 83 332  84  78  28]
 [ 91 103 369  70  41]
 [ 83  79  70 355  37]
 [ 62  53  65  57 224]]
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.57      0.55       611
           1       0.51      0.55      0.53       605
           2       0.54      0.55      0.55       674
           3       0.58      0.57      0.58       624
           4       0.60      0.49      0.54       461

    accuracy                           0.55      2975
   macro avg       0.55      0.54      0.55      2975
weighted avg       0.55      0.55      0.55      2975



In [25]:
def print_incorrect(true_labels, predicted_labels, test_dataset, n):
    incorrect_indices = np.where(np.array(true_labels) != predicted_labels)[0]
    incorrect_samples = tokenized_mathqa["test"].select(incorrect_indices)
    counter = 0
    for i, example in enumerate(incorrect_samples):
        true_label = example['labels']
        predicted_label = predicted_labels[incorrect_indices[i]]
        answer_map = {0:"a", 1:"b", 2: "c", 3:"d", 4:"e"}
    
        print(f"\n[True ({answer_map[true_label]})] \n{tokenizer.decode(example["input_ids"][true_label], skip_special_tokens=True)}")
        print(f"\n[Predicted ({answer_map[predicted_label]})] \n{tokenizer.decode(example["input_ids"][predicted_label], skip_special_tokens=True)}")
        
        print("---------")
        counter += 1
        if counter > n:
            break

print_incorrect(true_labels, predicted_labels, saved_tokenized_mathqa["test"], 5)


# accepted_keys = ["input_ids", "attention_mask", "labels"]
# features = [{k: v for k, v in tokenized_mathqa["train"][i].items() if k in accepted_keys} for i in range(10)]
# batch = DataCollatorForMultipleChoice(tokenizer)(features)





[True (e)] 
[CATEGORY] gain [PROBLEM] the present population of a town is 3888. population increase rate is 20 % p. a. find the population of town before 2 years? [RATIONALE] "p = 3888 r = 20 % required population of town = p / ( 1 + r / 100 ) ^ t = 3888 / ( 1 + 20 / 100 ) ^ 2 = 3888 / ( 6 / 5 ) ^ 2 = e ) 2700

[Predicted (d)] 
[CATEGORY] gain [PROBLEM] the present population of a town is 3888. population increase rate is 20 % p. a. find the population of town before 2 years? [RATIONALE] "p = 3888 r = 20 % required population of town = p / ( 1 + r / 100 ) ^ t = 3888 / ( 1 + 20 / 100 ) ^ 2 = 3888 / ( 6 / 5 ) ^ 2 = d ) 3600
---------

[True (a)] 
[CATEGORY] geometry [PROBLEM] a full stationary oil tank that is a right circular cylinder has a radius of 100 feet and a height of 25 feet. oil is pumped from the stationary tank to an oil truck that has a tank that is a right circular cylinder until the truck's tank is completely filled. if the truck's tank has a radius of 6 feet and a height

In [29]:
def validation_preprocess(examples, base_tokenizer):
    MAX_SEQ_LENGTH = tokenizer.model_max_length if tokenizer.model_max_length < 512 else 256
    
    labels_map = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    questions = examples["Problem"]
    options_list = examples["options"]
    labels = [labels_map[ans] for ans in examples["correct"]]

    batch_input_ids = []
    batch_attention_masks = []
    batch_labels = []
    
    # Iterate over each example in the batch
    for question, options, label in zip(questions, options_list, labels):
        choices_inputs = []

        for option in options:
            if "_" in question:
                # Fill-in-the-blank question type
                question_option = question.replace("_", option)
            else:
                # Standard question appended with option
                question_option = question + " " + option

            # Tokenize the context and the question-option pair
            inputs = base_tokenizer(
                question_option,
                add_special_tokens=True,
                max_length=MAX_SEQ_LENGTH,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=False
            )
            
            choices_inputs.append(inputs)

        # Extract input ids and attention masks for all options
        input_ids = [x['input_ids'] for x in choices_inputs]
        attention_masks = [x['attention_mask'] for x in choices_inputs]
        
        batch_input_ids.append(input_ids)
        batch_attention_masks.append(attention_masks)
        batch_labels.append(label)

    # Return processed batch data as a dictionary
    return {
        "input_ids": batch_input_ids,
        "attention_mask": batch_attention_masks,
        "labels": torch.tensor(batch_labels, dtype=torch.long)
    }

In [30]:
validation = mathqa['validation'].map(validation_preprocess, fn_kwargs={'base_tokenizer': tokenizer}, batched=True)

Map:   0%|          | 0/4463 [00:00<?, ? examples/s]

In [31]:
validation_trainer = Trainer(
    model=saved_model,
    train_dataset=tokenized_mathqa["train"],
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

In [32]:
validation_finetuned_eval = validation_trainer.evaluate()

print(f"{model_name}-finetuned-mathqa: {validation_finetuned_eval}")

LIAMF-USP/roberta-large-finetuned-race-finetuned-mathqa: {'eval_loss': 2.4840052127838135, 'eval_accuracy': 0.23840466054223616, 'eval_f1': 0.2363083024937068, 'eval_precision': 0.23824828135053347, 'eval_recall': 0.23696357278014527, 'eval_runtime': 313.8069, 'eval_samples_per_second': 14.222, 'eval_steps_per_second': 1.778}


In [55]:
# Testing Accuracy with Annotated Formula
def test_preprocess(examples, base_tokenizer):
    # MAX_SEQ_LENGTH = tokenizer.model_max_length if tokenizer.model_max_length < 512 else 256
    MAX_SEQ_LENGTH = 128
    
    labels_map = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    questions = examples["Problem"]
    contexts = examples['annotated_formula']
    options_list = examples["options"]
    labels = [labels_map[ans] for ans in examples["correct"]]

    batch_input_ids = []
    batch_attention_masks = []
    batch_labels = []
    
    # Iterate over each example in the batch
    for question, options, context, label in zip(questions, options_list, contexts, labels):
        choices_inputs = []

        for option in options:
            if "_" in question:
                # Fill-in-the-blank question type
                question_option = question.replace("_", option)
            else:
                # Standard question appended with option
                question_option = question + " " + option

            # Tokenize the context and the question-option pair
            inputs = base_tokenizer(
                context,
                question_option,
                add_special_tokens=True,
                max_length=MAX_SEQ_LENGTH,
                padding="max_length",
                truncation=True,
                return_overflowing_tokens=False
            )
            
            choices_inputs.append(inputs)

        # Extract input ids and attention masks for all options
        input_ids = [x['input_ids'] for x in choices_inputs]
        attention_masks = [x['attention_mask'] for x in choices_inputs]
        
        batch_input_ids.append(input_ids)
        batch_attention_masks.append(attention_masks)
        batch_labels.append(label)

    # Return processed batch data as a dictionary
    return {
        "input_ids": batch_input_ids,
        "attention_mask": batch_attention_masks,
        "labels": torch.tensor(batch_labels, dtype=torch.long)
    }

In [56]:
test_tokenized_mathqa = mathqa.map(test_preprocess, fn_kwargs={'base_tokenizer': tokenizer}, batched=True, remove_columns=mathqa["train"].column_names)

Map:   0%|          | 0/29695 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/4463 [00:00<?, ? examples/s]

In [57]:
accepted_keys = ["input_ids", "attention_mask", "labels"]
features = [{k: v for k, v in test_tokenized_mathqa["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

[tokenizer.decode(batch["input_ids"][0][i].tolist()) for i in range(5)]

["<s>divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))</s></s>the banker's gain of a certain sum due 3 years hence at 10 % per annum is rs. 36. what is the present worth? a ) rs. 400 </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>",
 "<s>divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))</s></s>the banker's gain of a certain sum due 3 years hence at 10 % per annum is rs. 36. what is the present worth? b ) rs. 300 </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>",
 "<s>divid

In [58]:
import gc

# Delete tensors
gc.collect()  # Garbage collect to free memory

torch.cuda.empty_cache()

In [59]:
test_training_args = TrainingArguments(
    output_dir=f"test-roberta-finetuned-mathqa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10, # Adjust batch size depending on the available GPU memory
    per_device_eval_batch_size=16,  # Evaluation batch size can be larger if evaluation is less frequent
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True
)

# Initialize Trainer
test_trainer = Trainer(
    model=finetuned_model,
    args=test_training_args,
    train_dataset=test_tokenized_mathqa["train"],
    eval_dataset=test_tokenized_mathqa["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the Model
test_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6207,1.609375,0.206387,0.071415,0.169368,0.200964
2,1.6136,1.609375,0.206387,0.095095,0.193391,0.201954
3,1.6161,1.609375,0.200672,0.108894,0.178234,0.195389


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=8910, training_loss=1.618311787186782, metrics={'train_runtime': 3094.1493, 'train_samples_per_second': 28.791, 'train_steps_per_second': 2.88, 'total_flos': 1.037760312390336e+17, 'train_loss': 1.618311787186782, 'epoch': 3.0})

In [60]:
test_trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nickrwu/test-roberta-finetuned-mathqa/commit/0ff55c99ccca15407636c6e934b55d422be8452e', commit_message='End of training', commit_description='', oid='0ff55c99ccca15407636c6e934b55d422be8452e', pr_url=None, pr_revision=None, pr_num=None)

In [61]:
test_finetuned_eval_result = test_trainer.evaluate()

print(f"test-roberta-finetuned-mathqa: {test_finetuned_eval_result}")

test-roberta-finetuned-mathqa: {'eval_loss': 1.609375, 'eval_accuracy': 0.200672268907563, 'eval_f1': 0.10889350517271401, 'eval_precision': 0.17823394469928303, 'eval_recall': 0.1953893437463511, 'eval_runtime': 28.4967, 'eval_samples_per_second': 104.398, 'eval_steps_per_second': 6.527, 'epoch': 3.0}


In [12]:
# Initialize fine-tuned model and tokenizer
finetuned_model = AutoModelForMultipleChoice.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

Some weights of DistilBertForMultipleChoice were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
training_args = TrainingArguments(
    output_dir=f"distilbert-base-finetuned-mathqa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10, # Adjust batch size depending on the available GPU memory
    per_device_eval_batch_size=16,  # Evaluation batch size can be larger if evaluation is less frequent
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=tokenized_mathqa["train"],
    eval_dataset=tokenized_mathqa["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2186,0.184857,0.945546,0.945632,0.945963,0.945429
2,0.1889,0.168656,0.95395,0.953973,0.953946,0.954017
3,0.1528,0.164762,0.957647,0.957688,0.957837,0.957554


TrainOutput(global_step=8910, training_loss=0.22166800188563354, metrics={'train_runtime': 1236.9228, 'train_samples_per_second': 72.021, 'train_steps_per_second': 7.203, 'total_flos': 2.95016193942912e+16, 'train_loss': 0.22166800188563354, 'epoch': 3.0})

In [18]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/nickrwu/distilbert-base-finetuned-mathqa/commit/cd3e120f9c5ac915306996b06e2754b013c9ebbf', commit_message='End of training', commit_description='', oid='cd3e120f9c5ac915306996b06e2754b013c9ebbf', pr_url=None, pr_revision=None, pr_num=None)