In [1]:
#All required imports
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [5]:
train_file =  r"C:\Users\pktpa\OneDrive\Documents\Masters 2023\Lecture Notes\Natural Language Processing\Project\brainteaser_project\SP-train.npy"      #Path of Train File
read_file = np.load(train_file, allow_pickle = True)                  #Read input file

df = pd.DataFrame(read_file.tolist())                                 #Convert Numpy to DataFrame
df['distractor1'] = df['distractor1'].astype(str)                     #convert distractor1 to string type
df['distractor2'] = df['distractor2'].astype(str)                     #convert distractor2 to string type
df['distractor(unsure)'] = df['distractor(unsure)'].astype(str)       #convert distractor(unsure) to string type

# Get train, test and validation set
train1_df, test_df = train_test_split(df, test_size=0.2, random_state= 42)   #Split train to train val split
train_df, val_df = train_test_split(train1_df, test_size=0.2, random_state= 42)   #Split train to train val split

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

dataset_dict = DatasetDict({'train': train_dataset, 'validation': val_dataset})

In [6]:
#tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [7]:
def preprocess_function(examples):
    first_sentences = [[context] * 4 for context in examples["question"]]
    second_sentences = [context for context in examples["choice_list"]]
    
    first_sentences = sum(first_sentences,[])
    second_sentences = sum(second_sentences,[])
    
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation = True)
    #tokenized_examples['label'] = examples['label']
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

tokenized_data = dataset_dict.map(preprocess_function, batched = True, remove_columns=['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'choice_list', 'choice_order', '__index_level_0__'])
#tokenized_data = dataset_dict.map(preprocess_function, batched = True) #, remove_columns=['id', 'question', 'distractor1', 'distractor2', 'distractor(unsure)'])

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

In [8]:
# Tokenize test set
test_dataset = Dataset.from_pandas(test_df)
dataset_dict2 = DatasetDict({'test': test_dataset})
test_tokenized_data = dataset_dict2.map(preprocess_function, batched = True, remove_columns=['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'choice_list', 'choice_order', '__index_level_0__'])

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

In [9]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [10]:
#Bert
from transformers import AutoModelForMultipleChoice, TrainingArguments,Trainer
model=AutoModelForMultipleChoice.from_pretrained('bert-base-uncased')

#Roberta
#from transformers import RobertaForMultipleChoice, TrainingArguments,Trainer
#model = RobertaForMultipleChoice.from_pretrained("roberta-base")

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#pip install accelerate -U

In [55]:
#!pip install wandb

In [10]:
%env WANDB_NOTEBOOK_NAME "Bert BrainTeaserrr"


env: WANDB_NOTEBOOK_NAME="Bert BrainTeaserrr"


In [11]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mpktpaulie[0m ([33mbrainteaser[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
training_args=TrainingArguments(output_dir=r"C:\Users\pktpa\OneDrive\Desktop",
                               evaluation_strategy='epoch',
                               save_strategy='epoch',
                               load_best_model_at_end=True,
                               learning_rate=2e-5,
                               per_device_eval_batch_size=4,
                               per_device_train_batch_size=4,
                               num_train_epochs=3,
                               weight_decay=0.01,
                               report_to='none',
                               #save_model_every_epoch=False,
                               #train_batch_size=128,
                               #eval_batch_size=64,
                               #best_model_dir=r" ",
                               
                               )

In [27]:
wandb.init(project="brainteaser", name="roberta_test", config=training_args)

In [13]:
trainer=Trainer(model=model,args=training_args,
               train_dataset=tokenized_data['train'],
               eval_dataset=tokenized_data['validation'],
               tokenizer=tokenizer,
               data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer))

In [14]:
# Log training loss and epoch
wandb.log({"train_loss": trainer.state.global_step, "epoch": trainer.state.epoch})

Error: You must call wandb.init() before wandb.log()

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.379004
2,No log,0.353915
3,No log,0.350459


TrainOutput(global_step=243, training_loss=0.06820841188783999, metrics={'train_runtime': 1418.1519, 'train_samples_per_second': 0.685, 'train_steps_per_second': 0.171, 'total_flos': 171061761178080.0, 'train_loss': 0.06820841188783999, 'epoch': 3.0})

In [15]:
wandb.finish()

In [32]:
trainer.evaluate()

{'eval_loss': 0.35045933723449707,
 'eval_runtime': 25.8485,
 'eval_samples_per_second': 3.134,
 'eval_steps_per_second': 0.812,
 'epoch': 3.0}

In [38]:
predictions = trainer.predict(test_tokenized_data['test'])

In [41]:
# Extract logits from the predictions
logits = predictions.predictions
logits = torch.tensor(logits)
# Apply softmax to convert logits to probabilities
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted labels based on the maximum probability
predicted_labels = torch.argmax(probs, dim=-1)

In [43]:
predicted_labels = predicted_labels.numpy()


In [16]:
actual_labels = test_df['label']
actual_labels

In [49]:
# Calculate accuracy
correct_predictions = (predicted_labels == actual_labels).sum().item()
total_examples = len(actual_labels)
accuracy = correct_predictions / total_examples

print("Correct Predictions:", correct_predictions)
print("Total Examples:", total_examples)
print("Accuracy:", accuracy)

Correct Predictions: 95
Total Examples: 102
Accuracy: 0.9313725490196079


In [50]:
test_df['Pred Labels'] = predicted_labels

In [51]:
def select_answers(row):
    bert_answers = [row['choice_list'][index] for index in predicted_labels]
    return bert_answers

test_df['Predicted'] = test_df.apply(select_answers, axis=1)

In [52]:
output_dir = r"C:\Users\pktpa\OneDrive\Desktop"
val_df.to_csv(r"C:\Users\pktpa\OneDrive\Desktop\bert\bert_predictions.csv", mode="w", header=True, index=False)

In [53]:
answers = test_df['choice_list']