In [1]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
%env WANDB_PROJECT=LLM_Science_Exam

env: WANDB_PROJECT=LLM_Science_Exam


In [1]:

import pandas as pd
from datasets import Dataset,DatasetDict
from transformers import AutoTokenizer,AutoModelForMultipleChoice, TrainingArguments, Trainer, BitsAndBytesConfig
from accelerate import Accelerator
import peft
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import datetime
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Def Functions

options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True) # tokenizer call using 'text_pair' which basically just adds a separator between the two sentences
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example
# Following datacollator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# will dynamically pad our questions at batch-time so we don't have to make every question the length
# of our longest question.

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [3]:
# Load in Model and Tokenizer
current_device = Accelerator().process_index
model_path = 'bert-large-cased'
tokenizer = AutoTokenizer.from_pretrained(model_path)


# model = AutoModelForMultipleChoice.from_pretrained(model_path,
#                                                     device_map={"": current_device})#,
                                                    #torch_dtype=torch.float16)
# model=peft.prepare_model_for_int8_training(model_path)



In [31]:
# Read in Train & Test data.

train_df = pd.read_csv('6000_train_examples.csv')
train_df=train_df.dropna()
train_ds = Dataset.from_pandas(train_df)
train_ds = train_ds.remove_columns(['__index_level_0__'])
# tokenized_train_ds = train_ds.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer',"__index_level_0__"])

val_df = pd.read_csv('train.csv')
val_df = val_df.dropna().drop('id',axis=1)
val_ds = Dataset.from_pandas(val_df)
tokenized_val_ds = val_ds.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

datasets = DatasetDict({
    "train":train_ds,
    "validation":val_ds
})
encoded_datasets = datasets.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])


test_df = pd.read_csv('test.csv')
test_df['answer'] = 'A'
test_ds = Dataset.from_pandas(test_df.drop('id',axis=1))
tokenized_test_ds = test_ds.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])


Map: 100%|██████████| 200/200 [00:00<00:00, 1931.84 examples/s]
Map: 100%|██████████| 5997/5997 [00:02<00:00, 2153.76 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1996.76 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2014.43 examples/s]


In [33]:
test_ds

Dataset({
    features: ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 200
})

In [32]:
tokenized_test_ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 200
})

In [34]:
from torch.utils.data import DataLoader
test_dataloader = DataLoader(tokenized_test_ds, 10, shuffle=False, collate_fn=DataCollatorForMultipleChoice(tokenizer=tokenizer))

In [35]:
test_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fcee8ecacb0>

In [36]:
model = AutoModelForMultipleChoice.from_pretrained(f'base_models/bert-large-cased/bert-large-cased/',device_map={"": current_device})
model = peft.PeftModel.from_pretrained(model, 
                                       'peft_adapters/bert-large-cased/', 
                                       is_trainable=False)
model.eval()
preds = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    preds.append(outputs.logits.cpu().detach())

bert_large_preds = torch.cat(preds)

Some weights of the model checkpoint at base_models/bert-large-cased/bert-large-cased/ were not used when initializing BertForMultipleChoice: ['bert.encoder.layer.2.attention.self.value.lora_A.default.weight', 'bert.encoder.layer.17.attention.self.query.lora_B.default.weight', 'bert.encoder.layer.11.attention.self.query.lora_A.default.weight', 'bert.encoder.layer.12.attention.self.value.lora_B.default.weight', 'bert.encoder.layer.13.attention.self.value.lora_A.default.weight', 'bert.encoder.layer.20.attention.self.value.lora_A.default.weight', 'bert.encoder.layer.9.attention.self.value.lora_B.default.weight', 'bert.encoder.layer.19.attention.self.query.lora_B.default.weight', 'bert.encoder.layer.0.attention.self.value.lora_B.default.weight', 'bert.encoder.layer.18.attention.self.value.lora_A.default.weight', 'bert.encoder.layer.19.attention.self.value.lora_B.default.weight', 'classifier.modules_to_save.default.weight', 'bert.encoder.layer.8.attention.self.value.lora_B.default.weight', 

In [54]:
model = AutoModelForMultipleChoice.from_pretrained(f'base_models/full_debertav3/full_debertav3/',device_map={"": current_device})
model = peft.PeftModel.from_pretrained(model, 
                                       'peft_adapters/full_debertav3/', 
                                       is_trainable=False)
model.eval()
preds = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    preds.append(outputs.logits.cpu().detach())

deberta_large_preds = torch.cat(preds)

Some weights of the model checkpoint at base_models/full_debertav3/full_debertav3/ were not used when initializing DebertaV2ForMultipleChoice: ['deberta.encoder.layer.8.attention.self.value_proj.lora_B.default.weight', 'deberta.encoder.layer.4.attention.self.query_proj.lora_A.default.weight', 'deberta.encoder.layer.2.attention.self.value_proj.lora_B.default.weight', 'deberta.encoder.layer.10.attention.self.value_proj.lora_A.default.weight', 'deberta.encoder.layer.2.attention.self.query_proj.lora_A.default.weight', 'classifier.original_module.weight', 'deberta.encoder.layer.3.attention.self.query_proj.lora_A.default.weight', 'deberta.encoder.layer.5.attention.self.query_proj.lora_B.default.weight', 'deberta.encoder.layer.6.attention.self.value_proj.lora_B.default.weight', 'deberta.encoder.layer.1.attention.self.value_proj.lora_B.default.weight', 'deberta.encoder.layer.6.attention.self.query_proj.lora_A.default.weight', 'deberta.encoder.layer.6.attention.self.value_proj.lora_A.default.we

In [58]:
torch.cuda.empty_cache() 

In [57]:
del model

In [56]:
bert_large_preds.shape, deberta_large_preds.shape

(torch.Size([200, 5]), torch.Size([200, 5]))

In [60]:
from collections import defaultdict

voting_ensemble = defaultdict(list)

In [61]:
 for row in range(bert_large_preds.shape[0]):
        preds = bert_large_preds[row]
        voting_ensemble[row].append(preds.argsort(descending=True)[:3])
        
 for row in range(deberta_large_preds.shape[0]):
        preds = bert_large_preds[row]
        voting_ensemble[row].append(preds.argsort(descending=True)[:3])

In [70]:
voting_ensemble[0][:1]

[tensor([1, 3, 0])]

In [72]:
predictions = []
for row in range(bert_large_preds.shape[0]):
    votes = defaultdict(lambda: 0)
    
    # for preds in voting_ensemble[row][:3]: for when using 3 sets of predicions on different models
    #     votes[preds[0].item()] += 3
    #     votes[preds[1].item()] += 2
    #     votes[preds[2].item()] += 1
    
    bert_large_preds = voting_ensemble[row][0]
    votes[bert_large_preds[0].item()] += 3 * 3 # never unseat top prediction by `deberta_large_preds` even with 3,3,3 from my weights
    votes[bert_large_preds[1].item()] += 2 * 2 
    votes[bert_large_preds[2].item()] += 1 * 1 
    
    deberta_large_preds = voting_ensemble[row][1]
    votes[deberta_large_preds[0].item()] += 3 * 3.1 # never unseat top prediction by `deberta_large_preds` even with 3,3,3 from my weights
    votes[deberta_large_preds[1].item()] += 2 * 2.9 
    votes[deberta_large_preds[2].item()] += 1 * 2.9 
        
    predictions.append([t[0] for t in sorted(votes.items(), key=lambda x:x[1], reverse=True)][:3])

In [74]:
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions]
predictions_as_answer_letters[:3]

array([['B', 'D', 'A'],
       ['A', 'B', 'E'],
       ['C', 'A', 'E']], dtype='<U1')

In [75]:
predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]
predictions_as_string[:3]

['B D A', 'A B E', 'C A E']

In [76]:
submission = test_df[['id', 'prediction']]
submission.to_csv('submission.csv', index=False)

pd.read_csv('submission.csv').head()

Unnamed: 0,id,prediction
0,0,B D A
1,1,A B E
2,2,C A E
3,3,A C B
4,4,E C A
