In [1]:
import warnings
warnings.filterwarnings("ignore")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd

from datasets import Dataset

from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

from transformers import pipeline
from catboost import Pool, CatBoostRanker

In [2]:
train_df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv", index_col="id")
train_df.head()

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


## Slight augmentation (translate and back)

In [3]:
# Seq2Seq

model_checkpoint1 = "Helsinki-NLP/opus-mt-en-fr"
model_checkpoint2 = "Helsinki-NLP/opus-mt-fr-en"

translator_en_fr = pipeline("translation", model=model_checkpoint1, device=0)
translator_fr_en = pipeline("translation", model=model_checkpoint2, device=0)

sample = translator_en_fr("Fine example here")[0]["translation_text"]
translator_fr_en(sample)[0]["translation_text"]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

"That's a good example here."

In [4]:
aug_train_df = train_df.copy(deep=True)

for col in aug_train_df.columns[:-1]:
    aug_train_df[col] = aug_train_df[col].apply(lambda x: translator_fr_en(translator_en_fr(x)[0]["translation_text"])[0]["translation_text"])

Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


In [5]:
aug_train_df.sample(5)

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
151,What was Isaac Newton's explanation for the st...,Isaac Newton rejected the theory of waves of l...,Isaac Newton rejected light wave theory and pr...,Isaac Newton accepted the theory of light wave...,Isaac Newton rejected light wave theory and pr...,Isaac Newton accepted light wave theory and pr...,D
2,Which of the following accurately describes th...,The symbol of the triskeles was rebuilt as a f...,The symbol of the triskeles is a representatio...,The symbol of the triskeles is a representatio...,The symbol of the triskeles represents three s...,The symbol of the triskeles is a representatio...,A
119,What is the concept of simultaneity in Einstei...,"Simultaneity is relative, which means that two...","Simultaneity is relative, which means that two...","Simultaneity is absolute, which means that two...",Simultaneity is a concept that applies only to...,Simultaneity is a concept that applies only to...,A
105,What is spatial dispersion?,Spatial dispersion is a phenomenon in the phys...,Spatial dispersion is a phenomenon in the phys...,Spatial dispersion is a phenomenon in the phys...,Spatial dispersion is a phenomenon in the phys...,Spatial dispersion is a phenomenon in the phys...,D
87,What hand should be used to apply the right ru...,The Dominant Hand,Right hand,Both hands,Left hand,In both hands,B


In [30]:
import gc

translator_en_fr.device = -1
translator_fr_en.device = -1
del translator_en_fr, translator_fr_en
gc.collect()
torch.cuda.empty_cache()

## Randomly replace words with synonyms

In [3]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/co

In [4]:
def syn_replacer(word):
    syn = list()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            syn.append(lemma.name())
    return np.random.choice(syn) if len(syn) > 0 else word

def replace_all(row):
    words = np.random.choice(row.split(), size=int(0.1*len(row.split())), replace=False)
    for word in words:
        new_word = syn_replacer(word)
        row = row.replace(word, new_word)
    return row

In [5]:
replaced_train_df = train_df.copy(deep=True)

for col in replaced_train_df.columns[:-1]:
    replaced_train_df[col] = replaced_train_df[col].apply(replace_all)

In [6]:
replaced_train_df.sample(5)

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
198,"What is the relationship between mass, force, ...",pot is a property thastatine determines the we...,Mass is an inertial property that determines a...,Mass is Associate_in_Nursing inertial property...,Mass is an inertial property that determines a...,Mgroup_Ass is group_A property thgroup_At dete...,D
79,What is a Hilbert space in quantum mechanics?,A complex vector space where the state of a cl...,A physical space where the State_Department of...,A physical space where the state of a quantum ...,A mathematical space where the state of a clas...,A complex vector spAce where the stAte of A qu...,E
37,What is the reason behind the adoption of a lo...,The logarithmic scale was adopted to ensure th...,The logarithmic scale was adopted to measure t...,The logarithmic scale was adopted to measure t...,The logarithmic scale was adopted to ensure th...,The logarithmic scale was adopted to measure t...,A
27,What is X-ray pulsar-based navigation (XNAV)?,X-ray pulsar-based navigation (XNAV) is a navi...,X-ray pulsar-based navigation (XNAV) is a navi...,X-ray pulsar-bAmerican_Samoaed sailing (XNAV) ...,roentgenogram pulsar-based navigation (XNAV) i...,X-ray pulsar-based navigation (XNAV) is a navi...,D
38,What is the spin quantum number?,The spin quantum number is a measure of the sp...,The spin quantum number is a measure of the si...,The spin quvitamin_Antum number is vitamin_A m...,The spin quantum number is a measure of the sp...,The spin quantum number is a dimensionless qua...,E


## Merge dataframes

In [7]:
try:
    train_df = pd.concat([train_df, aug_train_df, replaced_train_df], axis=0)
except:
    train_df = pd.concat([train_df, replaced_train_df], axis=0)

## Fine-tune MultiChoice Deberta Model

In [8]:
train_ds = Dataset.from_pandas(train_df)

In [9]:
model_dir = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir)
model

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassif

DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_aff

In [10]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)  # [SEP] token, [CLS] representation for every sentence
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

  0%|          | 0/400 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [12]:
model_dir = 'finetuned_deberta'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=4,
    gradient_accumulation_steps=4,
    num_train_epochs=7,
    weight_decay=0.01,
    report_to='none'
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [14]:
try:
    trainer.train()
except: 
    pass
    # import gc
    # model = model.to("cpu")
    # del model, trainer
    # gc.collect()
    # torch.cuda.empty_cache()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.343038
2,No log,0.522991
3,No log,0.268587
4,No log,0.12985
5,0.707700,0.073508


In [15]:
predictions = trainer.predict(tokenized_train_ds)

In [18]:
def predictions_to_map_output(predictions, topk=3):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:topk]  # Get the first topk answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

In [19]:
predictions_to_map_output(predictions.predictions)

array(['D B E', 'A B D', 'A C B', 'C E A', 'D B A', 'B E C', 'A C B',
       'D B C', 'C B E', 'A B E', 'E B A', 'A B C', 'C E A', 'D E C',
       'B C A', 'B A D', 'E D B', 'E C D', 'A D B', 'E D A', 'D C B',
       'D C E', 'C E A', 'C A D', 'E D A', 'E D C', 'A E C', 'D C B',
       'E B C', 'C D E', 'B D E', 'E A C', 'E A B', 'D A B', 'C E A',
       'B E A', 'E A B', 'A D C', 'E A B', 'E A B', 'E A D', 'C D E',
       'B C A', 'C D A', 'A B C', 'A B D', 'B C A', 'C D E', 'D A E',
       'B A D', 'B E A', 'E C A', 'C A B', 'A D B', 'B A D', 'B E C',
       'C D E', 'C B D', 'D C E', 'A E C', 'B D E', 'B E D', 'C A B',
       'C D A', 'A B D', 'E C D', 'C E D', 'E B D', 'C D A', 'D B E',
       'C A E', 'A E C', 'D A E', 'B A D', 'D B E', 'B C D', 'D B A',
       'B A C', 'C B E', 'E A C', 'C A E', 'A C D', 'A E D', 'A B E',
       'C E A', 'D C E', 'C D E', 'B A C', 'E A C', 'D A B', 'B D A',
       'B A E', 'B C E', 'E B C', 'E B A', 'C B D', 'C D A', 'D E B',
       'D B A', 'D E

In [20]:
test_df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")

# we can make predictions directly with our trainer.
test_df['answer'] = 'A'

# Other than that we'll preprocess it in the same way we preprocessed test.csv
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

  0%|          | 0/200 [00:00<?, ?ex/s]

In [21]:
test_predictions = trainer.predict(tokenized_test_ds)

In [25]:
submission_df = test_df[['id']]
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)

submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,prediction
0,0,D B E
1,1,A B D
2,2,A C B
3,3,C E A
4,4,D B A


## Train ranking model using Deberta scores as relevances

In [None]:
# create ranking df, containing questions as groups and deberta scores as relevance scores
# cat = CatBoostRanker()
# rank_train = pd.melt (train_df - embeddings Q+sep+A)

In [None]:
train_pool = Pool(data=rank_train,
                  label=rank_train["label"],
                  group_id=rank_train["id"].values)

valid_pool = Pool(data=rank_valid,
                  label=rank_valid["label"],
                  group_id=rank_valid["id"].values)

In [None]:
ranker = CatBoostRanker(allow_writing_files=False, 
                        loss_function="QuerySoftMax",
                        rsm=0.9,
                        boosting_type="Plain",
                        bootstrap_type="MVS",
                        l2_leaf_reg=3,
                        depth=6,
                        iterations=1000,
                        use_best_model=True,
                        bagging_temperature=1,
                        subsample=0.8,
                        border_count=254,
                        random_seed=42,
                        random_strength=1,
                        learning_rate=0.3,
                        verbose=100,
                        task_type="GPU",
                        ).fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30)