In [None]:
# !pip install git+https://github.com/huggingface/transformers
# !pip install sentencepiece
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install datasets colorama
# !pip install protobuf

In [None]:
import os, gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Version tag embedded in the checkpoint and saved-model directory names.
VER=1
# TRAIN WITH SUBSET OF 60K
# NOTE(review): NUM_TRAIN_SAMPLES is not referenced in any visible cell — confirm
# whether subsampling is still intended.
NUM_TRAIN_SAMPLES = 1_024
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
# NOTE(review): USE_PEFT is not referenced in any visible cell — confirm it is still needed.
USE_PEFT = False
# NUMBER OF LAYERS TO FREEZE
# DEBERTA LARGE HAS TOTAL OF 24 LAYERS
# The first FREEZE_LAYERS encoder layers get requires_grad=False before training.
FREEZE_LAYERS = 18
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER
# Passed to the tokenizer as max_length for each (context, question + option) pair.
MAX_INPUT = 512
# HUGGING FACE MODEL
# Checkpoint name used for both the tokenizer and the multiple-choice model.
# MODEL = 'MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli'
MODEL = 'microsoft/deberta-v3-large'

In [None]:
# Load the shared validation set. Missing cells are replaced with '' exactly as is
# done for the training frame, so the string concatenation performed during
# preprocessing never receives NaN/None for a context, prompt or option.
df_valid = pd.read_csv('./sail-val-stem-fold.csv').fillna('')
print('Validation data size:', df_valid.shape )
df_valid.head()

Validation data size: (2051, 10)


Unnamed: 0,prompt,context,A,B,C,D,E,answer,ict_prompt,fold
0,Which of the following statements accurately d...,The observed CMB angular power spectrum provid...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,Which of the following statements accurately d...,0
1,Which of the following is an accurate definiti...,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Which of the following is an accurate definiti...,2
2,Which of the following statements accurately d...,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,Which of the following statements accurately d...,1
3,Which of the following statements accurately d...,"Thus, it is 0 for sets describing points (0-di...",The dimension of an object in a CW complex is ...,The dimension of an object in a CW complex is ...,The dimension of an object in a CW complex is ...,The dimension of an object in a CW complex is ...,The dimension of an object in a CW complex dep...,A,Which of the following statements accurately d...,0
4,What is the role of axioms in a formal theory?,A formal system is an abstract structure used ...,Basis statements called axioms form the founda...,Axioms are supplementary statements added to a...,Axioms are redundant statements that can be de...,The axioms in a theory are used for experiment...,The axioms in a formal theory are added to pro...,A,What is the role of axioms in a formal theory?...,2


In [None]:
# Read the fold-annotated training data; empty cells become '' instead of NaN.
train_df = pd.read_csv('./sail-train-stem-fold.csv').fillna('')
print('Train data size:', train_df.shape )
train_df.head()

Train data size: (14971, 10)


Unnamed: 0,prompt,context,A,B,C,D,E,answer,ict_prompt,fold
0,What is the meaning of the Latin-derived speci...,The specific epithet vaginatus is derived from...,"""Tall""","""Hairy""","""Sheath""","""Grass-like""","""Round""",C,What is the meaning of the Latin-derived speci...,2
1,What was Wilhelm Schepmann's position in the N...,__NOTOC__ Wilhelm Schepmann (17 June 1894 – 26...,Wilhelm Schepmann was an officer in the Nazi p...,"Wilhelm Schepmann was a member of the SS, a se...",Wilhelm Schepmann was an SA general in Nazi Ge...,Wilhelm Schepmann was a civilian bureaucrat wh...,Wilhelm Schepmann was a prominent leader in th...,C,What was Wilhelm Schepmann's position in the N...,0
2,How did the 2017-18 Charlotte 49ers men's bask...,The 2016–17 Charlotte 49ers men's basketball t...,The team qualified for the C-USA tournament.,The team did not play any games during the sea...,"The team had a losing season, finishing with m...","The team had a winning season, finishing with ...","The team had an even season, finishing with an...",C,How did the 2017-18 Charlotte 49ers men's bask...,0
3,"According to the provided Wikipedia excerpt, w...",Mass media in Italy includes a variety of onli...,MYmovies.it features a comprehensive database ...,MYmovies.it provides international news relate...,MYmovies.it is particularly popular among Ital...,MYmovies.it is known for its streaming platfor...,MYmovies.it is the 56th most popular website a...,E,"According to the provided Wikipedia excerpt, w...",0
4,What is the certification status of The Smashi...,Vieuphoria is a long form music video by Ameri...,Vieuphoria was certified diamond by the RIAA i...,Vieuphoria was certified gold by the RIAA in l...,Vieuphoria was certified silver by the RIAA in...,Vieuphoria was not certified by the RIAA.,Vieuphoria was certified platinum by the RIAA ...,B,What is the certification status of The Smashi...,2


In [None]:
# Load the tokenizer that matches the MODEL checkpoint; it is used by preprocess()
# and handed to the data collator and the Trainer further down.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Two-way lookup between the answer letters A-E and their integer class ids 0-4.
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """Tokenize one multiple-choice row into five (context, question + option) pairs.

    The special-token markers are written into the text by hand and the tokenizer
    is called with ``add_special_tokens=False``. Truncation is restricted to the
    first segment (the context), with ``MAX_INPUT`` as the length limit per pair.

    Returns the tokenizer output with an extra integer ``label`` entry looked up
    from the row's ``answer`` letter.
    """
    choices = 'ABCDE'
    context_segment = "[CLS] " + example['context']
    question_prefix = " #### " + example['prompt'] + " [SEP] "

    # The same context is paired with each of the five candidate answers.
    first_sentence = [context_segment for _ in choices]
    second_sentences = [question_prefix + example[letter] + " [SEP]" for letter in choices]

    encoded = tokenizer(
        first_sentence,
        second_sentences,
        truncation='only_first',
        max_length=MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """Pad and batch multiple-choice features.

    Each feature is a dict whose tokenizer fields (``input_ids`` and friends) hold
    one sequence per answer choice, plus a ``label`` or ``labels`` entry. Calling
    the collator flattens all choices into one list, pads them together with
    ``tokenizer.pad`` and reshapes every field to ``(batch_size, num_choices, -1)``.

    The input feature dicts are left unmodified, so the same list of features can
    be collated more than once.
    """
    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Return a dict of padded tensors plus an int64 ``labels`` tensor."""
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels instead of popping them so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One flat list of per-choice dicts (feature-major order), built in a single
        # pass; the label entry is skipped because it is not a per-choice sequence.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (batch, choice, sequence) layout after flat padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
def map_at_3(predictions, labels):
    """Mean average precision at 3 for single-label multiple-choice predictions.

    Args:
        predictions: 2-D sequence of scores, one row per example and one column
            per answer choice; higher means more likely.
        labels: sequence with the index of the correct choice for each example.

    Returns:
        The mean over examples of 1/rank when the correct choice is among the
        three highest-scoring choices (rank 1, 2 or 3) and 0 otherwise.
        Returns 0.0 when there are no examples.

    Raises:
        ValueError: if ``predictions`` and ``labels`` differ in length.
    """
    if len(predictions) != len(labels):
        raise ValueError(
            f'predictions and labels differ in length: {len(predictions)} vs {len(labels)}'
        )
    if len(predictions) == 0:
        # Nothing to score; avoids an axis error and a division by zero.
        return 0.0

    # Indices of the (up to) three best-scoring choices per row, best first.
    top3 = np.argsort(-1*np.array(predictions), axis=1)[:, :3]
    targets = np.asarray(labels).reshape(-1, 1)
    # argsort yields distinct indices, so each row matches in at most one position.
    hits = top3 == targets
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    return float((hits * rank_weights).sum() / len(predictions))

def compute_metrics(p):
    """Metric hook passed to the Trainer: report MAP@3 for an evaluation pass.

    ``p`` must expose ``predictions`` and ``label_ids``; both are converted to
    plain Python lists with ``tolist()`` before being scored.
    """
    scores, targets = p.predictions.tolist(), p.label_ids.tolist()
    return {"map@3": map_at_3(scores, targets)}

In [None]:
%%time

# Containers created here; nothing in this cell writes to them.
final_dfs = pd.DataFrame()
cv_list = []

feature_columns = ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer']

# Every fold is evaluated on the same validation frame, so build and tokenize it
# once here instead of repeating identical work on each iteration.
df_val = df_valid
valid_set = Dataset.from_pandas(df_val[feature_columns], preserve_index=False)
tokenized_valid = valid_set.map(preprocess, remove_columns=feature_columns)

for fold in range(3):

    # Train one model on the rows assigned to this fold.
    df_train = train_df[train_df.fold==fold].copy()
    print(f'{df_train.shape}, {df_val.shape}')

    # preserve_index=False keeps the pandas index out of the dataset, so there is
    # no `__index_level_0__` column to strip (and no error if it were absent).
    train_set = Dataset.from_pandas(df_train[feature_columns], preserve_index=False)
    tokenized_train = train_set.map(preprocess, remove_columns=feature_columns)

    training_args = TrainingArguments(
        # NOTE(review): 0.8 spends 80% of all steps in warm-up — confirm this is
        # intended rather than e.g. 0.08.
        warmup_ratio=0.8,
        learning_rate=2e-6,
        per_device_train_batch_size=3,
        per_device_eval_batch_size=3,
        num_train_epochs=3,
        report_to='none',
        output_dir = f'./output/{fold}/checkpoints_{VER}',
        overwrite_output_dir=True,
        fp16=True,
        # NOTE(review): newer transformers releases name this `eval_strategy` —
        # confirm against the installed version.
        evaluation_strategy='epoch',
        save_strategy="epoch",
        # Reload the checkpoint with the best map@3 when training finishes, so the
        # model saved below is the best epoch rather than simply the last one.
        load_best_model_at_end=True,
        metric_for_best_model='map@3',
        lr_scheduler_type='cosine',
        save_total_limit=1,
        seed=42)

    model = AutoModelForMultipleChoice.from_pretrained(MODEL, ignore_mismatched_sizes=True)

    if FREEZE_EMBEDDINGS:
        print('Freezing embeddings.')
        for param in model.deberta.embeddings.parameters():
            param.requires_grad = False
    if FREEZE_LAYERS>0:
        print(f'Freezing {FREEZE_LAYERS} layers.')
        # Only the first FREEZE_LAYERS encoder layers are frozen; the rest train.
        for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
            for param in layer.parameters():
                param.requires_grad = False

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics = compute_metrics,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid
    )

    trainer.train()
    trainer.save_model(f'./output/{fold}/model_v{VER}')

    # Drop this fold's objects and release cached GPU memory before the next
    # model is loaded.
    del model, trainer, tokenized_train, train_set, df_train
    gc.collect()
    torch.cuda.empty_cache()
    # Add a `break` here to run a single fold while debugging.

# The shared validation objects are no longer needed once all folds are done.
del tokenized_valid, valid_set, df_val

(4993, 10), (2051, 10)


Map:   0%|          | 0/4993 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing embeddings.
Freezing 18 layers.


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Map@3
1,1.6115,1.608698,0.60247
2,1.6097,1.061214,0.782626
3,1.1522,0.804309,0.815862


(4959, 10), (2051, 10)


Map:   0%|          | 0/4959 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing embeddings.
Freezing 18 layers.


Epoch,Training Loss,Validation Loss,Map@3
1,1.6097,1.609348,0.475215
2,1.6081,1.149782,0.771087
3,1.2007,0.928154,0.785714


(5019, 10), (2051, 10)


Map:   0%|          | 0/5019 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing embeddings.
Freezing 18 layers.


Epoch,Training Loss,Validation Loss,Map@3
1,1.6132,1.60927,0.527222
2,1.3925,1.07009,0.753454
3,1.168,0.985096,0.758085


CPU times: user 1h 58min 23s, sys: 29min 9s, total: 2h 27min 33s
Wall time: 2h 17min 55s
