In [1]:
from typing import Optional, Union
import pandas as pd
import numpy as np
# from colorama import Fore, Back, Style
from tqdm.notebook import tqdm
import torch
from datasets import Dataset
import gc
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel, EarlyStoppingCallback
from sklearn.model_selection import KFold
import sklearn
# from torchnlp.nn import Attention  # pip install pytorch-nlp

2023-08-31 09:44:01.582676: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-31 09:44:01.637935: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Set CUDA_VISIBLE_DEVICES to "0" for this process (intended to restrict CUDA to a single device).
# NOTE(review): torch is already imported in the first cell; confirm this
# assignment still takes effect before CUDA is first initialized.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# Read the training and validation splits from their CSV files, in that order.
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/jisukim/online/kgl/dataset/tot_train.csv",
        "/home/jisukim/online/kgl/dataset/tot_valid.csv",
    )
)

In [4]:
# Replace every missing cell in both splits with the literal string 'None'.
train, valid = (frame.fillna('None') for frame in (train, valid))

In [5]:
# Two-way lookup between the option letters 'A'..'E' and their positions 0..4.
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

In [6]:
def preprocess(example, max_length: Optional[int] = None):
    """Tokenize one multiple-choice row as five (prompt, option) pairs.

    Args:
        example: Mapping with a 'prompt' entry, one entry for each option
            letter 'A'..'E', and an 'answer' entry holding the correct letter.
        max_length: Optional length cap forwarded to the tokenizer together
            with ``truncation=True``. With the default ``None`` the call is
            the same as before; for the tokenizer used in this notebook the
            run log then reports that no truncation is applied. Supply a
            value (e.g. through ``Dataset.map(..., fn_kwargs=...)``) to make
            truncation take effect.

    Returns:
        The tokenizer output for the five pairs, with an added integer
        'label' entry: the position of the answer letter in 'ABCDE'.

    Raises:
        KeyError: if ``example['answer']`` is not one of 'A'..'E'.
    """
    # The same prompt is paired with each of the five candidate answers.
    first_sentence = [example['prompt']] * 5
    second_sentences = [example[option] for option in 'ABCDE']
    # `tokenizer` and `option_to_index` are module-level names defined in other cells.
    tokenized_example = tokenizer(
        first_sentence,
        second_sentences,
        truncation=True,
        max_length=max_length,
    )
    tokenized_example['label'] = option_to_index[example['answer']]

    return tokenized_example

In [7]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad a batch of multiple-choice examples and stack them into tensors.

    Each incoming feature holds, for every tokenizer field (e.g.
    ``input_ids``), a list with one entry per answer choice, plus a
    ``label`` (or ``labels``) entry. The choices of all examples are
    flattened, padded together through ``tokenizer.pad`` and reshaped to
    ``(batch_size, num_choices, -1)``.

    The input feature dicts are left untouched: labels are read, not popped.
    """

    # Object whose ``pad`` method pads the flattened per-choice features.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded to ``tokenizer.pad`` as ``padding``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded to ``tokenizer.pad`` as ``max_length``.
    max_length: Optional[int] = None
    # Forwarded to ``tokenizer.pad`` as ``pad_to_multiple_of``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a dict of tensors.

        Args:
            features: Non-empty list of per-example dicts as described on
                the class.

        Returns:
            Dict mapping each padded field to a tensor viewed as
            ``(batch_size, num_choices, -1)``, plus ``'labels'`` as an int64
            tensor with one entry per example.
        """
        # The label key is decided from the first feature only.
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels without popping so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One flat dict per (example, choice) pair, in example-major order.
        # A single comprehension avoids the quadratic `sum(list_of_lists, [])`.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure on every padded field.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [8]:
# Model identifier passed to AutoTokenizer.from_pretrained and
# AutoModelForMultipleChoice.from_pretrained in later cells.
model_path = "microsoft/deberta-v3-large"

In [9]:
# Load the tokenizer for `model_path`. preprocess() and the Trainer setup
# both read this module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Wrap both pandas frames as `datasets.Dataset` objects (train first, then valid).
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train, valid)
)

In [11]:
# Columns dropped from the mapped datasets once preprocess() has run on a row.
raw_columns = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'dtype']

# Tokenize every row of each split with preprocess().
tokenized_train_dataset = train_dataset.map(
    preprocess,
    remove_columns=list(raw_columns),
)
tokenized_valid_dataset = valid_dataset.map(
    preprocess,
    remove_columns=list(raw_columns),
)

Map:   0%|          | 0/28172 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3522 [00:00<?, ? examples/s]

In [12]:
# Load the checkpoint at `model_path` as a multiple-choice model. The log
# below lists the checkpoint's lm_predictions / mask_predictions weights as
# unused for this architecture.
model = AutoModelForMultipleChoice.from_pretrained(model_path)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassif

In [13]:
def map_at_3(predictions, labels):
    """Mean average precision at 3 for single-label multiple choice.

    Args:
        predictions: 2-D array-like of scores, one row per example and one
            column per choice; higher scores rank first.
        labels: Sequence with the index of the correct choice per example.

    Returns:
        The mean over examples of 1/rank when the correct choice is among
        the three top-scored choices (rank 1..3), and 0 otherwise.
    """
    scores = np.array(predictions)
    # Column indices of the three best-scored choices per row, best first.
    top_three = np.argsort(-1 * scores, axis=1)[:, :3]

    total = 0
    for ranked_choices, truth in zip(top_three, labels):
        gains = []
        for rank, choice in enumerate(ranked_choices, start=1):
            gains.append(1 / rank if truth == choice else 0)
        total += np.sum(gains)

    return total / len(predictions)

def compute_metrics(p):
    """Custom evaluation hook handed to the Trainer.

    Args:
        p: Object exposing `predictions` and `label_ids`, each supporting
            `.tolist()`.

    Returns:
        Dict with the single key "map@3" holding the value of map_at_3.
    """
    score = map_at_3(p.predictions.tolist(), p.label_ids.tolist())
    return {"map@3": score}

## Train

In [16]:
# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir='./output',
    overwrite_output_dir=True,
    save_strategy='steps',
    save_total_limit=2,
    # --- evaluation, logging and best-checkpoint selection ---
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    metric_for_best_model='map@3',
    load_best_model_at_end=True,
    # --- optimisation ---
    learning_rate=2e-6,
    # NOTE(review): warmup spans 80% of the schedule; the logged loss stays
    # near 1.61 for roughly the first 3000 steps. Confirm this is intended.
    warmup_ratio=0.8,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- misc ---
    report_to='none',
    seed=42,
)

In [17]:
# Wire the model, data, collator and metric hook into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    # Early stopping is currently switched off:
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [18]:
# Run fine-tuning. The cell output shows the per-evaluation loss / map@3
# table and the returned TrainOutput.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Map@3
500,1.6101,1.609379,0.353682
1000,1.6152,1.609216,0.375497
1500,1.6136,1.608882,0.422866
2000,1.6119,1.608399,0.486087
2500,1.6118,1.60751,0.53109
3000,1.6099,1.603594,0.573916
3500,1.5926,1.565874,0.602877
4000,1.4914,1.365419,0.638889
4500,1.3678,1.285191,0.649394
5000,1.3195,1.24179,0.660231


TrainOutput(global_step=17610, training_loss=1.1786311624538892, metrics={'train_runtime': 10348.5174, 'train_samples_per_second': 13.612, 'train_steps_per_second': 1.702, 'total_flos': 7.198997478791544e+16, 'train_loss': 1.1786311624538892, 'epoch': 5.0})