## This notebook was created and run on Kaggle and has not been tested in other environments.

In [1]:
# upgrade transformers to use save_strategy='epoch'
# !pip install transformers --upgrade

In [2]:
# installing evaluate to calculate accuracy 
# !pip install evaluate

In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import set_seed

set_seed(1122) # for reproducible code
dataset = load_dataset('json', data_files='/kaggle/input/nlpa2-dataset/A2 dataset/train.jsonl') # load data

2024-06-06 09:50:17.005284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-06 09:50:17.005415: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-06 09:50:17.280630: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
dataset['train'][0]

{'Id': 'train-1',
 'Question': 'There is a light rain today. What happened as a result?',
 'Alternative1': 'The roots of many plants are not moistened by rain.',
 'Alternative2': 'Tourists have seen many ripples.',
 'Answer': 1}

In [5]:
from datasets import DatasetDict

# train_test_split and build HF DatasetDict
train_test_valid = dataset['train'].train_test_split(test_size=0.1)
# test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
dataset = DatasetDict({
    'train': train_test_valid['train'],
    'valid': train_test_valid['test'],
#     'valid': test_valid['train']
})

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'Answer'],
        num_rows: 13420
    })
    valid: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'Answer'],
        num_rows: 1492
    })
})

In [None]:
# load eval.json set
testset = load_dataset('json', data_files='/kaggle/input/nlpa2-dataset/A2 dataset/eval.jsonl')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# adding a throw-away lable column so that the preprocess function can be reused on the eval.jsonl set with minimal revision
_col = [1]*len(testset['train'])
testset_new = testset['train'].add_column("label", _col)

In [9]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
from transformers import AutoTokenizer, DebertaV2TokenizerFast
# load tokenizer
tokenizer_deberta = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v3-large")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]



In [11]:
"""
Preproces & tokenize function for training dataset
"""
ans_names = ['Alternative1', 'Alternative2']

def preprocess_function(examples):
    questions = [[context] * 2 for context in examples["Question"]]
    answers = [[f"{examples[ans][i]}" for ans in ans_names] for i in range(len(examples['Answer']))]
    
    questions = sum(questions, [])
    answers = sum(answers, [])
    
    tokenized_examples = tokenizer_deberta(questions, answers, truncation=True, max_length=256) #max_length=
    
    return {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [12]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/13420 [00:00<?, ? examples/s]

Map:   0%|          | 0/1492 [00:00<?, ? examples/s]

In [13]:
"""
Preprocess & tokenize function for test dataset (eval.jsonl)
"""

ans_names = ['Alternative1', 'Alternative2']

def preprocess_function_testset(examples):
    questions = [[context] * 2 for context in examples["Question"]]
    answers = [[f"{examples[ans][i]}" for ans in ans_names] for i in range(len(examples['label']))]
    
    questions = sum(questions, [])
    answers = sum(answers, [])
    
    tokenized_examples = tokenizer_deberta(questions, answers, truncation=True)
    
    return {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [14]:
encoded_testset_new = testset_new.map(preprocess_function_testset, batched=True)
print(encoded_testset_new)

Map:   0%|          | 0/4261 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4261
})


In [15]:
print(encoded_dataset)
print(encoded_testset_new)

DatasetDict({
    train: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'Answer', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 13420
    })
    valid: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'Answer', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1492
    })
})
Dataset({
    features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4261
})


In [17]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs to the longest sequence length of the batch.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name)-1 for feature in features] # changed this
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [18]:
# rename "Answer" column to "label" for straight-forward training
encoded_dataset = encoded_dataset.rename_column("Answer", "label")
print(encoded_dataset)

DatasetDict({
    train: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 13420
    })
    valid: Dataset({
        features: ['Id', 'Question', 'Alternative1', 'Alternative2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1492
    })
})


In [19]:
# testing the data collator
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_dataset["train"][i].items() if k in accepted_keys} for i in range(3)]
batch = DataCollatorForMultipleChoice(tokenizer_deberta)(features)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [20]:
batch['labels']

tensor([1, 1, 1])

In [22]:
[tokenizer_deberta.decode(batch["input_ids"][1][i].tolist()) for i in range(2)]

['[CLS] This country has advanced seawater desalination technology. What happened as a result?[SEP] It has introduced deluxe shoelaces.[SEP][PAD][PAD][PAD][PAD][PAD]',
 '[CLS] This country has advanced seawater desalination technology. What happened as a result?[SEP] The arid country turns the bay into its water source.[SEP]']

In [23]:
# build model
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model_deberta = AutoModelForMultipleChoice.from_pretrained("OpenAssistant/reward-model-deberta-v3-large-v2")

config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
# training
training_args = TrainingArguments(
    output_dir="NLP_A2_3rd",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=6e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model_deberta,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer_deberta,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer_deberta),
    compute_metrics=compute_metrics,
)


trainer.train()

In [26]:
# create predictions
preds_output = trainer.predict(encoded_testset_new)

In [27]:
predictions = preds_output.predictions

In [28]:
predictions  # test

array([[ 2.4604678 ,  0.77434206],
       [ 9.338473  ,  2.7560866 ],
       [-1.4056066 , 11.298847  ],
       ...,
       [ 1.9643338 ,  8.827789  ],
       [11.742342  , -3.0449402 ],
       [-9.259441  , -1.6466047 ]], dtype=float32)

In [29]:
preds = np.argmax(predictions, axis=1)
print(len(preds))

4261


In [30]:
df = pd.DataFrame(preds)
df.columns = ['Target']

df['Target'] = df['Target']+1
df.head()

Unnamed: 0,Target
0,1
1,1
2,2
3,2
4,1


In [31]:
df.to_csv('submission.csv', index=None)

In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub()