In [None]:
import os
import yaml
import shutil
import collections
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split

import wandb
import torch

import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
from transformers import set_seed

# custom functions from my repo
%cd QA-document-parts
from custom_functions.functions import find_labels, preprocess_training_examples, preprocess_validation_examples, postprocess_predictions 
%cd ..

from datasets import load_dataset, load_metric, Dataset
metric = load_metric("squad_v2")

!pip install huggingface_hub

In [None]:
!huggingface-cli login

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing in `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Log in to Weights & Biases; the Trainer defined below is configured with report_to='wandb'.
wandb.login() 

In [None]:
# Seed every random number generator used in this notebook with one shared
# value so that repeated runs are comparable.
LUCKY_SEED = 42

np.random.seed(LUCKY_SEED)
torch.manual_seed(LUCKY_SEED)
if torch.cuda.is_available():
    # Seed every visible GPU as well.
    torch.cuda.manual_seed_all(LUCKY_SEED)
set_seed(LUCKY_SEED)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

## Train tokenizer on new corpus

In [None]:
# Load the raw examples from the local JSON file; the cells below read them as data["train"].
data = load_dataset('json', data_files = "train.json")

In [None]:
def get_training_corpus(dataset, batch_size=1000, split="train", text_column="text"):
    """
    Generator that feeds tokenizer training with batches of raw texts.

    Parameters
    ----------
    dataset : mapping
        Maps split names to a sized object that supports slicing and returns,
        for a slice, a mapping from column name to the sliced values.
    batch_size : int, default 1000
        Number of rows taken per yielded batch (the last batch may be shorter).
    split : str, default "train"
        Which split of ``dataset`` to read.
    text_column : str, default "text"
        Column holding the raw text.

    Yields
    ------
    The ``text_column`` values of each consecutive slice of ``batch_size`` rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (raised when iteration starts,
        because this is a generator).
    """
    if batch_size < 1:
        # A negative step would silently yield nothing; fail loudly instead.
        raise ValueError("batch_size must be a positive integer")

    rows = dataset[split]
    for start_idx in range(0, len(rows), batch_size):
        yield rows[start_idx : start_idx + batch_size][text_column]

In [None]:
# Pretrained tokenizer used as the template for the retrained one, and the
# vocabulary size requested for the new tokenizer.
BASE_TOKENIZER_CHECKPOINT = "timpal0l/mdeberta-v3-base-squad2"
NEW_VOCAB_SIZE = 23000

old_tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER_CHECKPOINT)

# Lazy generator over the training texts; consumed once by the call below.
text_corpus = get_training_corpus(data)

# NOTE(review): the model loaded in train() comes from the original checkpoint;
# confirm that the preprocessing helpers and the model agree on which
# vocabulary (old vs. retrained) is actually used for encoding.
tokenizer = old_tokenizer.train_new_from_iterator(text_corpus, vocab_size=NEW_VOCAB_SIZE)

Downloading (…)okenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
# Upload the retrained tokenizer to the Hub repository "mdeberta-v3-base-konturDS"
# (the fine-tuned model is pushed under the same repository name further down).
tokenizer.push_to_hub("mdeberta-v3-base-konturDS")

## Preprocessing

In [None]:
# Build one stratification class per example so that the splits created below
# keep the same mix of question type and answer presence:
#   0 - warranty-obligations question, no answer in the context
#   1 - warranty-obligations question, answer provided
#   2 - any other question (the contract-performance one in this data), no answer
#   3 - any other question, answer provided
_warranty_question = 'обеспечение гарантийных обязательств'
# How an example without an answer is encoded in 'extracted_part'.
_no_answer_part = {'text': [''], 'answer_start': [0], 'answer_end': [0]}

labels_for_stratify = []
# Iterate over the rows once instead of re-indexing data['train'][i] for every field.
for _row in data['train']:
    _question_offset = 0 if _row['label'] == _warranty_question else 2
    _answer_offset = 0 if _row['extracted_part'] == _no_answer_part else 1
    labels_for_stratify.append(_question_offset + _answer_offset)

In [None]:
# Attach the stratification class to every row so the splits can be stratified on it.
# NOTE(review): this cell is presumably not re-runnable once the splits have been
# written back into `data` (the column already exists and the row count differs);
# restart from the data-loading cell instead.
data["train"] = data["train"].add_column("stratify", labels_for_stratify)


def _stratified_split(dataset, test_size):
    """
    Split `dataset` in two, stratified on its "stratify" column.

    Uses LUCKY_SEED as random_state so the split follows the notebook-wide seed.
    Returns (remaining_part, held_out_part), both wrapped as `Dataset` objects.
    """
    remaining, held_out = train_test_split(
        dataset,
        test_size=test_size,
        stratify=dataset['stratify'],
        random_state=LUCKY_SEED,
    )
    # The pieces returned by train_test_split are wrapped back into Dataset
    # objects, exactly as the original code did after each split.
    return Dataset.from_dict(remaining), Dataset.from_dict(held_out)


# 15% of all rows become the test set; 20% of the remainder becomes validation.
train, test = _stratified_split(data['train'], test_size=0.15)
train, val = _stratified_split(train, test_size=0.2)


In [None]:
# Store the three splits in the DatasetDict under their final names.
# NOTE: the name `train` is rebound to the training function later in this
# notebook, so this cell must run before that definition.
for _split_name, _split in (("train", train), ("val", val), ("test", test)):
    data[_split_name] = _split

# Display the resulting DatasetDict (split names, features, row counts).
data

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'extracted_part', 'stratify'],
        num_rows: 1223
    })
    val: Dataset({
        features: ['id', 'text', 'label', 'extracted_part', 'stratify'],
        num_rows: 306
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'extracted_part', 'stratify'],
        num_rows: 270
    })
})

In [None]:
def preprocess_features(data):
    """
    Rename columns to QA-style names and expose the answer span as flat columns.

    - "label" becomes "question" and "text" becomes "context".
    - "answer_start" / "answer_end" are added, each holding the first element
      of the corresponding list inside every row's 'extracted_part'.

    Returns the transformed dataset (the result of the chained
    rename_column / add_column calls).
    """
    renamed = data.rename_column("label", "question").rename_column("text", "context")

    parts = renamed['extracted_part']
    span_starts = [part["answer_start"][0] for part in parts]
    span_ends = [part["answer_end"][0] for part in parts]

    with_start = renamed.add_column("answer_start", span_starts)
    return with_start.add_column("answer_end", span_ends)

In [None]:
# Apply the same column renaming / answer-offset extraction to every split.
for _split_name in ("train", "val", "test"):
    data[_split_name] = preprocess_features(data[_split_name])

In [None]:
# tokenize dataset
def _tokenize_split(split_name, preprocess_fn):
    """Map `preprocess_fn` over one split in batched mode, dropping the split's original columns."""
    split = data[split_name]
    return split.map(preprocess_fn, batched=True, remove_columns=split.column_names)


# Training uses the training-specific preprocessing; validation and test share
# the validation preprocessing.
tokenized_dataset = _tokenize_split("train", preprocess_training_examples)
validation_features = _tokenize_split("val", preprocess_validation_examples)
test_features = _tokenize_split("test", preprocess_validation_examples)


## Define train function

In [42]:
def train(cfg):
    """
    Fine-tune 'timpal0l/mdeberta-v3-base-squad2' for question answering.

    Parameters
    ----------
    cfg : mapping (plain dict or wandb.config)
        Required keys: 'attention_probs_dropout_prob', 'hidden_dropout_prob',
        'pooler_dropout', 'count_of_epoch', 'lr', 'weight_decay', 'warmup_ratio'.
        Optional key: 'batch_size', the per-device training batch size
        (8 when the key is absent).

    Returns
    -------
    (trainer, model)
        The Trainer after `trainer.train()` has finished, and the model it trained.

    Notes
    -----
    Reads the notebook-level objects `tokenized_dataset`, `validation_features`,
    `data`, `tokenizer`, `metric`, `device` and `LUCKY_SEED`.
    """
    # Best exact-match score seen across all evaluations of this run.
    max_exact_score = 0.0

    def compute_metrics(eval_pred):
        """
        Score the validation predictions with `metric` and track the best exact score.

        Returns a dict with 'exact_score', 'f1_score' and the running
        'max_exact_score' for this training run.
        """
        nonlocal max_exact_score

        # Only the predictions are needed; references are rebuilt from data["val"] below.
        logits, _ = eval_pred

        # Choose the best answer span for every validation example.
        final_pred = postprocess_predictions(data["val"],
                                             validation_features,
                                             logits,
                                             n_best_size = 20,
                                             max_answer_length = 100)

        # A fixed no_answer_probability of 0.0 is reported for every prediction.
        formatted_pred = [{"id": k,
                           "prediction_text": v,
                           "no_answer_probability": 0.0} for k, v in final_pred.items()]

        references = [{"id": ex["id"],
                       "answers":
                      {'answer_start': ex["extracted_part"]['answer_start'],
                       'text': ex['extracted_part']['text']}} for ex in data["val"]]

        metric_dict = metric.compute(predictions=formatted_pred, references=references)

        max_exact_score = max(max_exact_score, metric_dict['exact'])

        return {'exact_score': metric_dict['exact'],
                'f1_score': metric_dict['f1'],
                'max_exact_score': max_exact_score}

    # 'batch_size' was accepted in cfg but never forwarded to the Trainer, so it
    # had no effect. Forward it now; fall back to 8 (the TrainingArguments
    # default) for configs that do not define it.
    try:
        train_batch_size = cfg['batch_size']
    except KeyError:
        train_batch_size = 8

    model = AutoModelForQuestionAnswering.from_pretrained(
            'timpal0l/mdeberta-v3-base-squad2',
            attention_probs_dropout_prob=cfg['attention_probs_dropout_prob'],
            hidden_dropout_prob=cfg['hidden_dropout_prob'],
            pooler_dropout=cfg['pooler_dropout']).to(device)

    # Define the TrainingArguments with hyperparameters from the config.
    # NOTE(review): output_dir is an absolute path at the filesystem root;
    # confirm it is writable in the target environment.
    args = TrainingArguments(
        output_dir='/roberta/model',
        evaluation_strategy="epoch",
        num_train_epochs=cfg['count_of_epoch'],
        per_device_train_batch_size=train_batch_size,
        learning_rate=cfg['lr'],
        weight_decay=cfg['weight_decay'],
        warmup_ratio = cfg['warmup_ratio'],
        report_to='wandb',
        seed=LUCKY_SEED,
        logging_steps=20,
        save_steps=1000
    )

    # Create the Trainer
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_dataset,
        eval_dataset=validation_features,
        data_collator=default_data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    return trainer, model


## Hyperparameter optimization

In [None]:
# Read the sweep configuration that is handed to wandb.sweep in the next cell.
# NOTE(review): yaml.FullLoader is more permissive than a plain config file
# needs; consider yaml.safe_load if this file could ever come from an
# untrusted source.
with open('roBERTa_config.yaml') as config_file:
    hyper_config = yaml.load(config_file, Loader=yaml.FullLoader)

# Display the parsed configuration.
hyper_config

In [None]:
# Create the sweep in the W&B project and keep the returned id in `sweep_id`
# so the agent can be pointed at this sweep.
sweep_id = wandb.sweep(hyper_config, project='kontur_dsaaaa_2023')

Create sweep with ID: gxwjdfw4
Sweep URL: https://wandb.ai/baozhg/kontur_dsaaaa_2023/sweeps/gxwjdfw4


In [None]:
def run_train(config=None):
    """
    Run one sweep trial; this is the function handed to `wandb.agent`.

    Opens a W&B run in project 'kontur_dsaaaa_2023', takes the run's
    hyperparameters from `wandb.config` and passes them to `train`.

    Returns whatever `train` returns, i.e. the `(trainer, model)` pair.
    """
    with wandb.init(config=config, project='kontur_dsaaaa_2023'):
        return train(wandb.config)

In [None]:
# run hyperparameter optimization
# Use the id returned by wandb.sweep rather than a pasted literal, so that a
# fresh "Restart & Run All" drives the sweep it has just created instead of a
# sweep left over from an earlier session.
wandb.agent(sweep_id,
            run_train,
            count=20,
            project='kontur_dsaaaa_2023')

## Train

In [None]:
# Final training run with the best set of hyperparameters.
cfg = {
    'count_of_epoch': 30,
    'batch_size': 8,
    'lr': 0.00008,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'hidden_dropout_prob': 0.3,
    'attention_probs_dropout_prob': 0.3,
    'pooler_dropout': 0.2,
}

trainer, model = train(cfg)

In [None]:
# Upload the fine-tuned model to the Hub repository "mdeberta-v3-base-konturDS",
# the same repository name the retrained tokenizer was pushed to.
model.push_to_hub("mdeberta-v3-base-konturDS")

## Check quality on test dataset

In [None]:
# Fresh Trainer used only for inference on the held-out test features; no
# TrainingArguments are passed, so the library defaults apply.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

raw_predictions = trainer.predict(test_features)

In [64]:
# Turn the raw model outputs into one answer string per test example.
final_pred = postprocess_predictions(
    data["test"],
    test_features,
    raw_predictions.predictions,
    n_best_size=20,
    max_answer_length=100,
)

# Predictions in the layout expected by `metric`; a fixed
# no_answer_probability of 0.0 is reported for every example.
formatted_pred = [
    {"id": example_id, "prediction_text": answer_text, "no_answer_probability": 0.0}
    for example_id, answer_text in final_pred.items()
]

# Gold answers taken from each test example's 'extracted_part'.
references = [
    {
        "id": example["id"],
        "answers": {
            'answer_start': example["extracted_part"]['answer_start'],
            'text': example['extracted_part']['text'],
        },
    }
    for example in data["test"]
]

  0%|          | 0/270 [00:00<?, ?it/s]

In [65]:
# Recorded run: exact match 84.44 and F1 97.48 on the 270 test examples (0.844 exact score).
# NOTE(review): the recorded output counts all 270 references as "HasAns";
# presumably because no-answer examples carry an empty-string answer rather
# than an empty answer list -- confirm this is the intended scoring.
metric.compute(predictions=formatted_pred, references=references)

{'exact': 84.44444444444444,
 'f1': 97.47689267517949,
 'total': 270,
 'HasAns_exact': 84.44444444444444,
 'HasAns_f1': 97.47689267517949,
 'HasAns_total': 270,
 'best_exact': 84.44444444444444,
 'best_exact_thresh': 0.0,
 'best_f1': 97.47689267517949,
 'best_f1_thresh': 0.0}