In [None]:
import os
import yaml
import shutil
import collections
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split

import wandb
import torch

import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
from transformers import set_seed

# custom functions from my repo
%cd QA-document-parts
from custom_functions.functions import find_labels, preprocess_training_examples, preprocess_validation_examples, postprocess_predictions
%cd ..

from datasets import load_dataset, load_metric, Dataset

In [None]:
# Tokenizer and question-answering model are both loaded from the same
# pretrained identifier, so the name is spelled out once.
checkpoint = "miglss/mdeberta-v3-base-konturDS"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [None]:
# Read the examples to predict on. load_dataset('json', data_files=...) puts
# a single file under the 'train' split, which is why 'train' is used below
# even though this is test data.
dataset_pred = load_dataset('json', data_files='test.json')

# Tokenize the examples into model-ready features, dropping the original
# columns so only the features produced by the preprocessing function remain.
# The function used here is the one actually imported from
# custom_functions.functions; the previous name `prepare_validation_features`
# is not defined anywhere in this notebook and raised NameError on a fresh kernel.
# NOTE(review): assumes preprocess_validation_examples takes the batch as its
# only required argument -- if it needs more (e.g. a tokenizer), pass them via
# `fn_kwargs=` on .map(). Confirm against the helper's signature.
tokenized_predict = dataset_pred['train'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset_pred['train'].column_names)

In [None]:
# A Trainer is built here only as an inference wrapper: no TrainingArguments
# are supplied, so the library defaults are used.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

# Run the model over the tokenized features; `.predictions` of the result is
# what the post-processing cell consumes.
raw_predictions = trainer.predict(test_dataset=tokenized_predict)

In [None]:
# Select the best answer for every example from the raw model outputs.
# The result is iterated below as a mapping of example id -> answer text.
final_pred = postprocess_predictions(
    dataset_pred['train'],
    tokenized_predict,
    raw_predictions.predictions,
    n_best_size=100,
    max_answer_length=100,
)

# Re-shape the mapping into a list of per-example records; the
# "no_answer_probability" field is fixed at 0.0 for every prediction.
formatted_pred = [
    {
        "id": example_id,
        "prediction_text": answer_text,
        "no_answer_probability": 0.0,
    }
    for example_id, answer_text in final_pred.items()
]

In [None]:
# Build, for every prediction, a record holding the answer text plus its
# character span inside the corresponding source text.
# NOTE(review): predictions are matched to texts by position, which assumes
# `formatted_pred` is in the same order as dataset_pred['train'] -- confirm
# that postprocess_predictions preserves example order.

# Materialise the 'text' column once instead of re-reading it on every iteration.
texts = dataset_pred['train']['text']

extract = []
for text, pred in zip(texts, formatted_pred):
    answer = pred['prediction_text']
    # str.find returns -1 when the answer is not a substring of the text
    # and 0 for an empty answer.
    start = text.find(answer)
    # The span length must come from the *current* prediction; it was
    # previously taken from formatted_pred[0] for every row.
    end = start + len(answer)
    extract.append({'text': [answer],
                    'answer_start': [start],
                    'answer_end': [end]})

In [None]:
# Attach the per-example answer spans to the original examples as a new
# "extracted_part" column, then write the augmented dataset to disk as JSON.
final_predict = dataset_pred['train'].add_column(
    name="extracted_part",
    column=extract,
)
final_predict.to_json(path_or_buf='predictions.json')