In [1]:
import json
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from tqdm.notebook import tqdm

In [2]:
# Parse the test set from disk; the inference cell iterates over `data`
# and reads '__id__', 'question' and 'paragraphs' from each entry.
with open('data/test.json', encoding='utf8') as test_file:
    data = json.load(test_file)

In [3]:
# Cut-off applied to the class-1 softmax probability in the inference loop:
# a paragraph is kept only when its score is strictly greater than this value.
THRESHOLD = 0.5

In [4]:
# The tokenizer is built from the 'bert-base-multilingual-cased' pretrained
# vocabulary, while the sequence-classification model is loaded from the
# local ./model/ directory.
# NOTE(review): presumably ./model/ holds a checkpoint fine-tuned from the same
# base model, so tokenizer and model vocabularies agree — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = TFBertForSequenceClassification.from_pretrained('./model/')

In [5]:
# Score every (question, paragraph) pair with the classifier and collect the
# paragraphs whose class-1 probability exceeds THRESHOLD.
#
# The encoded pair is capped at the number of positions the loaded checkpoint
# was configured with. Without a cap, a long paragraph produces more tokens
# than the model has position embeddings for, which errors out or yields
# meaningless scores depending on the backend.
# NOTE(review): only `max_length` is passed so the call keeps working on older
# transformers releases; newer releases prefer an explicit `truncation=True`
# next to it (they log a one-time warning otherwise) — adjust once the
# library version is pinned.
max_seq_len = model.config.max_position_embeddings

submission_rows = []
for entry in tqdm(data):
    q_id = entry['__id__']
    q = entry['question']
    for p in entry['paragraphs']:
        inputs = tokenizer.encode_plus(
            q,
            p['text'],
            add_special_tokens=True,
            max_length=max_seq_len,
            return_tensors='tf',
        )
        logits = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'])[0]
        # One pair is scored per forward pass, so row 0 is the only row of probabilities.
        softmax = tf.math.softmax(logits).numpy()[0]
        # NOTE(review): assumes label index 1 means "relevant" — confirm against training.
        relevance_score = softmax[1]
        if relevance_score > THRESHOLD:
            submission_rows.append({'test_id': q_id, 'answer': p['id']})

HBox(children=(IntProgress(value=0, max=501), HTML(value='')))




In [6]:
# Write the selected (question, paragraph) pairs to the submission CSV.
# The columns are named explicitly so the file still carries the
# `test_id,answer` header when no paragraph cleared the threshold; an empty
# list of rows would otherwise produce a frame with no columns at all.
submission_df = pd.DataFrame(submission_rows, columns=['test_id', 'answer'])
submission_df.to_csv('data/submission.csv', index=False)