In [3]:
import os
import json
import numpy as np 
import pandas as pd
import re

from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import warnings

from scipy import spatial

# Long answer prediction

In [4]:
stop_words = text.ENGLISH_STOP_WORDS.union(["book"])
n_answers = 1
warnings.filterwarnings("ignore")

In [5]:
def predict(json_data, annotated=False):
    # Parse JSON data
    candidates = json_data['long_answer_candidates']
    doc_tokenized = json_data['document_text'].split(' ')
    question = json_data['question_text']
    question_s = question.split(' ') 
    if annotated:
        ann = json_data['annotations'][0]

    # TFIDF for the document
    
    #Convert a collection of raw documents to a matrix of TF-IDF features.
    tfidf = TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words)
    tfidf.fit([json_data['document_text']])  # Learn vocabulary and idf from training set.
    q_tfidf = tfidf.transform([question]).todense()  #  Transform documents to document-term matrix.

    # Find the nearest answer from candidates
    scores = []
    for i, c in enumerate(candidates):
        s, e = c['start_token'], c['end_token']
        t = ' '.join(doc_tokenized[s:e])
        t_tfidf = tfidf.transform([t]).todense()
        score = 1 - spatial.distance.cosine(q_tfidf, t_tfidf)
        scores.append(score)

    # Put the nearest condidate   
        
    ans = (np.array(candidates)[np.argsort(scores)])[-n_answers:].tolist()
    
    
    if np.max(scores) < 0.2:
        ans_long = ['-1:-1']
        ans = [{'start_token': 0, 'end_token': 0}]
    else:
        ans_long = [str(a['start_token']) + ':' + str(a['end_token']) for a in ans]
    
        
    # Preparing data for debug
    if annotated:
        ann_long_text = ' '.join(doc_tokenized[ann['long_answer']['start_token']:ann['long_answer']['end_token']])
        
    else:
        ann_long_text = ''
        
        
    ans_long_text = [' '.join(doc_tokenized[a['start_token']:a['end_token']]) for a in ans]
                    
    return ans_long, question, ann_long_text, ans_long_text

In [7]:
%%time
ids = []
anns = []
preds = []

# Debug data
questions = []
ann_texts = []
ans_texts = []

n_samples = 300_000

with open('simplified-nq-train.jsonl', 'r') as json_file:
    cnt = 0
    for line in tqdm(json_file):
        json_data = json.loads(line)

        l_ann = str(json_data['annotations'][0]['long_answer']['start_token']) + ':' + \
            str(json_data['annotations'][0]['long_answer']['end_token'])
        

        l_ans, question, ann_long_text, ans_long_text = predict(json_data, annotated=True)
        
        ids += [str(json_data['example_id']) + '_long']*len(l_ans)
        
    
        anns += [l_ann]*len(l_ans)
       
        
        preds += l_ans
        
        questions += [question]*len(l_ans)

        ann_texts += [ann_long_text]*len(l_ans)
       
        ans_texts += ans_long_text
        
        
        cnt += 1
        if cnt >= n_samples:
            break

            
train_ann = pd.DataFrame()
train_ann['example_id'] = ids
train_ann['question'] = questions
train_ann['CorrectString'] = anns
train_ann['CorrectText'] = ann_texts
if len(preds) > 0:
    train_ann['PredictionString'] = preds
    train_ann['PredictionText'] = ans_texts
    


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

CPU times: user 4h 45s, sys: 20.1 s, total: 4h 1min 5s
Wall time: 4h 39s


In [8]:
f1 = f1_score(train_ann['CorrectString'].values, train_ann['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}')

F1-score: 0.1012
