In [78]:
import os
import json
import numpy as np 
import pandas as pd
import re
import time
import psutil
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm
from concurrent.futures import ProcessPoolExecutor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from multiprocessing import Process, Manager
from scipy import spatial
import warnings


# Long answer prediction

In [79]:
stop_words = text.ENGLISH_STOP_WORDS.union(["book"])
n_answers = 1
warnings.filterwarnings("ignore")

In [11]:
def predict(json_data, annotated=False):
    # Parse JSON data
    candidates = json_data['long_answer_candidates']
    doc_tokenized = json_data['document_text'].split(' ')
    question = json_data['question_text']
    question_s = question.split(' ') 
    
    if annotated:
        ann = json_data['annotations'][0]

    # TFIDF for the document
    
    #Convert a collection of raw documents to a matrix of TF-IDF features.
    tfidf = TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words)
    tfidf.fit([json_data['document_text']])  # Learn vocabulary and idf from training set.
    q_tfidf = tfidf.transform([question]).todense()  #  Transform documents to document-term matrix.

    # Find the nearest answer from candidates
    scores = []
    for i, c in enumerate(candidates):
        s, e = c['start_token'], c['end_token']
        t = ' '.join(doc_tokenized[s:e])
        t_tfidf = tfidf.transform([t]).todense()
        score = 1 - spatial.distance.cosine(q_tfidf, t_tfidf)
        scores.append(score)

    # Put the nearest condidate   
        
    ans = (np.array(candidates)[np.argsort(scores)])[-n_answers:].tolist()
    
    
    if np.max(scores) < 0.2:
        ans_long = ['-1:-1']
        ans = [{'start_token': 0, 'end_token': 0}]
    else:
        ans_long = [str(a['start_token']) + ':' + str(a['end_token']) for a in ans]
    
        
    # Preparing data for debug
    if annotated:
        ann_long_text = ' '.join(doc_tokenized[ann['long_answer']['start_token']:ann['long_answer']['end_token']])
        
    else:
        ann_long_text = ''
        
    ans_long_text = [' '.join(doc_tokenized[a['start_token']:a['end_token']]) for a in ans]
                    
    return ans_long, question, ann_long_text, ans_long_text

In [95]:
def process(json_path, chunk_num, total_list):
    
    ids = []
    anns = []
    preds = []

    # Debug data
    questions = []
    ann_texts = []
    ans_texts = []
    
    n_rows = 300_000
    num_cores = 6
    n_samples = int(n_rows/num_cores)  # number of rows for 1 chunk

    with open(json_path, 'r') as json_file:
        
        cnt = 0 + (chunk_num-1)*n_samples
        n_samples = n_samples*chunk_num
        
        for line in tqdm(json_file):

            json_data = json.loads(line)

            l_ann = str(json_data['annotations'][0]['long_answer']['start_token']) + ':' + \
                str(json_data['annotations'][0]['long_answer']['end_token'])

            l_ans, question, ann_long_text, ans_long_text = predict(json_data, annotated=True)

            ids += [str(json_data['example_id']) + '_long']*len(l_ans)

            anns += [l_ann]*len(l_ans)

            preds += l_ans

            questions += [question]*len(l_ans)

            ann_texts += [ann_long_text]*len(l_ans)

            ans_texts += ans_long_text

            cnt += 1
            if cnt%5000 == 0 and cnt < 50_001:
                print(f"computing progress: {int(cnt/500)}%")
            if cnt >= n_samples:
                break

    chunk_dict = {}
    chunk_dict['example_id'] = ids
    chunk_dict['question'] = questions
    chunk_dict['CorrectString'] = anns
    chunk_dict['CorrectText'] = ann_texts
    if len(preds) > 0:
        chunk_dict['PredictionString'] = preds
        chunk_dict['PredictionText'] = ans_texts

    total_list.append(chunk_dict)
    


# Multiprocessing

In [93]:
sum_list = list()
def multiprocessed():
    cores = psutil.cpu_count(logical=False)
    processes = []
    a = time.time()
    with Manager() as manager:
        sum_list = manager.list()  # <-- can be shared between processes.
        for i in range(0, cores):
            p = Process(target=process,args=('simplified-nq-train.jsonl', i+1, sum_list))
            processes.append(p)
        # Start the processes
        for p in processes:
            p.start()
        # Ensure all processes have finished execution
        for p in processes:
            p.join()
        
        sum_list = list(sum_list)
        b = time.time()
        print(f"the executing time using multiprocessing is: {round(b-a, 3)} sec")
        return sum_list

In [96]:
sum_list = multiprocessed()

computing progress: 10%
computing progress: 20%
computing progress: 30%
computing progress: 40%
computing progress: 50%
computing progress: 60%
computing progress: 70%
computing progress: 80%
computing progress: 90%
computing progress: 100%
the executing time using multiprocessing is: 2738.067 sec


In [97]:
def creating_df(lst):
    total_df = pd.DataFrame()
    for l in lst:
        df_chunk = pd.DataFrame.from_dict(l)
        total_df = total_df.append(df_chunk)
    return total_df

In [98]:
total_df = creating_df(sum_list)

In [99]:
f1 = f1_score(total_df['CorrectString'].values, total_df['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}') # whole condidates

F1-score: 0.1026
