# CPU only without parallel computing

In [7]:
import os
import re
import json
import time
import warnings
import numpy as np
import pandas as pd
from scipy import spatial
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [8]:
# Extra stop words on top of scikit-learn's built-in English list.
# Stored as a (sorted, hence deterministic) list rather than the frozenset that
# `.union()` returns: scikit-learn documents `stop_words` as 'english', a list
# or None, and releases with parameter validation reject a frozenset.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(["book"]))

# NOTE: this silences every warning for the rest of the session, including
# numerical ones (e.g. division by zero) raised while scoring candidates.
warnings.filterwarnings("ignore")

In [9]:
def predict(json_data, threshold=0.2):
    """Pick the long-answer candidate that is most similar to the question.

    A TF-IDF vocabulary is fitted on the example's own document. The question
    and every candidate span are then projected into that space and compared
    by cosine similarity.

    Parameters
    ----------
    json_data : dict
        One example. Keys read here: 'long_answer_candidates' (a sequence of
        dicts carrying 'start_token' and 'end_token'), 'document_text' and
        'question_text'. Annotations are not needed, so unlabeled examples
        can be scored as well.
    threshold : float, optional
        Minimum cosine similarity the best candidate must reach to be
        returned. Defaults to 0.2.

    Returns
    -------
    str
        '<start_token>:<end_token>' of the best candidate, or '-1:-1' when
        there are no candidates or none reaches `threshold`.
    """
    candidates = json_data['long_answer_candidates']
    if len(candidates) == 0:
        # Nothing to rank (argmax on an empty array would raise).
        return '-1:-1'

    document = json_data['document_text']
    doc_tokenized = document.split(' ')
    question = json_data['question_text']

    # Fit TF-IDF on this document only. `list(...)` keeps the call valid even
    # if the module-level stop-word collection is a frozenset. `norm='l2'`
    # (the default) is spelled out because the scoring below relies on it.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words=list(stop_words), norm='l2')
    tfidf.fit([document])

    # Vectorize the question once and all candidate spans in a single batch.
    q_vec = tfidf.transform([question])
    spans = [
        ' '.join(doc_tokenized[c['start_token']:c['end_token']])
        for c in candidates
    ]
    cand_matrix = tfidf.transform(spans)

    # Rows are L2-normalised, so the dot product IS the cosine similarity.
    # A span (or question) with no in-vocabulary terms is an all-zero row and
    # scores 0.0 instead of NaN, so it can never be picked as the "best" match.
    scores = cand_matrix.dot(q_vec.T).toarray().ravel()

    best = int(np.argmax(scores))
    if scores[best] < threshold:
        return '-1:-1'

    top = candidates[best]
    return str(top['start_token']) + ':' + str(top['end_token'])

In [30]:
%%time
ids, annotations, predictions = [], [], []
n_samples = 10000
with open('data/10k.json', 'r') as json_file:
    cnt = 0
    for line in tqdm(json_file):
        json_data = json.loads(line)

        annotated_answer = str(json_data['annotations'][0]['long_answer']['start_token']) + ':' + \
            str(json_data['annotations'][0]['long_answer']['end_token'])
        
        predicted_answer = predict(json_data)
        
        ids.append(str(json_data['example_id']) + '_long')
        annotations.append(annotated_answer)
        predictions.append(predicted_answer)
        
        cnt += 1
        if cnt >= n_samples:
            break

# Generating Dataframe
df = pd.DataFrame()
df['example_id'] = ids
df['CorrectString'] = annotations
df['PredictionString'] = predictions

# Evaluating
f1 = f1_score(df['CorrectString'].values, df['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

F1-score: 0.1013
CPU times: user 13min 28s, sys: 1.28 s, total: 13min 29s
Wall time: 13min 28s


# Using multiprocessing

In [10]:
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Process, Manager
import multiprocessing
import psutil

In [15]:
# Report how many logical and physical CPU cores psutil detects.
logical_cpus = psutil.cpu_count(logical=True)
physical_cpus = psutil.cpu_count(logical=False)
print(f" Logical CPU count: {logical_cpus}")
print(f" Physical CPU count: {physical_cpus}")

 Logical CPU count: 8
 Physical CPU count: 4


In [31]:
def process(json_path, chunk_index, total_list, n_rows=10000, num_cores=4):
    """Run `predict` over one contiguous chunk of a JSON-lines file.

    The first `n_rows` lines of the file are divided into `num_cores`
    consecutive chunks; this call handles chunk number `chunk_index`.

    Parameters
    ----------
    json_path : str
        Path to a file with one JSON example per line.
    chunk_index : int
        1-based index of the chunk to process (1 .. num_cores).
    total_list : list-like
        Collector the result dict is appended to (e.g. a `Manager().list()`
        shared between processes).
    n_rows : int, optional
        Total number of rows to split across all chunks. Defaults to 10000.
    num_cores : int, optional
        Number of chunks the rows are split into. Defaults to 4.

    Returns
    -------
    None
        The result is appended to `total_list` as a dict with the keys
        'example_id', 'CorrectString' and 'PredictionString', each holding a
        list with one entry per processed row.
    """
    from itertools import islice  # local import keeps this cell self-contained

    chunk_size = n_rows // num_cores  # number of rows for 1 chunk
    start_row = (chunk_index - 1) * chunk_size
    # The last chunk also takes the rows left over by the integer division,
    # so no row is dropped when n_rows is not a multiple of num_cores.
    if chunk_index == num_cores:
        finish_row = n_rows
    else:
        finish_row = chunk_index * chunk_size

    rows_in_chunk = finish_row - start_row
    # Integer step (never 0) so the progress check cannot divide by zero.
    report_every = max(rows_in_chunk // 10, 1)

    ids, annotations, predictions = [], [], []

    with open(json_path, 'r') as json_file:
        # islice skips the rows before this chunk and stops right after it.
        chunk_lines = islice(json_file, start_row, finish_row)
        for done, line in enumerate(chunk_lines, start=1):
            json_data = json.loads(line)

            long_answer = json_data['annotations'][0]['long_answer']
            annotated_answer = str(long_answer['start_token']) + ':' + \
                str(long_answer['end_token'])

            predicted_answer = predict(json_data)

            ids.append(str(json_data['example_id']) + '_long')
            annotations.append(annotated_answer)
            predictions.append(predicted_answer)

            # Only the first chunk reports progress, so parallel workers do
            # not interleave duplicate messages.
            if chunk_index == 1 and done % report_every == 0:
                print(f"computing progress: {int(done * 100 / rows_in_chunk)}%")

    chunk_dict = {
        'example_id': ids,
        'CorrectString': annotations,
        'PredictionString': predictions,
    }
    total_list.append(chunk_dict)

In [32]:
# Module-level placeholder; replaced by the return value of multiprocessed().
sum_list = list()
def multiprocessed():
    """Evaluate 'data/10k.json' with four worker processes.

    Each worker runs `process` on one chunk of the file and appends its
    result dict to a manager-backed list. Every worker gets its own list so
    the results can be reassembled in chunk order regardless of which
    process finishes first.

    Returns
    -------
    list of dict
        One result dict per chunk, ordered by chunk index.

    Raises
    ------
    RuntimeError
        If any worker exits with a non-zero exit code; the evaluation would
        otherwise be computed silently on incomplete data.
    """
    # `process` divides the file into four chunks, so exactly four workers
    # are launched (chunk indices 1..4).
    cores = 4
    started = time.perf_counter()
    with Manager() as manager:
        # One shared list per worker <-- can be shared between processes.
        per_chunk = [manager.list() for _ in range(cores)]
        processes = [
            Process(target=process, args=('data/10k.json', idx + 1, per_chunk[idx]))
            for idx in range(cores)
        ]
        # Start the processes
        for p in processes:
            p.start()
        # Ensure all processes have finished execution
        for p in processes:
            p.join()

        failed = [idx + 1 for idx, p in enumerate(processes) if p.exitcode != 0]
        if failed:
            raise RuntimeError(
                f"worker process(es) for chunk(s) {failed} exited abnormally; "
                "results would be incomplete"
            )

        # Copy out of the manager proxies while the manager is still alive.
        sum_list = [chunk for shared in per_chunk for chunk in shared]
        elapsed = time.perf_counter() - started
        print(f"the executing time using multiprocessing is: {round(elapsed, 3)} sec")
        return sum_list

In [33]:
# Launch the worker processes and collect the list of per-chunk result dicts.
sum_list = multiprocessed()

computing progress: 10%
computing progress: 20%
computing progress: 30%
computing progress: 40%
computing progress: 50%
computing progress: 60%
computing progress: 70%
computing progress: 80%
computing progress: 90%
computing progress: 100%
the executing time using multiprocessing is: 245.106 sec


In [34]:
def creating_df(lst):
    """Combine per-chunk result dicts into a single DataFrame.

    Parameters
    ----------
    lst : iterable of dict
        Each dict maps column names to equally long lists (one dict per
        processed chunk).

    Returns
    -------
    pandas.DataFrame
        All chunks stacked in the given order with a fresh 0..n-1 index.
        An empty DataFrame is returned when `lst` is empty.
    """
    frames = [pd.DataFrame.from_dict(chunk) for chunk in lst]
    if not frames:
        # pd.concat raises on an empty sequence; keep returning an empty frame.
        return pd.DataFrame()
    # Single concat instead of repeated DataFrame.append, which was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)
# Merge the per-chunk results and score them with the same micro-averaged F1
# that is used for the single-process run.
total_df = creating_df(sum_list)
y_true = total_df['CorrectString'].values
y_pred = total_df['PredictionString'].values
f1 = f1_score(y_true, y_pred, average='micro')
print(f'F1-score: {f1:.4f}')

F1-score: 0.1013
