In [1]:
import os
import json
import numpy as np 
import pandas as pd
import re

from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

from scipy import spatial

In [2]:
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./dataset_exploration.ipynb
./sample_submission.csv
./test_data.csv
./simplified-nq-train.jsonl
./simplified-nq-test.jsonl
./TF_IDF_baseline-tutorial.ipynb
./train_data.csv
./submission.csv
./TF_IDF_baseline.ipynb
./.ipynb_checkpoints/dataset_exploration-checkpoint.ipynb
./.ipynb_checkpoints/TF_IDF_baseline-checkpoint.ipynb
./.ipynb_checkpoints/TF_IDF_baseline-tutorial-checkpoint.ipynb


# Long answer prediction

In [3]:
stop_words = text.ENGLISH_STOP_WORDS.union(["book"])
n_answers = 1


In [11]:
def predict(json_data, annotated=False):
    # Parse JSON data
    candidates = json_data['long_answer_candidates']
    #candidates = [c for c in candidates if c['top_level'] == True]
    #candidates = [c for c in candidates if c]

    doc_tokenized = json_data['document_text'].split(' ')
    question = json_data['question_text']
    question_s = question.split(' ') 
    if annotated:
        ann = json_data['annotations'][0]

    # TFIDF for the document
    
    #Convert a collection of raw documents to a matrix of TF-IDF features.
    tfidf = TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words)
    tfidf.fit([json_data['document_text']])  # Learn vocabulary and idf from training set.
    q_tfidf = tfidf.transform([question]).todense()  #  Transform documents to document-term matrix.

    # Find the nearest answer from candidates
    scores = []
    for i, c in enumerate(candidates):
        s, e = c['start_token'], c['end_token']
        t = ' '.join(doc_tokenized[s:e])
        t_tfidf = tfidf.transform([t]).todense()
        score = 1 - spatial.distance.cosine(q_tfidf, t_tfidf)
        scores.append(score)

    # Put the nearest condidate   
        
    ans = (np.array(candidates)[np.argsort(scores)])[-n_answers:].tolist()
    
    
    if np.max(scores) < 0.2:
        ans_long = ['-1:-1']
        ans = [{'start_token': 0, 'end_token': 0}]
    else:
        ans_long = [str(a['start_token']) + ':' + str(a['end_token']) for a in ans]
    
        
    # Preparing data for debug
    if annotated:
        ann_long_text = ' '.join(doc_tokenized[ann['long_answer']['start_token']:ann['long_answer']['end_token']])
        
    else:
        ann_long_text = ''
        
        
    ans_long_text = [' '.join(doc_tokenized[a['start_token']:a['end_token']]) for a in ans]
                    
    return ans_long, question, ann_long_text, ans_long_text

In [12]:
%%time
ids = []
anns = []
preds = []

# Debug data
questions = []
ann_texts = []
ans_texts = []

n_samples = 500

with open('simplified-nq-train.jsonl', 'r') as json_file:
    cnt = 0
    for line in tqdm(json_file):
        json_data = json.loads(line)

        l_ann = str(json_data['annotations'][0]['long_answer']['start_token']) + ':' + \
            str(json_data['annotations'][0]['long_answer']['end_token'])
        

        l_ans, question, ann_long_text, ans_long_text = predict(json_data, annotated=True)
        
        ids += [str(json_data['example_id']) + '_long']*len(l_ans)
        
    
        anns += [l_ann]*len(l_ans)
       
        
        preds += l_ans
        
        questions += [question]*len(l_ans)

        ann_texts += [ann_long_text]*len(l_ans)
       
        ans_texts += ans_long_text
        
        
        cnt += 1
        if cnt >= n_samples:
            break

            
print(f"The length of ids = {len(ids)}, \nThe length of questions is {len(questions)} \n" + \
      f"The length of anns: {len(anns)},\nThe length of ann_texts is {len(ans_texts)}")
train_ann = pd.DataFrame()
train_ann['example_id'] = ids
train_ann['question'] = questions
train_ann['CorrectString'] = anns
train_ann['CorrectText'] = ann_texts
if len(preds) > 0:
    train_ann['PredictionString'] = preds
    train_ann['PredictionText'] = ans_texts
    
train_ann.to_csv('train_data.csv', index=False)
train_ann.head(10)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)


The length of ids = 500, 
The length of questions is 500 
The length of anns: 500,
The length of ann_texts is 500
CPU times: user 23.7 s, sys: 56 ms, total: 23.7 s
Wall time: 23.7 s


Unnamed: 0,example_id,question,CorrectString,CorrectText,PredictionString,PredictionText
0,5655493461695504401_long,which is the most common use of opt-in e-mail ...,1952:2019,<P> A common example of permission marketing i...,1292:1311,<P> There are both advantages and disadvantage...
1,5328212470870865242_long,how i.met your mother who is the mother,212:310,"<P> Tracy McConnell , better known as `` The M...",212:310,"<P> Tracy McConnell , better known as `` The M..."
2,4435104480114867852_long,what type of fertilisation takes place in humans,319:438,<P> The process of fertilization involves a sp...,1164:1180,<P> The fusion of cell membranes of the second...
3,5289242154789678439_long,who had the most wins in the nfl,509:576,<P> Active quarterback Tom Brady holds the rec...,469:509,<P> The following is a list of the top Nationa...
4,5489863933082811018_long,what happened to the lost settlement of roanoke,-1:-1,,249:316,"<P> The Roanoke Colony ( / ˈroʊəˌnoʊk / ) , al..."
5,3411244446249504947_long,what are the different regions of africa and h...,-1:-1,,95:144,<P> The continent of Africa is commonly divide...
6,-2500044561429484630_long,who played mantis guardians of the galaxy 2,82:169,<P> Pom Klementieff ( born 3 May 1986 ) is a F...,82:169,<P> Pom Klementieff ( born 3 May 1986 ) is a F...
7,5611750702541347162_long,who did the voice of the magician in frosty th...,-1:-1,,580:730,"<P> One December afternoon , a little girl nam..."
8,4958098854057393062_long,what indian tribe did the acadians form friend...,562:695,<P> The survival of the Acadian settlements wa...,2861:2954,<P> The British Conquest of Acadia happened in...
9,8796576945844451825_long,what is considered the outer banks in north ca...,-1:-1,,769:911,"<P> The northern part of the Outer Banks , fro..."


In [6]:
f1 = f1_score(train_ann['CorrectString'].values, train_ann['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}') # top_level == True only condidates

F1-score: 0.1100


In [7]:
f1 = f1_score(train_ann['CorrectString'].values, train_ann['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}') # whole condidates

F1-score: 0.1100


In [8]:
f1 = f1_score(train_ann['CorrectString'].values, train_ann['PredictionString'].values, average='micro')
print(f'F1-score: {f1:.4f}')

F1-score: 0.1100


# Submission

In [9]:
%%time
ids = []
anns = []
preds = []

# Debug data
questions = []
ann_texts = []
ans_texts = []

with open('simplified-nq-test.jsonl', 'r') as json_file:
    cnt = 0
    for line in tqdm(json_file):
        json_data = json.loads(line)
        
        l_ans, s_ans, question, ann_long_text, ann_short_text, ans_long_text, ans_short_text = predict(json_data)

        ids += [str(json_data['example_id']) + '_long']*len(l_ans)
        ids.append(str(json_data['example_id']) + '_short')
        preds += l_ans
        preds.append(s_ans)
        questions += [question]*len(l_ans)
        questions.append(question)
        ans_texts += ans_long_text
        ans_texts.append(ans_short_text)
         
#         cnt += 1
#         if cnt >= n_samples:
#             break
        
subm = pd.DataFrame()
subm['example_id'] = ids
subm['question'] = questions
subm['PredictionString'] = preds
subm['PredictionText'] = ans_texts
subm.to_csv('test_data.csv', index=False)

g = subm[['example_id', 'PredictionString']].groupby('example_id').agg(lambda x: ' '.join(x) if len(x) > 1 else x).reset_index()
g.to_csv('submission.csv', index=False)

subm.head(10)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

ValueError: not enough values to unpack (expected 7, got 4)

In [10]:
g.head()


NameError: name 'g' is not defined