In [1]:
import os
import json
import gc
import pickle

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Masking
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tqdm import tqdm_notebook as tqdm
import fasttext

In [3]:
# Directory holding the dataset files. Set the NQ_DATA_DIR environment variable
# to point somewhere else instead of editing the notebook; the fallback is the
# folder this notebook was originally run from.
DATA_DIR = os.environ.get("NQ_DATA_DIR", "C:/Users/WorkPC/Desktop/Untitled Folder")

# Train / test files, parsed below one JSON document per line.
TRAIN = f"{DATA_DIR}/simplified-nq-train.jsonl"
TEST = f"{DATA_DIR}/simplified-nq-test.jsonl"

In [4]:
def build_train(train_path, n_rows=200000, sampling_rate=15):
    """Build the training frame of (candidate text, label, question) rows.

    Reads at most ``n_rows`` lines from ``train_path``, parsing each line as a
    JSON document. For every long-answer candidate of a document, the candidate
    is kept when it is the annotated long answer, or when its position in the
    candidate list is a multiple of ``sampling_rate`` (down-sampling of the
    remaining candidates).

    Parameters
    ----------
    train_path : str
        Path to the JSON-lines training file.
    n_rows : int
        Maximum number of lines (documents) to read.
    sampling_rate : int
        Keep every ``sampling_rate``-th non-answer candidate.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (candidate tokens joined by spaces),
        ``is_long_answer`` and ``question``. The columns are present even when
        no row was produced.
    """
    columns = ['text', 'is_long_answer', 'question']
    processed_rows = []

    # open() otherwise uses a platform-dependent default encoding; pin UTF-8 so
    # non-ASCII document text decodes the same everywhere.
    # Assumes the file is UTF-8 encoded — TODO confirm.
    with open(train_path, encoding='utf-8') as f:
        for _ in tqdm(range(n_rows)):
            raw_line = f.readline()
            if not raw_line:
                # End of file reached before n_rows lines were read.
                break

            example = json.loads(raw_line)

            tokens = example['document_text'].split(' ')
            question = example['question_text']
            # Only the first annotation of each document is used.
            annotation = example['annotations'][0]

            # A dedicated name for the candidate position avoids shadowing the
            # outer loop variable.
            for position, candidate in enumerate(example['long_answer_candidates']):
                is_long_answer = position == annotation['long_answer']['candidate_index']

                start = candidate['start_token']
                end = candidate['end_token']

                if is_long_answer or (position % sampling_rate == 0):
                    processed_rows.append({
                        'text': " ".join(tokens[start:end]),
                        'is_long_answer': is_long_answer,
                        'question': question
                    })

    # Explicit columns keep the schema stable when processed_rows is empty.
    return pd.DataFrame(processed_rows, columns=columns)

In [5]:
# Parse the training file with build_train's defaults (n_rows=200000, sampling_rate=15).
train_df = build_train(TRAIN)

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [6]:
def build_test(test_path):
    """Build the test frame with one row per long-answer candidate.

    Every line of ``test_path`` is parsed as a JSON document and every
    long-answer candidate of every document is kept (no sampling).

    Parameters
    ----------
    test_path : str
        Path to the JSON-lines test file.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (candidate tokens joined by spaces), ``question`` and
        ``sequence`` (the candidate span formatted as ``"start:end"``). The
        columns are present even when no row was produced.
    """
    columns = ['text', 'question', 'sequence']
    processed_rows = []

    # open() otherwise uses a platform-dependent default encoding; pin UTF-8 so
    # non-ASCII document text decodes the same everywhere.
    # Assumes the file is UTF-8 encoded — TODO confirm.
    with open(test_path, encoding='utf-8') as f:
        for raw_line in tqdm(f):
            example = json.loads(raw_line)

            tokens = example['document_text'].split(' ')
            question = example['question_text']
            # NOTE(review): 'example_id' used to be read into an unused local
            # and is not part of the returned frame. If predictions must be
            # mapped back to their example, a column for it is needed here —
            # confirm against the downstream cells.

            for candidate in example['long_answer_candidates']:
                start = candidate['start_token']
                end = candidate['end_token']

                processed_rows.append({
                    'text': " ".join(tokens[start:end]),
                    'question': question,
                    'sequence': f'{start}:{end}'
                })

    # Explicit columns keep the schema stable when processed_rows is empty.
    return pd.DataFrame(processed_rows, columns=columns)

In [7]:
# Parse the whole test file; every candidate of every document becomes a row.
test_df = build_test(TEST)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# Preview the first 10 rows of the sampled training frame.
train_df.head(10)

Unnamed: 0,text,is_long_answer,question
0,<Table> <Tr> <Td> </Td> <Td> ( hide ) This art...,False,which is the most common use of opt-in e-mail ...
1,<Tr> <Td> <Ul> <Li> Pay - per - click </Li> <L...,False,which is the most common use of opt-in e-mail ...
2,<P> Email marketing has evolved rapidly alongs...,False,which is the most common use of opt-in e-mail ...
3,<Li> Advertisers can reach substantial numbers...,False,which is the most common use of opt-in e-mail ...
4,<P> A common example of permission marketing i...,True,which is the most common use of opt-in e-mail ...
5,<P> The CAN - SPAM Act of 2003 was passed by C...,False,which is the most common use of opt-in e-mail ...
6,"<Table> <Tr> <Th_colspan=""2""> Tracy McConnell ...",False,how i.met your mother who is the mother
7,"<P> Tracy McConnell , better known as `` The M...",True,how i.met your mother who is the mother
8,"<P> In `` Bass Player Wanted '' , the Mother p...",False,how i.met your mother who is the mother
9,<Table> <Tr> <Td> Part of a series on </Td> </...,False,what type of fertilisation takes place in humans


In [9]:
# Preview the first rows of the test candidate frame.
test_df.head()

Unnamed: 0,text,question,sequence
0,"<Table> <Tr> <Th_colspan=""2""> High Commission ...",who is the south african high commissioner in ...,18:136
1,"<Tr> <Th_colspan=""2""> High Commission of South...",who is the south african high commissioner in ...,19:30
2,<Tr> <Th> Location </Th> <Td> Trafalgar Square...,who is the south african high commissioner in ...,34:45
3,<Tr> <Th> Address </Th> <Td> Trafalgar Square ...,who is the south african high commissioner in ...,45:59
4,<Tr> <Th> Coordinates </Th> <Td> 51 ° 30 ′ 30 ...,who is the south african high commissioner in ...,59:126


In [10]:
tokenizer = text.Tokenizer(lower=False, num_words=80000)

# Build the vocabulary from every text column of both frames.
# The loop variable is deliberately NOT named `text`: at cell level that would
# rebind the imported `text` module (tensorflow.keras.preprocessing.text) to a
# pandas Series, so re-running this cell would fail at `text.Tokenizer`.
for column in tqdm([train_df.text, test_df.text, train_df.question, test_df.question]):
    tokenizer.fit_on_texts(column.values)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [11]:
def compute_text_and_questions(train, test, tokenizer, text_maxlen=300, question_maxlen=None):
    """Convert the text and question columns of both frames to padded id arrays.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames exposing ``text`` and ``question`` columns.
    tokenizer
        Fitted tokenizer providing ``texts_to_sequences``.
    text_maxlen : int or None
        ``maxlen`` passed to ``pad_sequences`` for the candidate texts
        (default 300, the previously hard-coded value).
    question_maxlen : int or None
        ``maxlen`` passed to ``pad_sequences`` for the questions. The default
        ``None`` keeps the previous behaviour of padding each set to its own
        longest question, so train and test question arrays may differ in
        width. Pass an int to give both the same width.

    Returns
    -------
    tuple
        ``(train_text, train_questions, test_text, test_questions)``.
    """
    train_text = tokenizer.texts_to_sequences(train.text.values)
    train_questions = tokenizer.texts_to_sequences(train.question.values)
    test_text = tokenizer.texts_to_sequences(test.text.values)
    test_questions = tokenizer.texts_to_sequences(test.question.values)

    train_text = sequence.pad_sequences(train_text, maxlen=text_maxlen)
    train_questions = sequence.pad_sequences(train_questions, maxlen=question_maxlen)
    test_text = sequence.pad_sequences(test_text, maxlen=text_maxlen)
    test_questions = sequence.pad_sequences(test_questions, maxlen=question_maxlen)

    return train_text, train_questions, test_text, test_questions

In [12]:
# Tokenize and pad the text / question columns of both frames.
(
    train_text,
    train_questions,
    test_text,
    test_questions,
) = compute_text_and_questions(train_df, test_df, tokenizer)

# Binary training target: 1 for the annotated long answer, 0 for any other candidate.
train_target = train_df['is_long_answer'].astype(int).values

In [13]:
# Inspect the first padded text sequence (leading zeros are padding, as the output shows).
train_text[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [14]:
# Sanity check: one row per sampled candidate, padded/truncated to maxlen=300.
train_text.shape

(1921946, 300)

In [15]:
# Sanity check: the target must have one entry per row of train_text.
train_target.shape

(1921946,)

In [16]:
# Questions are padded without an explicit maxlen, so the width is data-dependent.
train_questions.shape

(1921946, 24)

In [17]:
# Test questions are padded independently of the training questions, so the width can differ.
test_questions.shape

(45163, 17)

In [18]:
# fastText binary model file, loaded by build_embedding_matrix via fasttext.load_model.
# The path is relative, so it is resolved against the kernel's working directory.
path = 'crawl-300d-2M-subword.bin'

In [19]:
def build_embedding_matrix(tokenizer, path):
    """Build the embedding matrix for the tokenizer's vocabulary from fastText.

    Parameters
    ----------
    tokenizer
        Fitted tokenizer exposing ``num_words`` and ``word_index``
        (word -> integer index).
    path : str
        Path to a fastText binary model, loaded with ``fasttext.load_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_words + 1, dim)`` where ``dim`` is the model's
        vector size. Row ``i`` holds the vector of the word with index ``i``;
        rows with no word assigned (including row 0) stay zero.
    """
    ft_model = fasttext.load_model(path)
    # Take the vector size from the model instead of hard-coding 300.
    dim = ft_model.get_dimension()

    num_words = tokenizer.num_words
    if num_words is None:
        # No vocabulary cap configured: every indexed word gets a row.
        num_words = len(tokenizer.word_index) + 1

    # The row count (num_words + 1) is kept from the original so the embedding
    # layer built from this matrix keeps the same size.
    embedding_matrix = np.zeros((num_words + 1, dim))

    for word, i in tokenizer.word_index.items():
        # The tokenizer emits indices up to num_words - 1 inclusive. The
        # previous check (`i >= num_words - 1`) stopped one index early and
        # left that last word as an all-zero vector.
        # `continue` rather than `break`, so the result does not depend on
        # word_index being ordered by index.
        if i >= num_words:
            continue
        embedding_matrix[i] = ft_model.get_word_vector(word)

    return embedding_matrix

In [20]:
# Load the fastText model from `path` and fill one embedding row per vocabulary index.
embedding_matrix = build_embedding_matrix(tokenizer, path)



In [21]:
def build_model(embedding_matrix):
    """Assemble and compile the two-input (text, question) binary classifier.

    Both inputs go through one shared, frozen embedding layer initialised from
    ``embedding_matrix`` (index 0 is masked). Each input is then encoded by its
    own dropout -> bidirectional LSTM -> global max-pooling branch. The two
    encodings are concatenated and passed through two Dense(300)/Dropout(0.5)
    stages to a single sigmoid unit.

    Parameters
    ----------
    embedding_matrix : 2-D array
        Its shape supplies the embedding layer's (input_dim, output_dim).

    Returns
    -------
    Model
        Compiled with binary cross-entropy and Adam; inputs are ordered
        ``[text, question]``.
    """
    vocab_size, embedding_dim = embedding_matrix.shape
    shared_embedding = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True
    )

    def encode(lstm_units):
        # One branch: variable-length id input -> shared embedding ->
        # spatial dropout -> BiLSTM over all timesteps -> max over time.
        branch_in = Input(shape=(None,))
        encoded = shared_embedding(branch_in)
        encoded = SpatialDropout1D(0.2)(encoded)
        encoded = Bidirectional(LSTM(lstm_units, return_sequences=True))(encoded)
        encoded = GlobalMaxPooling1D()(encoded)
        return branch_in, encoded

    # The question branch is created first, then the text branch, so layers
    # are instantiated in the same order as before.
    q_in, q_encoded = encode(100)
    t_in, t_encoded = encode(150)

    hidden = concatenate([q_encoded, t_encoded])
    for _ in range(2):
        hidden = Dense(300, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    out1 = Dense(1, activation='sigmoid')(hidden)

    # Input order is [text, question]; model.fit must pass arrays in this order.
    model = Model(inputs=[t_in, q_in], outputs=out1)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [22]:
# Instantiate the two-input model and print its layer summary.
model = build_model(embedding_matrix)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    24000300    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, None, 300)    0           embedding[0][0]              

In [None]:
# Train the model. Inputs are passed as [text, questions] to match
# Model(inputs=[t_in, q_in]) in build_model.
# NOTE(review): no random seed is set in the cells above, so runs are not
# expected to be reproducible — consider seeding numpy / TensorFlow.
# NOTE(review): validation_split holds out part of the provided arrays; it is
# believed to take the trailing samples before shuffling — confirm in the
# Keras docs, since train_df rows are in file order.
train_history = model.fit(
    [train_text, train_questions], 
    train_target,
    epochs=2,
    validation_split=0.2,
    batch_size=1024
)