In [1]:
from __future__ import print_function
from __future__ import division
import warnings
warnings.simplefilter(action='ignore')
import json, os, argparse
import string
import numpy as np
import tqdm
from tqdm import tqdm_notebook as tqdm
import h5py
import urllib.request as urllib
from keras.utils.data_utils import get_file
from os import path

def get_glove_file_path():
    SERVER = 'http://nlp.stanford.edu/data/'
    VERSION = 'glove.840B.300d'

    origin = '{server}{version}.zip'.format(server=SERVER, version=VERSION)
    cache_dir = path.join(path.abspath(path.dirname(__file__)), 'data')

    fname = '/tmp/glove.zip'
    get_file(fname,
             origin=origin,
             cache_dir=cache_dir,
             cache_subdir='',
             extract=True)

    # Remove unnecessary .zip file and keep only extracted .txt version
    os.remove(fname)
    return path.join(cache_dir, VERSION) + '.txt'
glove_file_path = 'Data/glove.840B.300d.txt'
if not path.exists(glove_file_path):
    glove_file_path = get_glove_file_path()
    

url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
urllib.urlretrieve(url+'train-v1.1.json', './Data/train-v1.1.json')
urllib.urlretrieve(url+'dev-v1.1.json', './Data/dev-v1.1.json')

with open('./Data/train-v1.1.json') as json_data:
    d = json.load(json_data)
with open('./Data/dev-v1.1.json') as json_data:
    d1 = json.load(json_data)


Using TensorFlow backend.


In [17]:
print('Data upload completed successfully!')
print('Start preporation...')

Data upload completed successfully!
Start preporation...


In [36]:
emb_size, emb_dim = 2195876, 300
f = open(glove_file_path)
word_dict, ind_dict = {}, {}
embedding_matrix = np.zeros((emb_size, emb_dim))
i = 1
for lines in tqdm(f):
    conv_err = 0
    line = lines.split()
    word = line[0]
    try:
        f = float(line[1])
    except:
        conv_err += 1
    else:
        if word not in word_dict and i < emb_size:
            vec = np.array(line[1:], dtype='float32')
            if vec.shape[0] != emb_dim:
                continue
            word_dict[word] = i
            ind_dict[i] = word
            embedding_matrix[i] = vec
            i += 1 




In [39]:
with h5py.File('Prepared data/embeddings.h5', 'w') as hf:
        hf.create_dataset('embeddings', data=embedding_matrix)

In [45]:
def tokenize(vector):
    global i
    from nltk.tokenize import RegexpTokenizer
#     tokenizer = RegexpTokenizer('\w+|\$[\d]+|\S+')
#     tokenizer = RegexpTokenizer('\w+') 
    tokenizer = RegexpTokenizer('(\w+([\'-]\w+)*)')
    tokens = tokenizer.tokenize(vector)
    words = []
    for w in tokens:
        if w[0].isalnum():
            word = w[0].lower()
            if word in word_dict:
                words.append(word_dict[word])
    return words

def find(mylist, sublist):
    h = len(mylist)
    n = len(sublist)
    skip = {sublist[i]: n - i - 1 for i in range(n - 1)}
    i = n - 1
    while i < h:
        for j in range(n):
            if mylist[i - j] != sublist[-j - 1]:
                i += skip.get(mylist[i], n)
                break
        else:
            return i - n + 1
    return -1

def parse_data(dataset):
    context_list = []
    question_list = []
    answer_list = []
    answer_begin = []
    answer_end = []
    error = 0
    
    for article in tqdm(dataset):
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                for ans in qa['answers']:
                    # append both context and questions many times for more than one question/answer
                    ques = tokenize(qa['question'])
                    if len(ques) < 100:
                        cont = tokenize(paragraph['context'])
                        b, e = ans['answer_start'], ans['answer_start']+len(ans['text'])
                        an = tokenize(paragraph['context'][b:e])
                        begin = find(cont, an)
                        if begin > 0:
                            question_list.append(ques)
                            context_list.append(cont)
                            answer_list.append(an)
                            answer_begin.append(begin)
                            answer_end.append(begin + len(an))
    return context_list, question_list, answer_list, answer_begin, answer_end

In [47]:
print('Preparing train dataset...')
context_list, question_list, answer_list, answer_begin, answer_end = parse_data(d['data'])
print('Done!')


architecturally the school has a catholic character atop the main gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary 

In [48]:
print('Preparing dev dataset...')
dev_context_list, dev_question_list, dev_answer_list, dev_answer_begin, dev_answer_end = parse_data(d1['data'])
print('Done!')

Preparing dev dataset...





NameError: name 'dev_ind_dict' is not defined

In [51]:
context_maxlen = 300
ques_maxlen = 25
answer_maxlen = 30

In [53]:
def pad_sequence(lst, maxlen):
    array = np.zeros((len(lst), maxlen), dtype=np.int)
    for i in range(len(lst)):
        for j in range(min(len(lst[i]) - 1, maxlen)):
            array[i][j] = lst[i][j]
    return array

def padding(context, question, answer, begin, end):
    context_array = pad_sequence(context, context_maxlen)
    question_array = pad_sequence(question, ques_maxlen)
    answer_array = pad_sequence(answer, answer_maxlen)
    begin_array = np.zeros((len(begin), context_maxlen), dtype=np.int)
    end_array = np.zeros((len(end), context_maxlen), dtype=np.int)

    for i in range(len(begin)):
        begin_array[i][min(begin[i], context_maxlen - 1)] = 1 

    for i in range(len(end)):
        end_array[i][min(end[i], context_maxlen - 1)] = 1

    print('context: ', context_array.shape)
    print('question: ', question_array.shape)
    print('answer: ', answer_array.shape)
    print('answer start: ', begin_array.shape)
    print('answer end: ',end_array.shape)
    return context_array, question_array, answer_array, begin_array, end_array
def save_data(data_type):
    print('Shapes of ' + data_type + ' data')
    if data_type == 'test':
        context_array, question_array, answer_array, begin_array, end_array = \
        padding(context_list, question_list, answer_list, answer_begin, answer_end)
    if data_type == 'dev':
        context_array, question_array, answer_array,begin_array, end_array = \
        padding(dev_context_list, dev_question_list,
                dev_answer_list, dev_answer_begin, dev_answer_end)
    dr = 'Prepared data/' + data_type + '_'
    with h5py.File(dr + 'context.h5', 'w') as hf:
        hf.create_dataset('context', data=context_array)
    with h5py.File(dr + 'questions.h5', 'w') as hf:
        hf.create_dataset('questions', data=question_array)
    with h5py.File(dr + 'answers.h5', 'w') as hf:
        hf.create_dataset('answers', data=answer_array)
    with h5py.File(dr + 'begin.h5', 'w') as hf:
        hf.create_dataset('begin', data=begin_array)
    with h5py.File(dr + 'end.h5', 'w') as hf:
        hf.create_dataset('end', data=end_array)

In [54]:
save_data('test')
save_data('dev')

Shapes of test data
context:  (80949, 300)
question:  (80949, 25)
answer:  (80949, 30)
answer start:  (80949, 300)
answer end:  (80949, 300)
Shapes of dev data
context:  (32243, 300)
question:  (32243, 25)
answer:  (32243, 30)
answer start:  (32243, 300)
answer end:  (32243, 300)


In [55]:
# saving dictionaries
dr = 'Prepared data/'
np.save(dr + 'word2ind.npy', word_dict)
np.save(dr + 'ind2word.npy', ind_dict)

In [56]:
print('Success!')

Success!
