In [1]:
# verified

import json
import sys
import os

from os.path import join

sys.path.append('../..')
from squad_tools import load_dict
from my_tokenize import tokenize_with_ans_idx as tokenize

## SQuAD data preprocessing

In [36]:
# include glove version?

# Input locations: the directory read for the SQuAD json files and the
# GloVe text file that is handed to load_dict().
squad_path = '/pio/data/data/squad'
glove_path = '/pio/data/data/glove_vec/6B/glove.6B.300d.txt'

# Every json file produced by this notebook is written below this directory.
out_path = '/pio/data/data/squad/test'

# Create the output directory (including parents) if it is missing.
if not os.path.exists(out_path):
    os.makedirs(out_path)

# Module-level switch read by lower_if_needed().
lower = True


def lower_if_needed(s):
    """Return `s` lower-cased when the global `lower` flag is truthy, else `s` unchanged."""
    return s.lower() if lower else s

In [45]:
# Parse both SQuAD v1.1 splits from squad_path.
with open(join(squad_path, 'train-v1.1.json')) as f:
    train = json.load(f)

with open(join(squad_path, 'dev-v1.1.json')) as f:
    dev = json.load(f)

# Word list loaded from the GloVe file; a word's position in the list is its index.
wordlist = load_dict(glove_path)

# Reverse lookup word -> index (if a word repeats, its last position wins).
w_to_i = dict((w, i) for i, w in enumerate(wordlist))

In [50]:
def prepare_data(json_data, withId=False):
    """Flatten a SQuAD-style json object into one sample per question.

    Parameters
    ----------
    json_data : dict
        Parsed json; read as ``json_data['data'][i]['paragraphs'][j]`` where
        each paragraph has a ``'context'`` string and a ``'qas'`` list, and
        each qa has ``'question'``, ``'id'`` and ``'answers'`` (a list of
        dicts with ``'text'`` and ``'answer_start'``).
    withId : bool
        When true, the question id is appended as a fourth element of
        every sample.

    Returns
    -------
    list
        One ``[answers, question_tok, context_tok]`` (plus ``id`` when
        ``withId``) per question.  ``answers`` is a list of
        ``([ans_start, ans_end], text)`` where ``ans_start`` is the index
        returned by ``tokenize(context, answer_start)`` and ``ans_end`` is
        inclusive.
    """
    data = []

    for par in json_data['data']:
        for con in par['paragraphs']:
            context = lower_if_needed(con['context'])

            for q in con['qas']:
                question = lower_if_needed(q['question'])
                question_tok = tokenize(question)[0]
                answers = []

                Id = q['id']

                # Reset for every question: otherwise a question with no
                # answers would silently reuse the context tokens of the
                # previous question (or raise NameError on the first one).
                context_tok = None

                for ans in q['answers']:
                    text = lower_if_needed(ans['text'])
                    ans_start = ans['answer_start']
                    # tokenize() maps the answer start offset to the value
                    # used as the span start alongside the context tokens.
                    context_tok, ans_start = tokenize(context, ans_start)
                    ans_end = ans_start + len(tokenize(text)[0]) - 1

                    answers.append(([ans_start, ans_end], text))

                # NOTE(review): with several answers, context_tok is the
                # tokenization returned for the last one; this assumes
                # tokenize() yields the same tokens for every answer offset
                # -- confirm against my_tokenize.
                if context_tok is None:
                    # No answers for this question: still emit its own context.
                    context_tok = tokenize(context)[0]

                data.append([answers, question_tok, context_tok])
                if withId:
                    data[-1].append(Id)
    return data

In [51]:
# Only the dev samples carry the question id as an extra trailing element.
data_train = prepare_data(json_data=train)
data_dev = prepare_data(json_data=dev, withId=True)
(len(data_train), len(data_dev))

(87599, 10570)

In [52]:
# Throw out the questions with answers we can't possibly learn (the ones
# that aren't whole words): a sample is kept only when the context tokens
# inside its first answer's span, joined by spaces, equal the space-joined
# tokenization of that answer's text.
aligned_train = []
for sample in data_train:
    first_answer = sample[0][0]
    span_start = first_answer[0][0]
    span_end = first_answer[0][1]
    span_text = u' '.join(sample[2][span_start:span_end + 1])
    answer_text = u' '.join(tokenize(first_answer[1])[0])
    if span_text == answer_text:
        aligned_train.append(sample)
data_train = aligned_train
len(data_train)

87269

In [53]:
# Dump the tokenized samples of each split as json under out_path.
for fname, payload in (('train.json', data_train), ('dev.json', data_dev)):
    with open(join(out_path, fname), 'w') as f:
        json.dump(payload, f)

## Main portion: word indices

In [2]:
def get_word_nums(s):
    """Translate a sequence of tokens into their `w_to_i` indices.

    Tokens missing from `w_to_i` are mapped to 0.
    NOTE(review): 0 is also the index of the first entry of the word list;
    presumably load_dict() reserves that slot for unknown words -- confirm.
    """
    nums = []
    for token in s:
        nums.append(w_to_i.get(token, 0))
    return nums

# Train: the first answer's inclusive [start, end] span is expanded into the
# explicit list of positions it covers; question and context tokens become
# word indices.
train_num = [
    [
        [list(range(ans[0][0][0], ans[0][0][1] + 1))],
        get_word_nums(ques),
        get_word_nums(ctx),
    ]
    for ans, ques, ctx in data_train
]

# Dev: same layout, but the answer slot is left empty and the trailing
# question id is ignored.
dev_num = [
    [[], get_word_nums(ques), get_word_nums(ctx)]
    for _ans, ques, ctx, _qid in data_dev
]

In [56]:
# Dump the word-index version of each split as json under out_path.
for fname, payload in (('train_words.json', train_num), ('dev_words.json', dev_num)):
    with open(join(out_path, fname), 'w') as f:
        json.dump(payload, f)

## Binary features

In [57]:
def make_bin_feats(sample):
    """Flag, for every context token, whether it also occurs in the question.

    `sample[1]` is read as the question tokens and `sample[2]` as the context
    tokens; the result is a list of booleans aligned with the context.
    """
    question_tokens, context_tokens = sample[1:3]
    question_vocab = frozenset(question_tokens)
    flags = []
    for token in context_tokens:
        flags.append(token in question_vocab)
    return flags

In [58]:
# Build real lists rather than relying on map(): on Python 3 map() returns a
# lazy iterator, which json.dump() cannot serialize. On Python 2 the result
# is the same list as before.
train_bin_feats = [make_bin_feats(sample) for sample in data_train]
dev_bin_feats = [make_bin_feats(sample) for sample in data_dev]

In [59]:
# json.dump() writes text, so the files are opened in text mode ('w') like
# every other dump in this notebook; binary mode ('wb') fails on Python 3.
with open(join(out_path, 'train_bin_feats.json'), 'w') as f:
    json.dump(train_bin_feats, f)

with open(join(out_path, 'dev_bin_feats.json'), 'w') as f:
    json.dump(dev_bin_feats, f)

## Characters

In [62]:
# Reserved character ids:
# 0 - unk
# 1 - start
# 2 - end
# 3 - not_a_word char (added later, in wikipedia negative samples)
#
# NOTE(review): every ASCII character is mapped to its own code point, so the
# control characters \x00-\x03 share ids with the markers above; presumably
# they never occur inside tokens -- confirm.

# unichr() only exists on Python 2; chr() is its Python 3 equivalent.
try:
    _to_char = unichr
except NameError:
    _to_char = chr

# The 128 ASCII characters and their lookup table (character -> code point).
chars = [_to_char(i) for i in range(128)]
c_to_i = {c: i for i, c in enumerate(chars)}


def get_char_nums_for_word(w):
    """Encode a word as a list of character ids.

    The result starts with 1 (start marker) and ends with 2 (end marker);
    characters without an entry in `c_to_i` (non-ASCII) become 0.
    """
    return [1] + [c_to_i.get(c, 0) for c in w] + [2]

def prepare_chars(data):
    """Build the character-level encoding of every sample.

    For each sample, element 1 (question tokens) and element 2 (context
    tokens) are encoded word by word with get_char_nums_for_word(); the first
    element and anything after the third are ignored.

    Returns a list of ``[q_char, x_char]`` pairs, each a list of per-word id
    lists. Plain lists are built (instead of map()) so the result stays
    JSON-serializable on Python 3, where map() is a lazy iterator.
    """
    data_char = []
    for d in data:
        _, q, x = d[:3]
        q_char = [get_char_nums_for_word(w) for w in q]
        x_char = [get_char_nums_for_word(w) for w in x]
        data_char.append([q_char, x_char])
    return data_char

In [63]:
# Character-level encodings of both splits.
train_char = prepare_chars(data=data_train)
dev_char = prepare_chars(data=data_dev)

In [65]:
# Dump the character-id version of each split as json under out_path.
for fname, payload in (('train_char_ascii.json', train_char),
                       ('dev_char_ascii.json', dev_char)):
    with open(join(out_path, fname), 'w') as f:
        json.dump(payload, f)