In [38]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain
from string import whitespace

## More careful preprocessing
##### (an attempt to replicate the preprocessing from the FastQA paper)

In [182]:
# Output directory for the pickles written by the cells below. It is used as a plain
# string prefix (path + 'train.pkl', ...), hence the trailing slash.
path = '/pio/data/data/squad/glove840B/careful_prep/'

In [2]:
# Read both SQuAD v1.1 splits; the files are decoded as UTF-8 before being parsed as JSON.
with io.open('/pio/data/data/squad/dev-v1.1.json', 'r', encoding='utf-8') as dev_file:
    dev = json.load(dev_file)

with io.open('/pio/data/data/squad/train-v1.1.json', 'r', encoding='utf-8') as train_file:
    train = json.load(train_file)

In [3]:
# GloVe 840B word list; presumably a sequence of words in embedding-row order -- TODO confirm.
# NOTE(review): the .pkl extension suggests a pickle rather than an .npy/.npz file, in which
# case np.load falls back to unpickling it; newer NumPy releases only do that when
# allow_pickle=True is passed -- confirm against the NumPy version in use.
glove840B = np.load('/pio/data/data/glove_vec/840B/glove/glove.840B.wordlist.pkl')

In [4]:
# Word -> position in the GloVe word list (if a word occurs twice, its last position wins).
w_to_i_840B = {word: idx for idx, word in enumerate(glove840B)}

In [123]:
def tokenize(s, ans_idx=None):
    """Split `s` into tokens and optionally map a character offset to a token index.

    Whitespace separates tokens and is dropped, every other non-alphanumeric character
    becomes a token of its own, and runs of alphanumeric characters are kept together.

    Parameters
    ----------
    s : the text to tokenize.
    ans_idx : optional character offset into `s` (e.g. SQuAD's `answer_start`).

    Returns
    -------
    The list of tokens when `ans_idx` is None. Otherwise a `(tokens, ans_start)` pair,
    where `ans_start` is the number of tokens completed before character `ans_idx` is
    reached; for an offset at the start of, or inside, a word this is the index of that
    word's token. An offset equal to `len(s)` maps to `len(tokens)`.

    Raises
    ------
    ValueError
        If `ans_idx` lies outside `0..len(s)`. Such offsets, as well as `len(s)` itself,
        previously failed with an UnboundLocalError because `ans_start` was never set.
    """
    if ans_idx is not None and not 0 <= ans_idx <= len(s):
        raise ValueError('ans_idx %r lies outside of the string (length %d)' % (ans_idx, len(s)))

    tokens = []
    buf = u''
    ans_start = None

    for i, c in enumerate(s):
        if i == ans_idx:
            # buf has not been flushed yet, so len(tokens) is the index that the token
            # currently being built (or the next one) is going to get.
            ans_start = len(tokens)
        if c.isspace():
            if buf:
                tokens.append(buf)
            buf = u''
        elif not c.isalnum():
            # A symbol terminates the pending word and is a token by itself.
            if buf:
                tokens.append(buf)
            tokens.append(c)
            buf = u''
        else:
            buf += c
    if buf:
        tokens.append(buf)

    if ans_idx is not None:
        if ans_start is None:
            # Only reachable for ans_idx == len(s): the offset points just past the text.
            ans_start = len(tokens)
        return tokens, ans_start
    return tokens

In [134]:
# Build the training samples: one [answers, question_tok, context_tok] entry per question,
# where answers is a list of ([first_token, last_token], answer_text) pairs.
data_train = []

for article in train['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # Tokenize the context once per paragraph instead of picking the tokens up as a
        # by-product of the answer loop: a question with an empty answer list would
        # otherwise reuse the tokens of an earlier context (or fail with a NameError).
        context_tok = tokenize(context)

        for qa in paragraph['qas']:
            question_tok = tokenize(qa['question'])
            answers = []

            for ans in qa['answers']:
                text = ans['text']
                # Character offset of the answer -> index of its first token.
                ans_start = tokenize(context, ans['answer_start'])[1]
                # The span is as long as the tokenization of the answer text itself.
                ans_end = ans_start + len(tokenize(text)) - 1

                answers.append(([ans_start, ans_end], text))

            # Copy, so that questions of one paragraph do not share a single list object.
            data_train.append([answers, question_tok, list(context_tok)])

In [135]:
# Number of training samples (one per question) before any filtering.
len(data_train)

87599

In [136]:
# Drop the samples whose answer we have no chance of learning because it does not align
# with token boundaries (e.g. the answer is a fragment of a word).
def _first_answer_is_token_aligned(sample):
    """Return True if the context tokens in the first answer's span equal the tokenized answer text."""
    (start, end), text = sample[0][0]
    # Tokens are non-empty and never contain whitespace, so comparing the token lists is
    # equivalent to comparing their space-joined strings.
    return sample[2][start:end + 1] == tokenize(text)

# The original cell iterated over an undefined name (`data`), which only worked thanks to
# leftover kernel state; the list being filtered is data_train.
data_train = [d for d in data_train if _first_answer_is_token_aligned(d)]

In [137]:
# Number of training samples at this point of the notebook.
len(data_train)

87269

In [147]:
# Build the dev samples: one [answers, question_tok, context_tok, Id] entry per question,
# where answers is a list of ([first_token, last_token], answer_text) pairs.
data_dev = []

for article in dev['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # Tokenize the context once per paragraph instead of picking the tokens up as a
        # by-product of the answer loop: a question with an empty answer list would
        # otherwise reuse the tokens of an earlier context (or fail with a NameError).
        context_tok = tokenize(context)

        for qa in paragraph['qas']:
            question_tok = tokenize(qa['question'])
            answers = []

            Id = qa['id']

            for ans in qa['answers']:
                text = ans['text']
                # Character offset of the answer -> index of its first token.
                ans_start = tokenize(context, ans['answer_start'])[1]
                # The span is as long as the tokenization of the answer text itself.
                ans_end = ans_start + len(tokenize(text)) - 1

                answers.append(([ans_start, ans_end], text))

            # Copy, so that questions of one paragraph do not share a single list object.
            data_dev.append([answers, question_tok, list(context_tok), Id])

In [151]:
# Number of dev samples (one per question).
len(data_dev)

10570

In [153]:
# Persist the tokenized samples. Pickle streams are binary data, so the files are opened
# in binary mode; text mode ('w') is only tolerated for protocol 0 on Python 2.
with open(path + 'train.pkl', 'wb') as f:
    pickle.dump(data_train, f)

with open(path + 'dev.pkl', 'wb') as f:
    pickle.dump(data_dev, f)

## Main portion

In [172]:
# Turn every training sample into [[answer_positions], question_ids, context_ids].
# Words missing from w_to_i_840B are mapped to index 0.
train_num = []
for answers, question, context in data_train:
    # Only the first answer is used; its inclusive token span is expanded to positions.
    span_start, span_end = answers[0][0]
    answer_positions = list(range(span_start, span_end + 1))
    question_ids = [w_to_i_840B.get(word, 0) for word in question]
    context_ids = [w_to_i_840B.get(word, 0) for word in context]
    train_num.append([[answer_positions], question_ids, context_ids])

In [175]:
# Turn every dev sample into [[], question_ids, context_ids]; the answer slot stays empty.
# Words missing from w_to_i_840B are mapped to index 0.
dev_num = []
for _, question, context, _ in data_dev:
    question_ids = [w_to_i_840B.get(word, 0) for word in question]
    context_ids = [w_to_i_840B.get(word, 0) for word in context]
    dev_num.append([[], question_ids, context_ids])

In [177]:
# Persist the word-index versions of both splits. Pickle streams are binary data, so the
# files are opened in binary mode; text mode ('w') is only tolerated for protocol 0 on Python 2.
with open(path + 'train_words.pkl', 'wb') as f:
    pickle.dump(train_num, f)

with open(path + 'dev_words.pkl', 'wb') as f:
    pickle.dump(dev_num, f)

## Binary features

In [179]:
def make_bin_feats(sample):
    """Flag, for every context token, whether the same token occurs in the question.

    `sample[1]` holds the question tokens and `sample[2]` the context tokens; the result
    is a list of booleans with one entry per context token.
    """
    question, context = sample[1:3]
    question_vocab = frozenset(question)
    flags = []
    for token in context:
        flags.append(token in question_vocab)
    return flags

In [180]:
# One list of booleans per sample, in the same order as the samples themselves.
train_bin_feats = [make_bin_feats(sample) for sample in data_train]
dev_bin_feats = [make_bin_feats(sample) for sample in data_dev]

In [183]:
# Persist the binary features. Pickle streams are binary data, so the files are opened
# in binary mode; text mode ('w') is only tolerated for protocol 0 on Python 2.
with open(path + 'train_bin_feats.pkl', 'wb') as f:
    pickle.dump(train_bin_feats, f)

with open(path + 'dev_bin_feats.pkl', 'wb') as f:
    pickle.dump(dev_bin_feats, f)

## Characters

In [193]:
# Character ids with a special meaning:
#   0 - unknown character
#   1 - start-of-word marker
#   2 - end-of-word marker
#   3 - not_a_word char (added later, in wikipedia negative samples)
# Per the original note there are no 1s or 2s in the data (presumably the characters
# with codes 1 and 2), so using these ids as markers is safe.

chars = list(map(unichr, xrange(128)))
c_to_i = dict((ch, code) for code, ch in enumerate(chars))

def get_char_nums_for_word(w):
    """Encode word `w` as a list of character ids wrapped in the markers 1 (start) and 2 (end).

    Characters that are missing from `c_to_i` are encoded as 0.
    """
    encoded = [1]
    encoded.extend(c_to_i.get(ch, 0) for ch in w)
    encoded.append(2)
    return encoded

In [198]:
# Character-level version of the training set: [question_chars, context_chars] per sample,
# each holding one list of character ids per token.
train_char = []

for _, question, context in data_train:
    question_chars = [get_char_nums_for_word(word) for word in question]
    context_chars = [get_char_nums_for_word(word) for word in context]
    train_char.append([question_chars, context_chars])

In [199]:
# Character-level version of the dev set: [question_chars, context_chars] per sample,
# each holding one list of character ids per token.
dev_char = []

for _, question, context, _ in data_dev:
    question_chars = [get_char_nums_for_word(word) for word in question]
    context_chars = [get_char_nums_for_word(word) for word in context]
    dev_char.append([question_chars, context_chars])

In [200]:
# Persist the character-level data. Pickle streams are binary data, so the files are opened
# in binary mode; text mode ('w') is only tolerated for protocol 0 on Python 2.
with open(path + 'train_char_ascii.pkl', 'wb') as f:
    pickle.dump(train_char, f)

with open(path + 'dev_char_ascii.pkl', 'wb') as f:
    pickle.dump(dev_char, f)