In [1]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain

### Read data

In [2]:
# Parse the SQuAD v1.1 training JSON file into `train`.
with open('/pio/data/data/squad/train-v1.1.json') as train_json:
    train = json.load(train_json)

In [3]:
# Parse the SQuAD v1.1 dev JSON file into `dev`.
with open('/pio/data/data/squad/dev-v1.1.json') as dev_json:
    dev = json.load(dev_json)

In [None]:
# Choose the GloVe release that provides the vocabulary and build the path of
# its pickled word list.
glove_ver = '840B'
glove_words_fname = 'glove.' + glove_ver + '.wordlist.pkl'
glove_words_path  = '/pio/data/data/glove_vec/' + glove_ver + '/glove/' + glove_words_fname

# The .pkl word list is read with pickle directly: np.load only unpickles when
# allow_pickle is enabled, which newer NumPy releases do not do by default.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(glove_words_path, 'rb') as f:
    glove_words = pickle.load(f)

# The loaded object is used as an index -> word mapping (it must provide
# .items()); inverting it gives word -> index.
i_to_w = glove_words
w_to_i = {v:k for (k,v) in i_to_w.items()}

def words_to_num(s):
    """Map every word of `s` to its index in `w_to_i`.

    Words missing from `w_to_i` get the index stored under '<unk>'.
    """
    def lookup(word):
        return w_to_i.get(word, w_to_i['<unk>'])

    return map(lookup, s)

In [164]:
def make_bin_feats(sample):
    """Flag which context tokens also occur in the question.

    `sample` is a (question tokens, context tokens) pair. The result is a list
    with one boolean per context token, True when that token is in the question.
    """
    question, context = sample
    question_vocab = frozenset(question)
    flags = []
    for token in context:
        flags.append(token in question_vocab)
    return flags

# Preprocess training set

### Grab all the question-answer pairs and create a wordlist

In [3]:
data = []
lower = lambda x: x.lower()

# Traverse articles -> paragraphs -> questions and collect one record per
# question: [answers, question tokens, context tokens].
for article in train['data']:
    title = article['title']  # read but not used further in this cell

    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # Tokenized once per paragraph; every question of the paragraph
        # receives this same list object.
        context_tok = map(lower, nltk.word_tokenize(context))

        for qa in paragraph['qas']:
            question = qa['question']
            question_tok = map(lower, nltk.word_tokenize(question))

            Id = qa['id']  # read but not stored for the training set

            # Each answer is kept as (answer_start from the JSON, lower-cased tokens).
            answers = [
                (ans['answer_start'], map(lower, nltk.word_tokenize(ans['text'])))
                for ans in qa['answers']
            ]

            data.append([answers, question_tok, context_tok])

In [5]:
# Persist the tokenized training records. A pickle is a byte stream, so the
# file is opened in binary mode ('wb'); text mode can corrupt the stream on
# platforms that translate newlines.
# NOTE: a later cell writes the numeric version of the data to this same path.
with open('/pio/data/data/squad/train.pkl', 'wb') as f:
    pickle.dump(data, f)

In [23]:
# One boolean per context token: does the token also occur in the question?
# d[1:] is the [question tokens, context tokens] part of each record.
bin_feats = map(make_bin_feats, [d[1:] for d in data])

# Binary mode ('wb') keeps the pickle byte stream intact on every platform.
with open('/pio/data/data/squad/train_bin_feats.pkl', 'wb') as f:
    pickle.dump(bin_feats, f)

### Turn words into numbers

In [8]:
def split_on_dot(s):
    """Group a token sequence into sentences that end at '.' tokens.

    The '.' token stays at the end of its sentence. Tokens after the last '.'
    form a final sentence; no empty trailing sentence is returned, so an empty
    input yields an empty list.
    """
    sentences = []
    current = []
    for token in s:
        current.append(token)
        if token == u'.':
            # Sentence boundary: close the running sentence and start a new one.
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

In [9]:
# Replace each record's flat context token list with a list of sentences.
# Not idempotent: re-running this on already split contexts nests them again.
for record in data:
    record[2] = split_on_dot(record[2])

In [11]:
data_num = []

# Convert every token of every record to its vocabulary index; the record
# layout stays [answers, question, context sentences].
for answer_pairs, question, sentences in data:
    answers = [(pair[0], words_to_num(pair[1])) for pair in answer_pairs]
    numeric_question = words_to_num(question)
    numeric_sentences = map(words_to_num, sentences)
    data_num.append([answers, numeric_question, numeric_sentences])

In [14]:
# Reshape every record to [answer index lists, [question] + sentences]:
# the first element of each answer pair is dropped, and the question becomes
# element 0 of the second slot, followed by the context sentences.
# Not idempotent: it expects the three-slot records built in the previous cell.
data_num = [
    [[pair[1] for pair in rec[0]], [rec[1]] + rec[2]]
    for rec in data_num
]

In [13]:
# There are some broken answers, because of the tokenizer (I count words instead of characters)
#
# Counts the tokens of each record's first answer that do not occur anywhere
# in that record's context. The context is flattened into a set once per
# record instead of being rebuilt as a list for every answer token.

k = 0
for a, q in data_num:
    # q[0] is the question; q[1:] are the context sentences.
    context_words = set(chain(*q[1:]))
    for w in a[0]:
        if w not in context_words:
            k += 1
k

1028

### Find answer indices on words, not characters

In [14]:
inds = []

# For each answer, find its first occurrence as a contiguous run of tokens in
# the flattened context and record the covered token positions. Answers that
# never occur contribute nothing to the record's list.
for answer_seqs, question_and_sents in data_num:
    flat_context = list(chain(*question_and_sents[1:]))
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        start = next(
            (pos for pos in xrange(len(flat_context))
             if seq == flat_context[pos:pos + width]),
            None,
        )
        if start is not None:
            spans.append(list(xrange(start, start + width)))
    inds.append(spans)

In [15]:
# Replace the answer token lists with the word-level index spans found above.
for idx, record in enumerate(data_num):
    record[0] = inds[idx]

In [170]:
# This file has a lot of redundant parts, context is repeated for each question.
# It only slows down the initial loading.
# Opened in binary mode ('wb') because pickle output is a byte stream.

with open('/pio/data/data/squad/train.pkl', 'wb') as f:
    pickle.dump(data_num, f)

# Preprocess dev set

In [22]:
data_dev = []
lower = lambda x: x.lower()

# Traverse articles -> paragraphs -> questions and collect one record per
# question: [answers, question tokens, context tokens, question id].
for article in dev['data']:
    title = article['title']  # read but not used further in this cell

    for paragraph in article['paragraphs']:
        context = paragraph['context']
        # Tokenized once per paragraph; every question of the paragraph
        # receives this same list object.
        context_tok = map(lower, nltk.word_tokenize(context))

        for qa in paragraph['qas']:
            question = qa['question']
            question_tok = map(lower, nltk.word_tokenize(question))

            Id = qa['id']

            # Each answer is kept as (answer_start from the JSON, lower-cased tokens).
            answers = [
                (ans['answer_start'], map(lower, nltk.word_tokenize(ans['text'])))
                for ans in qa['answers']
            ]

            data_dev.append([answers, question_tok, context_tok, Id])

In [43]:
# Persist the tokenized dev records. Binary mode ('wb') is used because a
# pickle is a byte stream; text mode can corrupt it on platforms that
# translate newlines.
with open('/pio/data/data/squad/dev.pkl', 'wb') as f:
    pickle.dump(data_dev, f)

In [37]:
# One boolean per context token: does the token also occur in the question?
# d[1:3] is the [question tokens, context tokens] part of each dev record
# (the trailing question id is left out).
dev_bin_feats = map(make_bin_feats, [d[1:3] for d in data_dev])

# Binary mode ('wb') keeps the pickle byte stream intact on every platform.
with open('/pio/data/data/squad/dev_bin_feats.pkl', 'wb') as f:
    pickle.dump(dev_bin_feats, f)

In [122]:
# Replace each dev record's flat context token list with a list of sentences.
# Not idempotent: re-running this on already split contexts nests them again.
for record in data_dev:
    record[2] = split_on_dot(record[2])

In [125]:
data_num_dev = []

# Convert every token of every dev record to its vocabulary index; the
# question id (fourth slot) is not carried over.
for answer_pairs, question, sentences, _ in data_dev:
    answers = [(pair[0], words_to_num(pair[1])) for pair in answer_pairs]
    numeric_question = words_to_num(question)
    numeric_sentences = map(words_to_num, sentences)
    data_num_dev.append([answers, numeric_question, numeric_sentences])

In [126]:
# Reshape every dev record to [answer index lists, [question] + sentences]:
# the first element of each answer pair is dropped, and the question becomes
# element 0 of the second slot, followed by the context sentences.
# Not idempotent: it expects the three-slot records built in the previous cell.
data_num_dev = [
    [[pair[1] for pair in rec[0]], [rec[1]] + rec[2]]
    for rec in data_num_dev
]

In [127]:
inds = []

# For each dev answer, find its first occurrence as a contiguous run of tokens
# in the flattened context and record the covered token positions. Answers
# that never occur contribute nothing to the record's list.
for answer_seqs, question_and_sents in data_num_dev:
    flat_context = list(chain(*question_and_sents[1:]))
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        start = next(
            (pos for pos in xrange(len(flat_context))
             if seq == flat_context[pos:pos + width]),
            None,
        )
        if start is not None:
            spans.append(list(xrange(start, start + width)))
    inds.append(spans)

# Replace the answer token lists with the word-level index spans.
for idx, record in enumerate(data_num_dev):
    record[0] = inds[idx]

In [132]:
# Persist the numeric dev records. Binary mode ('wb') is used because a pickle
# is a byte stream.
# NOTE: a later cell reloads this file, flattens the contexts and writes the
# result back to this same path.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(data_num_dev, f)

## Fun with characters

In [37]:
# Character vocabulary: the 128 ASCII code points, each mapped to its own code.
# Reserved indices:
# 0 - unk
# 1 - start
# 2 - end
# 3 - not_a_word char (added later, in wikipedia negative samples)
# there are no 1s or 2s in data, so these are safe

chars = map(unichr, xrange(128))
c_to_i = dict((ch, code) for code, ch in enumerate(chars))

In [24]:
data_char = []

# Encode every question/context word as [1] + character indices + [2]
# (1 = start marker, 2 = end marker, 0 for characters missing from c_to_i).
for _, q, x in data:
    # When the notebook is run top to bottom, the context has already been
    # split into sentences by split_on_dot. Flatten it back to a word list so
    # that the inner loop iterates over characters of words, not over words of
    # sentences.
    if x and isinstance(x[0], list):
        x = list(chain(*x))
    q_char = [[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in q]
    x_char = [[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in x]
    data_char.append([q_char, x_char])

In [42]:
# Persist the character-level training data. Binary mode ('wb') is used
# because a pickle is a byte stream.
with open('/pio/data/data/squad/train_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_char, f)

In [25]:
data_dev_char = []

# Encode every question/context word as [1] + character indices + [2]
# (1 = start marker, 2 = end marker, 0 for characters missing from c_to_i).
for _, q, x, _ in data_dev:
    # When the notebook is run top to bottom, the context has already been
    # split into sentences by split_on_dot. Flatten it back to a word list so
    # that the inner loop iterates over characters of words, not over words of
    # sentences.
    if x and isinstance(x[0], list):
        x = list(chain(*x))
    q_char = [[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in q]
    x_char = [[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in x]
    data_dev_char.append([q_char, x_char])

In [43]:
# Persist the character-level dev data. Binary mode ('wb') is used because a
# pickle is a byte stream.
with open('/pio/data/data/squad/dev_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_dev_char, f)

# SQuAD data with glove dictionary

### add unk to glove

In [77]:
# Load the GloVe 6B embedding matrix and its word list (first
# whitespace-separated field of every line).
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_words.append(line.split()[0])

# Prepend an '<unk>' entry whose vector is the mean of all embeddings.
# A later cell writes the result back to these same files, so only add the
# entry when it is not already at index 0; otherwise re-running this cell
# would stack a second '<unk>' row on top of the first.
if not glove_words or glove_words[0] != '<unk>':
    glove_words.insert(0, '<unk>')
    glove_vec = np.vstack([glove_vec.mean(axis=0), glove_vec])

In [78]:
# Index <-> word lookups for the GloVe 6B vocabulary: the list itself serves
# as index -> word, the dict as word -> index.
glove_i_to_w = glove_words
glove_w_to_i = dict((word, idx) for idx, word in enumerate(glove_words))

In [85]:
# Write the extended embedding matrix and word list back to disk.
# NOTE: these are the same files the GloVe 6B loading cell reads (np.save
# appends the '.npy' suffix), so the inputs are overwritten in place.
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as wordlist_file:
    # One word per line.
    wordlist_file.writelines(unicode(word + '\n') for word in glove_words)

### make train and dev set with glove dict

#### train

In [100]:
# train.pkl is written with pickle.dump earlier in this notebook, so read it
# with pickle directly: np.load only unpickles when allow_pickle is enabled,
# which newer NumPy releases do not do by default.
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    train_set = pickle.load(f)

In [101]:
# Originally contexts are split into sentences, this reverses that.
# Each record goes from [answers, [question] + sentences] to
# [answers, question, flat context].
# Not idempotent: it expects the sentence-split layout as input.
for record in train_set:
    question_and_sents = record[1]
    record.append(list(chain(*question_and_sents[1:])))
    record[1] = question_and_sents[0]

In [103]:
# Re-index the question and context tokens (every slot after the answers)
# from the i_to_w vocabulary to the GloVe 6B vocabulary, in place. Words that
# are missing from glove_w_to_i get index 0.
for record in train_set:
    for token_ids in record[1:]:
        for pos, old_id in enumerate(token_ids):
            token_ids[pos] = glove_w_to_i.get(i_to_w[old_id], 0)

In [105]:
# Persist the training set indexed with the GloVe 6B vocabulary. Binary mode
# ('wb') is used because a pickle is a byte stream.
with open('/pio/data/data/squad/train_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(train_set, f)

#### dev

In [133]:
# already done above

In [138]:
# dev_with_glove_vocab.pkl is written with pickle.dump earlier in this
# notebook, so read it with pickle directly: np.load only unpickles when
# allow_pickle is enabled, which newer NumPy releases do not do by default.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'rb') as f:
    dev_set = pickle.load(f)

In [139]:
# Originally contexts are split into sentences, this reverses that.
# Each record goes from [answers, [question] + sentences] to
# [answers, question, flat context].
# Not idempotent: it expects the sentence-split layout as input.
for record in dev_set:
    question_and_sents = record[1]
    record.append(list(chain(*question_and_sents[1:])))
    record[1] = question_and_sents[0]

In [141]:
# Write the flattened dev set back. Binary mode ('wb') is used because a
# pickle is a byte stream.
# NOTE: this overwrites the file that was loaded a few cells above, so the
# load -> flatten -> save sequence must not be run a second time on it.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(dev_set, f)