In [1]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain

### Read data

In [2]:
# Load the SQuAD v1.1 training split (the file is decoded as UTF-8).
with io.open('/pio/data/data/squad/train-v1.1.json', mode='r', encoding='utf-8') as train_json:
    train = json.loads(train_json.read())

In [2]:
# Load the SQuAD v1.1 dev split (the file is decoded as UTF-8).
with io.open('/pio/data/data/squad/dev-v1.1.json', mode='r', encoding='utf-8') as dev_json:
    dev = json.loads(dev_json.read())

### Data structure

In [4]:
# Show the reference answers of the first question of the first dev paragraph.
dev['data'][0]['paragraphs'][0]['qas'][0]['answers']

[{u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'}]

In [95]:
# Number of fields stored for a single question entry.
len(dev['data'][0]['paragraphs'][0]['qas'][0])

3

In [96]:
# Names of the fields stored for a single question entry.
dev['data'][0]['paragraphs'][0]['qas'][0].keys()

[u'question', u'id', u'answers']

In [190]:
# Preview a naive sentence split: tokenise one training context, re-join the
# tokens with spaces and cut the text at every stand-alone ' . ' token.
' '.join(nltk.word_tokenize(train['data'][10]['paragraphs'][60]['context'])).split(' . ')

[u"The State Council declared a three-day period of national mourning for the quake victims starting from May 19 , 2008 ; the PRC 's National Flag and Regional Flags of Hong Kong and Macau Special Administrative Regions flown at half mast",
 u'It was the first time that a national mourning period had been declared for something other than the death of a state leader , and many have called it the biggest display of mourning since the death of Mao Zedong',
 u'At 14:28 CST on May 19 , 2008 , a week after the earthquake , the Chinese public held a moment of silence',
 u'People stood silent for three minutes while air defense , police and fire sirens , and the horns of vehicles , vessels and trains sounded',
 u"Cars and trucks on Beijing 's roads also came to a halt",
 u"People spontaneously burst into cheering `` Zhongguo jiayou ! '' ( Let 's go , China ! ) and `` Sichuan jiayou '' ( Let 's go , Sichuan ! ) afterwards ."]

# Glove

### Save glove vectors as npy

In [37]:
# Parse the GloVe text file into a single float32 array with one row per line.
# The first whitespace-separated field of every line is the word and is skipped
# here; the remaining fields are the vector components.
glove_rows = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Convert the numeric fields directly instead of round-tripping every
        # line through np.matrix's string parser.
        glove_rows.append(np.array([float(v) for v in line.split()[1:]], dtype=np.float32))

# Each row is 1-D, so vstack produces a plain 2-D ndarray (not an np.matrix).
glove_vec = np.vstack(glove_rows)

In [40]:
# Cache the parsed vectors in NumPy's binary format; np.save appends the
# '.npy' extension to this path.
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

### Glove words

In [10]:
# Build the GloVe word list: the first field of every line of the GloVe text
# file, in file order, then store it one word per line.

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    glove_words = [line.split()[0] for line in f]

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(w + u'\n' for w in glove_words)

# Preprocess training set

### Grab all the question-answer pairs and create a wordlist

In [4]:
# Tokenise and lower-case every context, question and answer of the training
# set. `words` collects the vocabulary of contexts and questions; `data` gets
# one [answers, question_tokens, context_tokens] record per question.
words = set()
data = []
lower = lambda x: x.lower()
tokenize = lambda raw: map(lower, nltk.word_tokenize(raw))

for par in train['data']:
    title = par['title']

    for con in par['paragraphs']:
        context = con['context']
        context_tok = tokenize(context)
        words.update(context_tok)

        for q in con['qas']:
            question = q['question']
            question_tok = tokenize(question)
            words.update(question_tok)

            Id = q['id']

            # One (character offset, answer tokens) pair per reference answer.
            answers = [(ans['answer_start'], tokenize(ans['text']))
                       for ans in q['answers']]

            data.append([answers, question_tok, context_tok])

# Placeholder entry for out-of-vocabulary words.
words.add('<unk>')

### Turn words into numbers

In [8]:
# Number the vocabulary in the set's iteration order and build both lookup
# directions (index -> word and word -> index).
i_to_w = {idx: word for idx, word in enumerate(words)}
w_to_i = dict((word, idx) for idx, word in i_to_w.items())

In [109]:
def split_on_dot(s):
    """Split a token sequence into sentences at every '.' token.

    The '.' token stays at the end of the sentence it closes. Tokens after the
    last '.' form a final sentence; no empty trailing sentence is returned, so
    an empty input yields an empty list.
    """
    sentences = []
    current = []
    for token in s:
        current.append(token)
        if token == u'.':
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

def words_to_num(s):
    """Map every word of `s` to its index in `w_to_i`.

    Words missing from `w_to_i` get the index of '<unk>'.
    """
    return [w_to_i.get(word, w_to_i['<unk>']) for word in s]

In [10]:
# Replace every flat context token list by a list of sentences.
for record in data:
    record[2] = split_on_dot(record[2])

In [11]:
# Convert answers, question and context sentences from words to vocabulary
# indices, keeping the [answers, question, sentences] layout of `data`.
data_num = []

for a, q, c in data:
    answers = [(start, words_to_num(toks)) for start, toks in a]
    data_num.append([answers, words_to_num(q), [words_to_num(sent) for sent in c]])

In [12]:
# Reshape every record to [answer_token_ids, [question] + context_sentences]:
# the answer character offsets are dropped and the question becomes the first
# element of the sentence list.
data_num = [
    [[ids for _start, ids in ans_list], [q_ids] + sents]
    for ans_list, q_ids, sents in data_num
]

In [13]:
# Some answers are broken by the tokenizer (answers are matched on words, not
# on characters). Count the tokens of the first answer of every example that
# do not occur anywhere in that example's context.

k = 0
for a, q in data_num:
    # q[0] is the question; the remaining entries are the context sentences.
    # Build the lookup once per example instead of once per answer token.
    context_ids = set(chain(*q[1:]))
    for w in a[0]:
        if w not in context_ids:
            k += 1
k

1028

### Find answer indices on words, not characters

In [14]:
# For every example, locate each answer inside the flattened context and
# record the token positions of its first occurrence. Answers that do not
# occur as a contiguous token sequence contribute nothing.
inds = []

for answer_seqs, sentences in data_num:
    # sentences[0] is the question; the rest is the context.
    context_ids = list(chain(*sentences[1:]))
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        for start in xrange(len(context_ids)):
            if context_ids[start:start + width] == seq:
                spans.append(list(xrange(start, start + width)))
                break
    inds.append(spans)

In [15]:
# Replace the answer token ids of every example by the positions found above.
for pos, row in enumerate(data_num):
    row[0] = inds[pos]

# Predictions

In [24]:
# Load the model predictions stored under the 'arr_0' key of the .npz archive.
preds = np.load('evaluate/glove_vocab/dev_with_glove_vocab_predictions_charemb_all_fixed_ep3.npz')['arr_0']

In [25]:
# Write the predictions as one JSON object mapping question id -> answer text.
# Keys and values are serialised with json.dumps so that quotes, backslashes
# or control characters inside an answer cannot produce invalid JSON.
with io.open('evaluate/glove_vocab/dev_with_glove_vocab_predictions_charemb_all_fixed_ep3.txt', 'w', encoding='utf-8') as f:
    entries = []
    for i in xrange(len(data_dev)):
        # preds[i] is an inclusive (start, end) pair of positions in data_dev[i][2].
        ans = u' '.join(data_dev[i][2][preds[i][0]:preds[i][1] + 1])
        Id = data_dev[i][3]
        entries.append(u'{}: {}'.format(json.dumps(Id, ensure_ascii=False),
                                        json.dumps(ans, ensure_ascii=False)))
    f.write(u'{' + u', '.join(entries) + u'}')

# Preprocess dev set

In [3]:
# Tokenise and lower-case every context, question and answer of the dev set.
# `words_dev` collects the vocabulary of contexts and questions; `data_dev`
# gets one [answers, question_tokens, context_tokens, question_id] record per
# question.
words_dev = set()
data_dev = []
lower = lambda x: x.lower()
tokenize = lambda raw: map(lower, nltk.word_tokenize(raw))

for par in dev['data']:
    title = par['title']

    for con in par['paragraphs']:
        context = con['context']
        context_tok = tokenize(context)
        words_dev.update(context_tok)

        for q in con['qas']:
            question = q['question']
            question_tok = tokenize(question)
            words_dev.update(question_tok)

            Id = q['id']

            # One (character offset, answer tokens) pair per reference answer.
            answers = [(ans['answer_start'], tokenize(ans['text']))
                       for ans in q['answers']]

            data_dev.append([answers, question_tok, context_tok, Id])

# Placeholder entry for out-of-vocabulary words.
words_dev.add('<unk>')

In [16]:
# Replace dev context tokens that are absent from the training vocabulary
# (`words`) by '<unk>'. Only the context (index 2) is touched.
for row in data_dev:
    row[2] = ['<unk>' if tok not in words else tok for tok in row[2]]

In [122]:
# Replace every flat dev context token list by a list of sentences.
for record in data_dev:
    record[2] = split_on_dot(record[2])

In [123]:
def words_to_num(s):
    """Map every word of `s` to its index in `glove_w_to_i` (0 when unknown).

    NOTE: this redefines the `words_to_num` declared earlier in this notebook,
    which looks words up in the training vocabulary `w_to_i` instead.
    """
    return [glove_w_to_i.get(word, 0) for word in s]

In [125]:
# Convert dev answers, questions and context sentences from words to indices;
# the question id (4th field of each record) is not carried over.
data_num_dev = []

for a, q, c, _ in data_dev:
    answers = [(start, words_to_num(toks)) for start, toks in a]
    data_num_dev.append([answers, words_to_num(q), [words_to_num(sent) for sent in c]])

In [126]:
# Reshape every dev record to [answer_token_ids, [question] + context_sentences]:
# the answer character offsets are dropped and the question becomes the first
# element of the sentence list.
data_num_dev = [
    [[ids for _start, ids in ans_list], [q_ids] + sents]
    for ans_list, q_ids, sents in data_num_dev
]

In [127]:
# For every dev example, locate each answer inside the flattened context and
# record the token positions of its first occurrence. Answers that do not
# occur as a contiguous token sequence contribute nothing.
inds = []

for answer_seqs, sentences in data_num_dev:
    # sentences[0] is the question; the rest is the context.
    context_ids = list(chain(*sentences[1:]))
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        for start in xrange(len(context_ids)):
            if context_ids[start:start + width] == seq:
                spans.append(list(xrange(start, start + width)))
                break
    inds.append(spans)

# Replace the answer token ids of every example by the positions found.
for pos, row in enumerate(data_num_dev):
    row[0] = inds[pos]

In [49]:
# data_num_dev = [[d[0]] + map(words_to_num, d[1:]) for d in data_dev]

In [132]:
# Persist the index-encoded dev set. Binary mode keeps the bytes written
# identical to the bytes read back, independent of platform newline handling.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(data_num_dev, f)

### Get Glove vectors for words in data

In [6]:
# Load the cached GloVe vectors and the matching word list (one word per line).
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    glove_words = [line.split()[0] for line in f]

In [7]:
# Index <-> word lookups for GloVe; the word list itself serves as index -> word.
glove_i_to_w = glove_words
glove_w_to_i = dict((word, idx) for idx, word in enumerate(glove_words))

In [68]:
# Embedding matrix with one 300-dimensional float32 row per vocabulary word,
# initialised to zeros; the last line displays its shape.
embs = np.zeros((len(w_to_i), 300), dtype=np.float32)
embs.shape

(102802, 300)

In [24]:
# Vocabulary indices whose word has a GloVe vector; the last line shows how many.
vocab_size = len(w_to_i)
known_inds = [idx for idx in xrange(vocab_size) if i_to_w[idx] in glove_w_to_i]
len(known_inds)

73351

In [25]:
# Complement of `known_inds`: vocabulary indices without a GloVe vector,
# kept both as a set and as an ascending list.
s_known = set(known_inds)
s_unknown = set(xrange(len(w_to_i))) - s_known
unknown_inds = sorted(s_unknown)

In [26]:
# Load the dev set encoded with the training vocabulary.
# NOTE(review): the cell that writes this file is not part of the visible
# notebook; presumably it is a plain pickle read through np.load's pickle
# fallback — confirm, and prefer pickle.load if so.
dev_num = np.load('/pio/data/data/squad/dev_with_training_vocab.pkl')

In [52]:
# train.pkl is written with pickle.dump in this notebook, so read it with
# pickle directly instead of relying on np.load's pickle fallback (which newer
# NumPy versions refuse unless allow_pickle=True is passed).
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    train_num = pickle.load(f)

In [54]:
# Training-vocabulary words that never occur in the dev vocabulary.
not_in_dev = words - words_dev

In [28]:
# Index of the '<unk>' placeholder in the training vocabulary.
w_to_i['<unk>']

4445

In [59]:
# Replace, in place, every training token whose word never occurs in the dev
# set by the '<unk>' index. The index is looked up rather than hard-coded so
# the cell stays correct if the vocabulary numbering changes.
unk_id = w_to_i['<unk>']

for example in train_num:
    # example[1] holds the question followed by the context sentences.
    for sentence in example[1]:
        for pos, word_id in enumerate(sentence):
            if i_to_w[word_id] in not_in_dev:
                sentence[pos] = unk_id

In [62]:
# Persist the training set with dev-unseen words replaced by '<unk>'. Binary
# mode keeps the pickle bytes independent of platform newline handling.
with open('/pio/data/data/squad/train_with_unks.pkl', 'wb') as f:
    pickle.dump(train_num, f)

In [78]:
# Copy the GloVe vector of every vocabulary word known to GloVe into `embs`.
embs[known_inds] = glove_vec[[glove_w_to_i[i_to_w[i]] for i in known_inds]]
# Fill the rows of words without a GloVe vector from an initialiser.
# NOTE(review): `L` is not imported anywhere in the visible notebook —
# presumably a neural-network library alias exposing `init.Normal`; confirm
# and add the import to the first cell.
embs[unknown_inds] = L.init.Normal()((len(unknown_inds), 300))

## Some stats

In [55]:
# Count the dev tokens (fields 1 and 2 of each record, i.e. question and
# context) that are present in the training vocabulary `w_to_i`.
# NOTE(review): assumes the context is still a flat token list here — confirm.
len([w for d in data_dev for w in list(chain(*d[1:3])) if w in w_to_i])

1569040

In [36]:
# Fraction (not a percentage) of dev tokens that become <unk>.
# 51889 = 1620929 - 1569040, i.e. the tokens not covered by the in-vocabulary count.
51889. / 1620929

0.03201188947819429

In [49]:
# Fraction (not a percentage) of dev tokens covered by the training vocabulary.
1569040. / 1620929

0.9679881105218057

In [47]:
# Fraction (not a percentage) of dev tokens covered by the GloVe vocabulary.
# NOTE(review): the numerator is hard-coded from an earlier run that is not
# shown in this notebook.
1600564. / 1620929

0.9874362171322741

In [46]:
# Fraction of dev question tokens that are <unk>.
# NOTE(review): both counts are hard-coded from an earlier run that is not
# shown in this notebook.
2632. / 120950

0.021761058288548987

In [58]:
# Fraction of training tokens that are "no-devs" — presumably tokens whose
# word never occurs in the dev set (TODO confirm; counts are hard-coded).
1070257. / 13061165

0.08194192478236054

In [40]:
# Fraction of training tokens that are "no-gloves" — presumably tokens whose
# word has no GloVe vector (TODO confirm; counts are hard-coded).
164757. / 13061165

0.01261426526653633

### Save processed data

In [165]:
# Vocabulary words ordered by their index.
sorted_words = sorted(w_to_i, key=w_to_i.get)

In [166]:
# Store the vocabulary one word per line, so the line number equals the index.
with io.open('/pio/data/data/squad/wordlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + u'\n' for word in sorted_words)

In [170]:
# This file has a lot of redundant parts: the context is repeated for each
# question. That only slows down the initial loading.
# Binary mode keeps the pickle bytes independent of platform newline handling.

with open('/pio/data/data/squad/train.pkl', 'wb') as f:
    pickle.dump(data_num, f)

In [2]:
# train.pkl is written with pickle.dump in this notebook, so read it with
# pickle directly instead of relying on np.load's pickle fallback (which newer
# NumPy versions refuse unless allow_pickle=True is passed).
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    data = pickle.load(f)

In [12]:
# Rebuild the vocabulary lookups from the word list: the line number of a word
# is its index.
# NOTE(review): the word list is written to 'wordlist.txt' elsewhere in this
# notebook but read from 'train_wordlist.txt' here — confirm the file name.
w_to_i = {}

with io.open('/pio/data/data/squad/train_wordlist.txt', 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        # Strip only the line terminator; slicing off the last character would
        # truncate the final word if the file does not end with a newline.
        w_to_i[line.rstrip(u'\n')] = idx

i_to_w = {v: k for (k, v) in w_to_i.items()}

In [13]:
# Length of the second field of every example (the question plus its context
# sentences), i.e. the number of sequences per example.
lens = np.array([len(example[1]) for example in data])

In [27]:
def show_data(idx):
    """Print every sequence of example `idx` as space-separated words, one per line.

    The sequences are read from `data[idx][1]` and decoded with `i_to_w`.
    """
    for sequence in data[idx][1]:
        decoded = [i_to_w[x] for x in sequence]
        print(' '.join(decoded))

In [28]:
# Print example 60023 as readable text.
show_data(60023)

what is the largest hottest continuously large area worldwide ?
the sky is usually clear above the desert and the sunshine duration is extremely high everywhere in the sahara .
most of the desert enjoys more than 3,600 h of bright sunshine annually or over 82 % of the time and a wide area in the eastern part experiences in excess of 4,000 h of bright sunshine a year or over 91 % of the time , and the highest values are very close to the theoretical maximum value .
a value of 4,300 h or 98 % of the time would be recorded in upper egypt ( aswan , luxor ) and in the nubian desert ( wadi halfa ) .
the annual average direct solar irradiation is around 2,800 kwh/ ( m2 year ) in the great desert .
the sahara has a huge potential for solar energy production .
the constantly high position of the sun , the extremely low relative humidity , the lack of vegetation and rainfall make the great desert the hottest continuously large area worldwide and certainly the hottest place on earth during summer

In [14]:
# Largest number of sequences in one example, then the histogram of counts.
print(max(lens))
np.bincount(lens)

28


array([    0,     0,  1665,  6015, 12958, 18663, 16891, 12176,  7727,
        4709,  2711,  1677,   859,   636,   352,   223,   166,    60,
          39,    24,     9,     0,     5,    19,     5,     0,     0,
           0,    10])

## Fun with characters

In [None]:
# i_to_c = list(glove_chars)
# c_to_i = {v:k for (k,v) in i_to_c.items()}

In [None]:
# `data` should be in the state it has right after running the cell that initialises `words`

In [18]:
# chars = {c for d in data for s in d[1:] for w in s for c in w}

In [46]:
# chars_dev = {c for d in data_dev for s in d[1:3] for w in s for c in w}

In [42]:
# chars.add('<unk>')

In [37]:
# Character alphabet: the 128 ASCII code points, indexed by their code.
# Three codes are reused as special markers:
#   0 - unknown character
#   1 - start of word
#   2 - end of word
# There are no 1s or 2s in the data, so these are safe.

chars = map(unichr, xrange(128))

In [38]:
# Index <-> character lookups; the alphabet list itself serves as index -> char.
i_to_c = chars
c_to_i = dict((ch, code) for code, ch in enumerate(chars))

In [24]:
# Encode every word of the question and of the context as a list of character
# ids framed by 1 (start) and 2 (end); characters outside the alphabet map to 0.
data_char = []

for _, q, x in data:
    encoded = []
    for seq in (q, x):
        encoded.append([[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in seq])
    data_char.append(encoded)

In [25]:
# Encode every word of the dev question and context as a list of character
# ids framed by 1 (start) and 2 (end); characters outside the alphabet map to 0.
data_dev_char = []

for _, q, x, _ in data_dev:
    encoded = []
    for seq in (q, x):
        encoded.append([[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in seq])
    data_dev_char.append(encoded)

In [41]:
# Peek at the question and context tokens of the first dev example.
data_dev[0][1:3]

[[u'which',
  u'nfl',
  u'team',
  u'represented',
  u'the',
  u'afc',
  u'at',
  u'super',
  u'bowl',
  u'50',
  u'?'],
 [u'super',
  u'bowl',
  u'50',
  u'was',
  u'an',
  u'american',
  u'football',
  u'game',
  u'to',
  u'determine',
  u'the',
  u'champion',
  u'of',
  u'the',
  u'national',
  u'football',
  u'league',
  u'(',
  u'nfl',
  u')',
  u'for',
  u'the',
  u'2015',
  u'season',
  u'.',
  u'the',
  u'american',
  u'football',
  u'conference',
  u'(',
  u'afc',
  u')',
  u'champion',
  u'denver',
  u'broncos',
  u'defeated',
  u'the',
  u'national',
  u'football',
  u'conference',
  u'(',
  u'nfc',
  u')',
  u'champion',
  u'carolina',
  u'panthers',
  u'24\u201310',
  u'to',
  u'earn',
  u'their',
  u'third',
  u'super',
  u'bowl',
  u'title',
  u'.',
  u'the',
  u'game',
  u'was',
  u'played',
  u'on',
  u'february',
  u'7',
  u',',
  u'2016',
  u',',
  u'at',
  u'levi',
  u"'s",
  u'stadium',
  u'in',
  u'the',
  u'san',
  u'francisco',
  u'bay',
  u'area',
  u'at',
  u'

In [39]:
# Peek at the character-id encoding of the first dev example.
data_dev_char[0]

[[[1, 119, 104, 105, 99, 104, 2],
  [1, 110, 102, 108, 2],
  [1, 116, 101, 97, 109, 2],
  [1, 114, 101, 112, 114, 101, 115, 101, 110, 116, 101, 100, 2],
  [1, 116, 104, 101, 2],
  [1, 97, 102, 99, 2],
  [1, 97, 116, 2],
  [1, 115, 117, 112, 101, 114, 2],
  [1, 98, 111, 119, 108, 2],
  [1, 53, 48, 2],
  [1, 63, 2]],
 [[1, 115, 117, 112, 101, 114, 2],
  [1, 98, 111, 119, 108, 2],
  [1, 53, 48, 2],
  [1, 119, 97, 115, 2],
  [1, 97, 110, 2],
  [1, 97, 109, 101, 114, 105, 99, 97, 110, 2],
  [1, 102, 111, 111, 116, 98, 97, 108, 108, 2],
  [1, 103, 97, 109, 101, 2],
  [1, 116, 111, 2],
  [1, 100, 101, 116, 101, 114, 109, 105, 110, 101, 2],
  [1, 116, 104, 101, 2],
  [1, 99, 104, 97, 109, 112, 105, 111, 110, 2],
  [1, 111, 102, 2],
  [1, 116, 104, 101, 2],
  [1, 110, 97, 116, 105, 111, 110, 97, 108, 2],
  [1, 102, 111, 111, 116, 98, 97, 108, 108, 2],
  [1, 108, 101, 97, 103, 117, 101, 2],
  [1, 40, 2],
  [1, 110, 102, 108, 2],
  [1, 41, 2],
  [1, 102, 111, 114, 2],
  [1, 116, 104, 101, 2],
  [

In [52]:
# Alphabet characters ordered by their index.
sorted_chars = sorted(c_to_i, key=c_to_i.get)

In [55]:
# Store the alphabet one character per line.
# NOTE(review): if the alphabet contains line-break characters (the ASCII
# alphabet built in this notebook does), the file cannot be parsed back line
# by line — confirm how it is consumed.
with io.open('/pio/data/data/squad/train_charlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(ch + u'\n' for ch in sorted_chars)

In [42]:
# Persist the character-encoded training set. Binary mode keeps the pickle
# bytes independent of platform newline handling.
with open('/pio/data/data/squad/train_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_char, f)

In [43]:
# Persist the character-encoded dev set. Binary mode keeps the pickle bytes
# independent of platform newline handling.
with open('/pio/data/data/squad/dev_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_dev_char, f)

# SQuAD data with glove dictionary

### add unk to glove

In [77]:
# Load the cached GloVe vectors and word list, then reserve index 0 for
# '<unk>', whose vector is the mean of all GloVe vectors.
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_words.append(line.split()[0])

# The augmented arrays are saved back to these same paths later in this
# notebook, so only prepend '<unk>' when it is not already the first entry;
# otherwise a re-run would insert a second '<unk>' row and shift every index.
if not glove_words or glove_words[0] != '<unk>':
    glove_words.insert(0, '<unk>')
    glove_vec = np.vstack([glove_vec.mean(axis=0), glove_vec])

In [78]:
# Index <-> word lookups for GloVe; the word list itself serves as index -> word.
glove_i_to_w = glove_words
glove_w_to_i = dict((word, idx) for idx, word in enumerate(glove_words))

In [85]:
# Save the current GloVe vectors and word list back to the cache files,
# overwriting the previous versions.
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + u'\n' for word in glove_words)

### make train and dev set with glove dict

#### train

In [100]:
# train.pkl is written with pickle.dump in this notebook, so read it with
# pickle directly instead of relying on np.load's pickle fallback (which newer
# NumPy versions refuse unless allow_pickle=True is passed).
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    train_set = pickle.load(f)

In [101]:
# Contexts are stored as separate sentences; undo that. After this cell each
# example is [answers, question, flat_context].
for example in train_set:
    sequences = example[1]
    example.append(list(chain.from_iterable(sequences[1:])))
    example[1] = sequences[0]

In [103]:
# Re-index, in place, the question and the context of every example from the
# training vocabulary to the GloVe vocabulary; words unknown to GloVe map to 0.
# Iterating over the sequences directly avoids re-slicing `example[1:]` for
# every token and makes the in-place mutation explicit.
for example in train_set:
    for sequence in example[1:]:
        for pos, word_id in enumerate(sequence):
            sequence[pos] = glove_w_to_i.get(i_to_w[word_id], 0)

In [105]:
# Persist the GloVe-indexed training set. Binary mode keeps the pickle bytes
# independent of platform newline handling.
with open('/pio/data/data/squad/train_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(train_set, f)

#### dev

In [133]:
# done above

In [136]:
# train_with_glove_vocab.pkl is written with pickle.dump in this notebook, so
# read it with pickle directly instead of relying on np.load's pickle fallback
# (which newer NumPy versions refuse unless allow_pickle=True is passed).
with open('/pio/data/data/squad/train_with_glove_vocab.pkl', 'rb') as f:
    train_set = pickle.load(f)

In [138]:
# dev_with_glove_vocab.pkl is written with pickle.dump in this notebook, so
# read it with pickle directly instead of relying on np.load's pickle fallback
# (which newer NumPy versions refuse unless allow_pickle=True is passed).
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'rb') as f:
    dev_set = pickle.load(f)

In [139]:
# Contexts are stored as separate sentences; undo that. After this cell each
# example is [answers, question, flat_context].
for example in dev_set:
    sequences = example[1]
    example.append(list(chain.from_iterable(sequences[1:])))
    example[1] = sequences[0]

In [141]:
# Persist the flattened dev set. This overwrites the file `dev_set` is loaded
# from in this notebook. Binary mode keeps the pickle bytes independent of
# platform newline handling.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(dev_set, f)

### characters

In [None]:
# `data` should be in the state it has right after running the cell that initialises `words`

In [20]:
# Alphabet of all characters occurring in GloVe words, sorted, with the
# unknown-character placeholder at index 0.
glove_chars = ['<unk_char>'] + sorted(set(chain.from_iterable(glove_w_to_i)))

In [25]:
# Index <-> character lookups; the alphabet list itself serves as index -> char.
glove_i_to_c = glove_chars
glove_c_to_i = dict((ch, idx) for idx, ch in enumerate(glove_chars))

In [22]:
# Store the GloVe alphabet one entry per line.
with io.open('/pio/data/data/glove_vec/6B/glove.6B.charlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(ch + u'\n' for ch in glove_chars)

In [29]:
# Encode every word of the question and of the context as a list of ids from
# the GloVe alphabet; characters outside the alphabet map to 0.
data_char = []

for _, q, x in data:
    encoded = []
    for seq in (q, x):
        encoded.append([[glove_c_to_i.get(c, 0) for c in w] for w in seq])
    data_char.append(encoded)

In [30]:
# Encode every word of the dev question and context as a list of ids from the
# GloVe alphabet; characters outside the alphabet map to 0.
data_dev_char = []

for _, q, x, _ in data_dev:
    encoded = []
    for seq in (q, x):
        encoded.append([[glove_c_to_i.get(c, 0) for c in w] for w in seq])
    data_dev_char.append(encoded)

In [None]:
# TODO?
# here i wanted to add artificial tokens for beginning and end of a word (as in https://arxiv.org/abs/1508.06615)

In [33]:
# Persist the GloVe-alphabet character encoding of the training set. Binary
# mode keeps the pickle bytes independent of platform newline handling.
with open('/pio/data/data/squad/train_char_with_glove_alphabet.pkl', 'wb') as f:
    pickle.dump(data_char, f)

In [34]:
# Persist the GloVe-alphabet character encoding of the dev set. Binary mode
# keeps the pickle bytes independent of platform newline handling.
with open('/pio/data/data/squad/dev_char_with_glove_alphabet.pkl', 'wb') as f:
    pickle.dump(data_dev_char, f)