In [1]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain

### Read data

In [2]:
with io.open('/pio/data/data/squad/train-v1.1.json', 'r', encoding='utf-8') as f:
    train = json.load(f)

In [3]:
with io.open('/pio/data/data/squad/dev-v1.1.json', 'r', encoding='utf-8') as f:
    dev = json.load(f)

### Data structure

In [4]:
dev['data'][0]['paragraphs'][0]['qas'][0]['answers']

[{u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'}]

In [95]:
len(dev['data'][0]['paragraphs'][0]['qas'][0])

3

In [96]:
dev['data'][0]['paragraphs'][0]['qas'][0].keys()

[u'question', u'id', u'answers']

In [190]:
' '.join(nltk.word_tokenize(train['data'][10]['paragraphs'][60]['context'])).split(' . ')

[u"The State Council declared a three-day period of national mourning for the quake victims starting from May 19 , 2008 ; the PRC 's National Flag and Regional Flags of Hong Kong and Macau Special Administrative Regions flown at half mast",
 u'It was the first time that a national mourning period had been declared for something other than the death of a state leader , and many have called it the biggest display of mourning since the death of Mao Zedong',
 u'At 14:28 CST on May 19 , 2008 , a week after the earthquake , the Chinese public held a moment of silence',
 u'People stood silent for three minutes while air defense , police and fire sirens , and the horns of vehicles , vessels and trains sounded',
 u"Cars and trucks on Beijing 's roads also came to a halt",
 u"People spontaneously burst into cheering `` Zhongguo jiayou ! '' ( Let 's go , China ! ) and `` Sichuan jiayou '' ( Let 's go , Sichuan ! ) afterwards ."]

# Glove

### Save glove vectors as npy

In [37]:
# Parse every GloVe line into one float32 row. The first whitespace-separated
# field is the word itself and is skipped here (the word list is extracted
# in a separate cell).
glove_rows = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # float() parses to a Python double first, so the value is rounded to
        # float32 exactly once; this avoids the slow np.matrix string parser.
        glove_rows.append(np.array([float(v) for v in line.split()[1:]], dtype=np.float32))

# Plain 2-D float32 ndarray (one row per word) instead of an np.matrix.
glove_vec = np.vstack(glove_rows)

In [40]:
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

### Glove words

In [10]:
# Build the GloVe word list: the first whitespace-separated field of each line.
with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    glove_words = [line.split()[0] for line in f]

# Write one word per line, in the same order as the lines of the vector file.
with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as f:
    for word in glove_words:
        f.write(unicode(word + '\n'))

# Preprocess wikipedia negative examples

### Choose neg example for (almost) each question

In [64]:
# Reload the raw SQuAD training file.
with io.open('/pio/data/data/squad/train-v1.1.json', 'r', encoding='utf-8') as f:
    train = json.load(f)

# One [answer_text, question] pair per question, in file order. Only the text
# of the first listed answer is kept; a question without answers raises
# IndexError.
data_simple = []

for article in train['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            spans = [(ans['answer_start'], ans['text']) for ans in qa['answers']]
            first_answer_text = spans[0][1]
            data_simple.append([first_answer_text, qa['question']])

In [65]:
data_simple[0]

[u'Saint Bernadette Soubirous',
 u'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?']

In [59]:
wiki_pars = np.load('/pio/data/data/squad/wiki_negative_train/wiki_train_pars_prototype.pkl')

In [None]:
wiki_pars

In [169]:
data_neg = []

for i in xrange(len(data_simple)):
    a, q = data_simple[i]
    found = False
    for _, p, _ in wiki_pars[i]:
        p = p.decode('utf8')
        if a not in p:
            found = True
            data_neg.append([i, q, p])
            break
    if not found and wiki_pars[i]:
        print "No negative example for question", i

No negative example for question 191
No negative example for question 200
No negative example for question 458
No negative example for question 764
No negative example for question 1090
No negative example for question 1653
No negative example for question 2491
No negative example for question 2626
No negative example for question 2628
No negative example for question 2976
No negative example for question 2988
No negative example for question 3040
No negative example for question 3286
No negative example for question 4510
No negative example for question 4729
No negative example for question 5195
No negative example for question 5231
No negative example for question 5411
No negative example for question 5620
No negative example for question 5686
No negative example for question 5699
No negative example for question 6329
No negative example for question 6847
No negative example for question 7242
No negative example for question 7437
No negative example for question 7455
No negative exam

In [103]:
len(data_neg), len(data_simple)

(84882, 87599)

In [115]:
data_neg[672]

[698,
 u'Beyonce was coached for her Spanish songs by which American?',
 u"Just 2 weeks into the American Civil War ,  Alton played an important part in the infamous Camp Jackson Affair ,  which in large part led to the eviction of Missouri Governor Claiborne Fox Jackson from office .  The State of Missouri's nominal neutrality was tested in a conflict over the St .  Louis Arsenal .  The Federal Government reinforced the Arsenal's tiny garrison with several detachments ,  most notably a force from the 2nd Infantry under Captain Nathaniel Lyon .  Concerned by widespread reports that Governor Jackson intended to use the Missouri Volunteer Militia to attack the Arsenal  ( and capture its 39 , 000 small arms )  ,  Secretary of War Simon Cameron ordered Lyon  ( by that time in acting command )  to evacuate the majority of the munitions to Illinois .  21 , 000 guns were secretly evacuated to Alton ,  IL on the evening of April 29 ,  1861 ."]

In [110]:
data_simple[698]

[u'Rudy Perez',
 u'Beyonce was coached for her Spanish songs by which American?']

In [143]:
with open('/pio/data/data/squad/wiki_negative_train/negative_paragraphs.pkl', 'w') as f:
    pickle.dump(data_neg, f)

### Add not_a_word token to existing pkls 

In [116]:
train_with_glove_vocab = np.load('/pio/data/data/squad/train_with_glove_vocab.pkl')
train_bin_feats = np.load('/pio/data/data/squad/train_bin_feats.pkl')
train_char_ascii = np.load('/pio/data/data/squad/train_char_ascii.pkl')

In [138]:
# Ids used for the extra "not a word" entry appended to every training sample.
# NOTE(review): 400001 is presumably an index just past the GloVe vocabulary — confirm.
not_a_word_Word = 400001
# Char id 3; the negative-example cells encode the same token as [1, 3, 2].
not_a_word_Char = 3

# Appends in place to the loaded pickles, so running this cell twice adds the
# entry twice. The loop length comes from data_simple, which is assumed to be
# aligned with the three loaded lists — TODO confirm.
for i in xrange(len(data_simple)):
    train_with_glove_vocab[i][2].append(not_a_word_Word)
    train_bin_feats[i].append(False)
    train_char_ascii[i][1].append([1, not_a_word_Char, 2])

In [141]:
train_bin_feats[0]

[False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 Fa

In [142]:
# Save the training pickles after the extra "not a word" entry was appended.
# NOTE(review): the first two paths use a `train_neg_` prefix and are the same
# files that the "Create pkls for negative examples" cells write with the
# Wikipedia negatives, so whichever cell runs last wins — confirm the intended
# file names. The third file is named `train_char_ascii.pkl`, without the prefix.
with open('/pio/data/data/squad/wiki_negative_train/train_neg_with_glove_vocab.pkl', 'w') as f:
    pickle.dump(train_with_glove_vocab, f)
    
with open('/pio/data/data/squad/wiki_negative_train/train_neg_bin_feats.pkl', 'w') as f:
    pickle.dump(train_bin_feats, f)
    
with open('/pio/data/data/squad/wiki_negative_train/train_char_ascii.pkl', 'w') as f:
    pickle.dump(train_char_ascii, f)

### Add not_a_word to dev set

In [258]:
not_a_word = u'<not_a_word>'

In [257]:
dev_main = np.load('/pio/data/data/squad/dev.pkl')
dev_ascii = np.load('/pio/data/data/squad/dev_char_ascii.pkl')
dev_bin = np.load('/pio/data/data/squad/dev_bin_feats.pkl')
dev_set = np.load('/pio/data/data/squad/dev_with_glove_vocab.pkl')

In [259]:
# Give every sample its own shallow copy of the context token list before the
# next cell appends to it.
# NOTE(review): presumably questions of one paragraph share a single context
# list object in dev.pkl (the dev preprocessing cell stores the same
# `context_tok` for each question), so without the copy the token would be
# appended once per question — confirm.
for i in xrange(len(dev_main)):
    dev_main[i][2] = dev_main[i][2][:]

In [260]:
# Append the "not a word" entry to all four dev representations, in place:
# the token string, its word id, a False binary feature and the char sequence
# [1, not_a_word_Char, 2]. Not idempotent: re-running appends again.
# `not_a_word_Word` and `not_a_word_Char` come from the training-set cell.
for i in xrange(len(dev_main)):
    dev_main[i][2].append(not_a_word)
    dev_set[i][2].append(not_a_word_Word)
    dev_bin[i].append(False)
    dev_ascii[i][1].append([1, not_a_word_Char, 2])

In [267]:
with open('/pio/data/data/squad/wiki_negative_dev/dev.pkl', 'w') as f:
    pickle.dump(dev_main, f)
    
with open('/pio/data/data/squad/wiki_negative_dev/dev_char_ascii.pkl', 'w') as f:
    pickle.dump(dev_ascii, f)

with open('/pio/data/data/squad/wiki_negative_dev/dev_bin_feats.pkl', 'w') as f:
    pickle.dump(dev_bin, f)
    
with open('/pio/data/data/squad/wiki_negative_dev/dev_with_glove_vocab.pkl', 'w') as f:
    pickle.dump(dev_set, f)

In [222]:
dev_main[0][2]

[u'super',
 u'bowl',
 u'50',
 u'was',
 u'an',
 u'american',
 u'football',
 u'game',
 u'to',
 u'determine',
 u'the',
 u'champion',
 u'of',
 u'the',
 u'national',
 u'football',
 u'league',
 u'(',
 u'nfl',
 u')',
 u'for',
 u'the',
 u'2015',
 u'season',
 u'.',
 u'the',
 u'american',
 u'football',
 u'conference',
 u'(',
 u'afc',
 u')',
 u'champion',
 u'denver',
 u'broncos',
 u'defeated',
 u'the',
 u'national',
 u'football',
 u'conference',
 u'(',
 u'nfc',
 u')',
 u'champion',
 u'carolina',
 u'panthers',
 u'24\u201310',
 u'to',
 u'earn',
 u'their',
 u'third',
 u'super',
 u'bowl',
 u'title',
 u'.',
 u'the',
 u'game',
 u'was',
 u'played',
 u'on',
 u'february',
 u'7',
 u',',
 u'2016',
 u',',
 u'at',
 u'levi',
 u"'s",
 u'stadium',
 u'in',
 u'the',
 u'san',
 u'francisco',
 u'bay',
 u'area',
 u'at',
 u'santa',
 u'clara',
 u',',
 u'california',
 u'.',
 u'as',
 u'this',
 u'was',
 u'the',
 u'50th',
 u'super',
 u'bowl',
 u',',
 u'the',
 u'league',
 u'emphasized',
 u'the',
 u'``',
 u'golden',
 u'annive

### Create pkls for negative examples

In [146]:
from nltk import word_tokenize

In [171]:
# Tokenize and lower-case the question (slot 1) and paragraph (slot 2) of each
# negative sample in place. `lower` is the lambda defined in the training-set
# preprocessing cell. Meant to run once, on the raw strings.
for sample in data_neg:
    sample[1] = map(lower, word_tokenize(sample[1]))
    sample[2] = map(lower, word_tokenize(sample[2]))

In [172]:
# Terminate every negative paragraph with the not_a_word token (in place).
for sample in data_neg:
    sample[2].append(not_a_word)

In [173]:
# Replace slot 0 by the answer indices: a single one-element span that points
# at the last token of the paragraph.
for sample in data_neg:
    last_position = len(sample[2]) - 1
    sample[0] = [[last_position]]

In [174]:
# Binary "token also occurs in the question" feature for every paragraph token
# of every negative sample.
train_wiki_neg_bin_feats = [make_bin_feats([q, c]) for _, q, c in data_neg]

In [179]:
# ASCII character table: a character's id is its code point (0..127).
chars = [unichr(code) for code in xrange(128)]
i_to_c = chars
c_to_i = dict((ch, code) for code, ch in enumerate(chars))


def _encode_word(word):
    """Return [1] + per-character ids (0 for non-ASCII) + [2] for one token."""
    return [1] + [c_to_i.get(ch, 0) for ch in word] + [2]


train_wiki_neg_ascii_chars = []

for _, question, context in data_neg:
    question_chars = [_encode_word(word) for word in question]
    # The last context token is encoded as the fixed sequence [1, 3, 2]
    # instead of being spelled out character by character.
    context_chars = [_encode_word(word) for word in context[:-1]]
    context_chars.append([1, 3, 2])
    train_wiki_neg_ascii_chars.append([question_chars, context_chars])

In [192]:
with open('/pio/data/data/squad/wiki_negative_train/train_neg_bin_feats.pkl', 'w') as f:
    pickle.dump(train_wiki_neg_bin_feats, f)
    
with open('/pio/data/data/squad/wiki_negative_train/train_neg_char_ascii.pkl', 'w') as f:
    pickle.dump(train_wiki_neg_ascii_chars, f)

In [204]:
# Map question and context tokens of every negative sample to GloVe ids;
# tokens missing from glove_w_to_i become 0. Slot 0 is passed through as is.
# NOTE(review): the trailing not_a_word token of each context goes through the
# same lookup, so it becomes 0 unless it is present in glove_w_to_i, whereas
# the positive training data appends `not_a_word_Word` — confirm this is intended.
train_neg_with_glove_vocab = [
    [
        answer_inds,
        [glove_w_to_i.get(token, 0) for token in question],
        [glove_w_to_i.get(token, 0) for token in context],
    ]
    for answer_inds, question, context in data_neg
]

In [206]:
with open('/pio/data/data/squad/wiki_negative_train/train_neg_with_glove_vocab.pkl', 'w') as f:
    pickle.dump(train_neg_with_glove_vocab, f)

In [207]:
a = [len(w[2]) for w in train_neg_with_glove_vocab]

In [210]:
sum(1 for i in a if i > 400)

875

In [2]:
train_wiki_neg_bin_feats = np.load('/pio/data/data/squad/wiki_negative_train/train_neg_bin_feats.pkl')
train_wiki_neg_ascii_chars = np.load('/pio/data/data/squad/wiki_negative_train/train_neg_char_ascii.pkl')
train_neg_with_glove_vocab = np.load('/pio/data/data/squad/wiki_negative_train/train_neg_with_glove_vocab.pkl')

In [5]:
max(len(y) for x in train_wiki_neg_ascii_chars for y in x[1])

146

In [6]:
# Indices of samples whose context contains at least one word encoded with
# more than 35 char ids (question words are not checked).
too_long = [
    sample_idx
    for sample_idx, (_, context_chars) in enumerate(train_wiki_neg_ascii_chars)
    if any(len(word_chars) > 35 for word_chars in context_chars)
]

In [8]:
# Drop the samples listed in `too_long` from all three parallel lists.
train_wiki_neg_bin_feats_cut = []
train_wiki_neg_ascii_chars_cut = []
train_neg_with_glove_vocab_cut = []

# Set lookup instead of scanning the `too_long` list for every sample.
too_long_set = set(too_long)

for i in xrange(len(train_neg_with_glove_vocab)):
    if i not in too_long_set:
        train_wiki_neg_bin_feats_cut.append(train_wiki_neg_bin_feats[i])
        train_wiki_neg_ascii_chars_cut.append(train_wiki_neg_ascii_chars[i])
        train_neg_with_glove_vocab_cut.append(train_neg_with_glove_vocab[i])

In [9]:
max(len(y) for x in train_wiki_neg_ascii_chars_cut for y in x[1])

34

In [10]:
len(train_neg_with_glove_vocab_cut)

84833

In [11]:
# Overwrite the three negative-example pickles with the filtered `*_cut`
# lists. These are the same paths the unfiltered versions were saved to
# earlier, so the unfiltered files are lost after this cell runs.
with open('/pio/data/data/squad/wiki_negative_train/train_neg_bin_feats.pkl', 'w') as f:
    pickle.dump(train_wiki_neg_bin_feats_cut, f)
    
with open('/pio/data/data/squad/wiki_negative_train/train_neg_char_ascii.pkl', 'w') as f:
    pickle.dump(train_wiki_neg_ascii_chars_cut, f)
    
with open('/pio/data/data/squad/wiki_negative_train/train_neg_with_glove_vocab.pkl', 'w') as f:
    pickle.dump(train_neg_with_glove_vocab_cut, f)

# Preprocess training set

### Grab all the question-answer pairs and create a wordlist

In [3]:
# Flatten the SQuAD training set into one [answers, question_tok, context_tok]
# entry per question and collect the lower-cased vocabulary in `words`.
words = set()
data = []
lower = lambda x: x.lower()

for article in train['data']:
    title = article['title']  # read but not used below

    for paragraph in article['paragraphs']:
        # Tokenized once per paragraph; the same list object is stored in
        # every question entry of this paragraph.
        context_tok = [lower(tok) for tok in nltk.word_tokenize(paragraph['context'])]
        words.update(context_tok)

        for qa in paragraph['qas']:
            question_tok = [lower(tok) for tok in nltk.word_tokenize(qa['question'])]
            words.update(question_tok)

            Id = qa['id']  # read but not stored for the training set

            # (character offset, tokenized answer text) for each answer.
            answers = [
                (ans['answer_start'], [lower(tok) for tok in nltk.word_tokenize(ans['text'])])
                for ans in qa['answers']
            ]

            data.append([answers, question_tok, context_tok])

words.add('<unk>')

In [5]:
with open('/pio/data/data/squad/train.pkl', 'w') as f:
    pickle.dump(data, f)

In [6]:
data[0]

[[(515, [u'saint', u'bernadette', u'soubirous'])],
 [u'to',
  u'whom',
  u'did',
  u'the',
  u'virgin',
  u'mary',
  u'allegedly',
  u'appear',
  u'in',
  u'1858',
  u'in',
  u'lourdes',
  u'france',
  u'?'],
 [u'architecturally',
  u',',
  u'the',
  u'school',
  u'has',
  u'a',
  u'catholic',
  u'character',
  u'.',
  u'atop',
  u'the',
  u'main',
  u'building',
  u"'s",
  u'gold',
  u'dome',
  u'is',
  u'a',
  u'golden',
  u'statue',
  u'of',
  u'the',
  u'virgin',
  u'mary',
  u'.',
  u'immediately',
  u'in',
  u'front',
  u'of',
  u'the',
  u'main',
  u'building',
  u'and',
  u'facing',
  u'it',
  u',',
  u'is',
  u'a',
  u'copper',
  u'statue',
  u'of',
  u'christ',
  u'with',
  u'arms',
  u'upraised',
  u'with',
  u'the',
  u'legend',
  u'``',
  u'venite',
  u'ad',
  u'me',
  u'omnes',
  u"''",
  u'.',
  u'next',
  u'to',
  u'the',
  u'main',
  u'building',
  u'is',
  u'the',
  u'basilica',
  u'of',
  u'the',
  u'sacred',
  u'heart',
  u'.',
  u'immediately',
  u'behind',
  u'the

### Make binary features

In [164]:
def make_bin_feats(sample):
    """Flag which context tokens also occur in the question.

    `sample` is a pair (question tokens, context tokens). Returns a list of
    booleans, one per context token, True when the token is in the question.
    """
    question, context = sample
    question_vocab = frozenset(question)
    flags = []
    for token in context:
        flags.append(token in question_vocab)
    return flags

In [8]:
data = [d[1:] for d in data]

In [16]:
bin_feats = map(make_bin_feats, data)

In [23]:
with open('/pio/data/data/squad/train_bin_feats.pkl', 'w') as f:
    pickle.dump(bin_feats, f)

In [33]:
data_dev = [d[1:3] for d in data_dev]

In [35]:
dev_bin_feats = map(make_bin_feats, data_dev)

In [37]:
with open('/pio/data/data/squad/dev_bin_feats.pkl', 'w') as f:
    pickle.dump(dev_bin_feats, f)

### Turn words into numbers

In [7]:
i_to_w = dict(enumerate(words))
w_to_i = {v:k for (k,v) in i_to_w.items()}

In [8]:
def split_on_dot(s):
    """Split a token sequence into sentences, each ending with its u'.' token.

    Tokens after the last dot form a final sentence without a terminator.
    No empty sentence is ever returned, so an empty input gives [].
    """
    sentences = []
    current = []
    for token in s:
        current.append(token)
        if token == u'.':
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

def words_to_num(s):
    """Map each token of `s` to its id in w_to_i, using the '<unk>' id for unknown tokens."""
    return [w_to_i.get(token, w_to_i['<unk>']) for token in s]

In [9]:
# Replace the flat context token list (slot 2) of every sample by a list of
# sentences. Meant to run once: it expects slot 2 to still be flat.
for sample in data:
    sample[2] = split_on_dot(sample[2])

In [10]:
data[0]

[[(515, [u'saint', u'bernadette', u'soubirous'])],
 [u'to',
  u'whom',
  u'did',
  u'the',
  u'virgin',
  u'mary',
  u'allegedly',
  u'appear',
  u'in',
  u'1858',
  u'in',
  u'lourdes',
  u'france',
  u'?'],
 [[u'architecturally',
   u',',
   u'the',
   u'school',
   u'has',
   u'a',
   u'catholic',
   u'character',
   u'.'],
  [u'atop',
   u'the',
   u'main',
   u'building',
   u"'s",
   u'gold',
   u'dome',
   u'is',
   u'a',
   u'golden',
   u'statue',
   u'of',
   u'the',
   u'virgin',
   u'mary',
   u'.'],
  [u'immediately',
   u'in',
   u'front',
   u'of',
   u'the',
   u'main',
   u'building',
   u'and',
   u'facing',
   u'it',
   u',',
   u'is',
   u'a',
   u'copper',
   u'statue',
   u'of',
   u'christ',
   u'with',
   u'arms',
   u'upraised',
   u'with',
   u'the',
   u'legend',
   u'``',
   u'venite',
   u'ad',
   u'me',
   u'omnes',
   u"''",
   u'.'],
  [u'next',
   u'to',
   u'the',
   u'main',
   u'building',
   u'is',
   u'the',
   u'basilica',
   u'of',
   u'the',
   

In [11]:
# Numeric copy of `data`: the character offset of each answer is kept, and
# answer tokens, question tokens and every context sentence are mapped to ids.
data_num = [
    [
        [(ans[0], words_to_num(ans[1])) for ans in answer_list],
        words_to_num(question),
        [words_to_num(sentence) for sentence in sentences],
    ]
    for answer_list, question, sentences in data
]

In [13]:
data_num[0]

[[(515, [1871, 11246, 70514])],
 [78116,
  49985,
  45432,
  67467,
  86834,
  71625,
  58400,
  29461,
  93998,
  55583,
  93998,
  72050,
  96784,
  82775],
 [[100416, 44790, 67467, 60466, 604, 43144, 88878, 32471, 488],
  [40539,
   67467,
   45649,
   54124,
   55580,
   78499,
   78216,
   19464,
   43144,
   51144,
   10775,
   23654,
   67467,
   86834,
   71625,
   488],
  [82731,
   93998,
   95282,
   23654,
   67467,
   45649,
   54124,
   49507,
   33330,
   19467,
   44790,
   19464,
   43144,
   84794,
   10775,
   23654,
   99211,
   1477,
   95970,
   37322,
   1477,
   67467,
   78193,
   64615,
   100635,
   14055,
   32694,
   66309,
   77274,
   488],
  [19356,
   78116,
   67467,
   45649,
   54124,
   19464,
   67467,
   32617,
   23654,
   67467,
   7958,
   80613,
   488],
  [82731,
   71360,
   67467,
   32617,
   19464,
   67467,
   16841,
   44790,
   43144,
   50115,
   87195,
   23654,
   54900,
   49507,
   18731,
   488],
  [19467,
   19464,
   43144,
   

In [14]:
# Reshape every sample to [answer id sequences, [question] + context sentences]:
# the character offsets are dropped and the question becomes sentence 0.
data_num = [
    [[answer[1] for answer in sample[0]], [sample[1]] + sample[2]]
    for sample in data_num
]

In [15]:
data_num[0]

[[[1871, 11246, 70514]],
 [[78116,
   49985,
   45432,
   67467,
   86834,
   71625,
   58400,
   29461,
   93998,
   55583,
   93998,
   72050,
   96784,
   82775],
  [100416, 44790, 67467, 60466, 604, 43144, 88878, 32471, 488],
  [40539,
   67467,
   45649,
   54124,
   55580,
   78499,
   78216,
   19464,
   43144,
   51144,
   10775,
   23654,
   67467,
   86834,
   71625,
   488],
  [82731,
   93998,
   95282,
   23654,
   67467,
   45649,
   54124,
   49507,
   33330,
   19467,
   44790,
   19464,
   43144,
   84794,
   10775,
   23654,
   99211,
   1477,
   95970,
   37322,
   1477,
   67467,
   78193,
   64615,
   100635,
   14055,
   32694,
   66309,
   77274,
   488],
  [19356,
   78116,
   67467,
   45649,
   54124,
   19464,
   67467,
   32617,
   23654,
   67467,
   7958,
   80613,
   488],
  [82731,
   71360,
   67467,
   32617,
   19464,
   67467,
   16841,
   44790,
   43144,
   50115,
   87195,
   23654,
   54900,
   49507,
   18731,
   488],
  [19467,
   19464,
   431

In [13]:
# There are some broken answers because of the tokenizer (words are matched
# instead of characters): count the tokens of the first answer of each sample
# that do not occur anywhere in the context sentences (q[0] is the question).

k = 0
for a, q in data_num:
    # Build the context vocabulary once per sample instead of re-flattening
    # the sentences for every answer token.
    context_ids = set(chain(*q[1:]))
    for w in a[0]:
        if w not in context_ids:
            k += 1
k

1028

In [18]:
a,q = data_num[0]
a

[[1871, 11246, 70514]]

### Find answer indices on words, not characters

In [14]:
# For every sample, locate each answer (a sequence of word ids) in the
# flattened context and record the word positions of its first occurrence.
# Answers that are not found contribute nothing, so a sample can end up with
# fewer spans than answers.
inds = []

for answer_seqs, sentences in data_num:
    context_flat = list(chain(*sentences[1:]))  # sentences[0] is the question
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        start = next(
            (pos for pos in xrange(len(context_flat))
             if context_flat[pos:pos + width] == seq),
            None,
        )
        if start is not None:
            spans.append(list(xrange(start, start + width)))
    inds.append(spans)

In [15]:
for i in xrange(len(data_num)):
    data_num[i][0] = inds[i]

In [56]:
data[0]

[[(515, [u'saint', u'bernadette', u'soubirous'])],
 [u'to',
  u'whom',
  u'did',
  u'the',
  u'virgin',
  u'mary',
  u'allegedly',
  u'appear',
  u'in',
  u'1858',
  u'in',
  u'lourdes',
  u'france',
  u'?'],
 [u'architecturally',
  u',',
  u'the',
  u'school',
  u'has',
  u'a',
  u'catholic',
  u'character',
  u'.',
  u'atop',
  u'the',
  u'main',
  u'building',
  u"'s",
  u'gold',
  u'dome',
  u'is',
  u'a',
  u'golden',
  u'statue',
  u'of',
  u'the',
  u'virgin',
  u'mary',
  u'.',
  u'immediately',
  u'in',
  u'front',
  u'of',
  u'the',
  u'main',
  u'building',
  u'and',
  u'facing',
  u'it',
  u',',
  u'is',
  u'a',
  u'copper',
  u'statue',
  u'of',
  u'christ',
  u'with',
  u'arms',
  u'upraised',
  u'with',
  u'the',
  u'legend',
  u'``',
  u'venite',
  u'ad',
  u'me',
  u'omnes',
  u"''",
  u'.',
  u'next',
  u'to',
  u'the',
  u'main',
  u'building',
  u'is',
  u'the',
  u'basilica',
  u'of',
  u'the',
  u'sacred',
  u'heart',
  u'.',
  u'immediately',
  u'behind',
  u'the

# Predictions

In [45]:
preds = np.load('evaluate/glove_vocab/pred_glove_premadeBin_charemb_all_fixed_ep3.npz')['arr_0']

In [46]:
# Write the predictions as one JSON object {question id: answer text}.
# preds[i] holds the inclusive (first, last) token positions of the predicted
# span inside data_dev[i][2]; data_dev[i][3] is the question id.
with io.open('evaluate/glove_vocab/pred_glove_premadeBin_charemb_all_fixed_ep3.txt', 'w', encoding='utf-8') as f:
    f.write(u'{')
    for i in xrange(len(data_dev)):
        ans = u' '.join(data_dev[i][2][preds[i][0]:preds[i][1] + 1])
        Id = data_dev[i][3]
        # json.dumps adds the quotes and escapes '"', '\\' and control
        # characters, so the file stays valid JSON whatever the tokens contain.
        f.write(u'{}: {}'.format(json.dumps(Id), json.dumps(ans, ensure_ascii=False)))
        if i < len(data_dev) - 1:
            f.write(u', ')
    f.write(u'}')

# Preprocess dev set

In [22]:
# Flatten the SQuAD dev set into one [answers, question_tok, context_tok, id]
# entry per question and collect the lower-cased vocabulary in `words_dev`.
words_dev = set()
data_dev = []
lower = lambda x: x.lower()

for article in dev['data']:
    title = article['title']  # read but not used below

    for paragraph in article['paragraphs']:
        # Tokenized once per paragraph; the same list object is stored in
        # every question entry of this paragraph.
        context_tok = [lower(tok) for tok in nltk.word_tokenize(paragraph['context'])]
        words_dev.update(context_tok)

        for qa in paragraph['qas']:
            question_tok = [lower(tok) for tok in nltk.word_tokenize(qa['question'])]
            words_dev.update(question_tok)

            Id = qa['id']

            # (character offset, tokenized answer text) for each answer.
            answers = [
                (ans['answer_start'], [lower(tok) for tok in nltk.word_tokenize(ans['text'])])
                for ans in qa['answers']
            ]

            data_dev.append([answers, question_tok, context_tok, Id])

words_dev.add('<unk>')

In [34]:
data_dev[3353]

[[(81, [u'90']), (81, [u'90', u'%']), (81, [u'90', u'%'])],
 [u'what',
  u'percentage',
  u'of',
  u'electricity',
  u'was',
  u'made',
  u'by',
  u'steam',
  u'turbine',
  u'in',
  u'the',
  u'1990s',
  u'?'],
 [u'the',
  u'main',
  u'use',
  u'for',
  u'steam',
  u'turbines',
  u'is',
  u'in',
  u'electricity',
  u'generation',
  u'(',
  u'in',
  u'the',
  u'1990s',
  u'about',
  u'90',
  u'%',
  u'of',
  u'the',
  u'world',
  u"'s",
  u'electric',
  u'production',
  u'was',
  u'by',
  u'use',
  u'of',
  u'steam',
  u'turbines',
  u')',
  u'however',
  u'the',
  u'recent',
  u'widespread',
  u'application',
  u'of',
  u'large',
  u'gas',
  u'turbine',
  u'units',
  u'and',
  u'typical',
  u'combined',
  u'cycle',
  u'power',
  u'plants',
  u'has',
  u'resulted',
  u'in',
  u'reduction',
  u'of',
  u'this',
  u'percentage',
  u'to',
  u'the',
  u'80',
  u'%',
  u'regime',
  u'for',
  u'steam',
  u'turbines',
  u'.',
  u'in',
  u'electricity',
  u'production',
  u',',
  u'the',
  u'hig

In [43]:
with open('/pio/data/data/squad/dev.pkl', 'w') as f:
    pickle.dump(data_dev, f)

In [44]:
data_dev = np.load('/pio/data/data/squad/dev.pkl')

In [16]:
for i in xrange(len(data_dev)):
    data_dev[i][2] = [w if w in words else '<unk>' for w in data_dev[i][2]]

In [122]:
for i in xrange(len(data_dev)):
    data_dev[i][2] = split_on_dot(data_dev[i][2])

In [123]:
def words_to_num(s):
    """Map each token of `s` to its GloVe id, using 0 for tokens not in glove_w_to_i.

    This definition replaces the earlier words_to_num that used the training
    vocabulary.
    NOTE(review): 0 is also the id of the first entry of glove_words, so
    out-of-vocabulary tokens are indistinguishable from that word — confirm.
    """
    return [glove_w_to_i.get(token, 0) for token in s]

In [125]:
# Numeric copy of `data_dev`: the character offset of each answer is kept, and
# answer tokens, question tokens and every context sentence are mapped to ids.
# The question id in slot 3 is dropped.
data_num_dev = [
    [
        [(ans[0], words_to_num(ans[1])) for ans in answer_list],
        words_to_num(question),
        [words_to_num(sentence) for sentence in sentences],
    ]
    for answer_list, question, sentences, _ in data_dev
]

In [126]:
# Reshape every sample to [answer id sequences, [question] + context sentences]:
# the character offsets are dropped and the question becomes sentence 0.
data_num_dev = [
    [[answer[1] for answer in sample[0]], [sample[1]] + sample[2]]
    for sample in data_num_dev
]

In [127]:
# Locate each dev answer (a sequence of word ids) in the flattened context and
# record the word positions of its first occurrence. Answers that are not
# found contribute nothing.
inds = []

for answer_seqs, sentences in data_num_dev:
    context_flat = list(chain(*sentences[1:]))  # sentences[0] is the question
    spans = []
    for seq in answer_seqs:
        width = len(seq)
        start = next(
            (pos for pos in xrange(len(context_flat))
             if context_flat[pos:pos + width] == seq),
            None,
        )
        if start is not None:
            spans.append(list(xrange(start, start + width)))
    inds.append(spans)

# Replace the answer id sequences (slot 0) by the word-position spans.
for sample_idx in xrange(len(data_num_dev)):
    data_num_dev[sample_idx][0] = inds[sample_idx]

In [49]:
# data_num_dev = [[d[0]] + map(words_to_num, d[1:]) for d in data_dev]

In [132]:
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'w') as f:
    pickle.dump(data_num_dev, f)

### Get Glove vectors for words in data

In [195]:
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove/glove.6B.300d.npy')

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_words.append(line.split()[0])

In [196]:
# id -> word is the word list itself; word -> id is its inverse (if a word
# were listed twice, its last position would win).
glove_i_to_w = glove_words
glove_w_to_i = dict((word, idx) for idx, word in enumerate(glove_words))

In [68]:
embs = np.zeros((len(w_to_i), 300), dtype=np.float32)
embs.shape

(102802, 300)

In [24]:
known_inds = [i for i in xrange(len(w_to_i)) if i_to_w[i] in glove_w_to_i]
len(known_inds)

73351

In [25]:
s_known = set(known_inds)
unknown_inds = [i for i in xrange(len(w_to_i)) if i not in s_known]
s_unknown = set(unknown_inds)

In [26]:
dev_num = np.load('/pio/data/data/squad/dev_with_training_vocab.pkl')

In [52]:
train_num = np.load('/pio/data/data/squad/train.pkl')

In [54]:
not_in_dev = words - words_dev

In [28]:
w_to_i['<unk>']

4445

In [59]:
# Replace, in place, every training word id whose word never occurs in the dev
# set by the id of '<unk>'. The id is looked up in the same vocabulary that
# i_to_w decodes, instead of being hard-coded as 4445.
unk_id = w_to_i['<unk>']

for sample in train_num:
    for sentence in sample[1]:
        for wi, w in enumerate(sentence):
            if i_to_w[w] in not_in_dev:
                sentence[wi] = unk_id

In [62]:
with open('/pio/data/data/squad/train_with_unks.pkl', 'w') as f:
    pickle.dump(train_num, f)

In [78]:
# Rows of words that exist in GloVe are filled with that word's GloVe vector.
embs[known_inds] = glove_vec[[glove_w_to_i[i_to_w[i]] for i in known_inds]]
# The remaining rows are filled by a Normal initializer of shape (n_unknown, 300).
# NOTE(review): `L` is not imported in any visible cell — presumably the
# lasagne package; confirm and add the import.
embs[unknown_inds] = L.init.Normal()((len(unknown_inds), 300))

## Some stats

In [55]:
len([w for d in data_dev for w in list(chain(*d[1:3])) if w in w_to_i])

1569040

In [36]:
# percentage of <unk> in dev
51889. / 1620929

0.03201188947819429

In [49]:
# % of dev set in train vocabulary
1569040. / 1620929

0.9679881105218057

In [47]:
# % of dev set in glove
1600564. / 1620929

0.9874362171322741

In [46]:
# <unk> in dev questions
2632. / 120950

0.021761058288548987

In [58]:
# no-devs in train
1070257. / 13061165

0.08194192478236054

In [40]:
# no-gloves in train
164757. / 13061165

0.01261426526653633

### Save processed data

In [165]:
# Vocabulary words ordered by their id.
sorted_words = sorted(w_to_i, key=w_to_i.get)

In [166]:
# Write the vocabulary in `sorted_words`, one word per line.
# NOTE(review): a later cell rebuilds w_to_i from `train_wordlist.txt` in the
# same directory, not from `wordlist.txt` — confirm which file name is intended.
with io.open('/pio/data/data/squad/wordlist.txt', 'w', encoding='utf-8') as f:
    for w in sorted_words:
        f.write(unicode(w + '\n'))

In [170]:
# This file has a lot of redundant parts, context is repeated for each question.
# It only slows down the initial loading.

with open('/pio/data/data/squad/train.pkl', 'w') as f:
    pickle.dump(data_num, f)

In [2]:
data = np.load('/pio/data/data/squad/train.pkl')

In [12]:
# Rebuild the vocabulary from the saved word list: the id of a word is its
# zero-based line number.
w_to_i = {}

with io.open('/pio/data/data/squad/train_wordlist.txt', 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        # Strip only the line terminator. Slicing off the last character would
        # truncate the final word when the file does not end with a newline.
        w_to_i[line.rstrip(u'\n')] = idx

i_to_w = {v:k for (k,v) in w_to_i.items()}

In [13]:
lens = np.array(map(lambda x: len(x[1]), data))

In [27]:
def show_data(idx):
    """Print example `idx` of the global `data` as text, one sequence per line.

    Every sequence in ``data[idx][1]`` is a list of word indices; each index is
    mapped back to its word through the global ``i_to_w`` and the words are
    joined with single spaces.
    """
    sequences = data[idx][1]
    for seq in sequences:
        words_in_seq = [i_to_w[token] for token in seq]
        print(' '.join(words_in_seq))

In [28]:
# Print one example as text to eyeball it (in the output below the first line is
# the question, the following lines are the context sentences).
show_data(60023)

what is the largest hottest continuously large area worldwide ?
the sky is usually clear above the desert and the sunshine duration is extremely high everywhere in the sahara .
most of the desert enjoys more than 3,600 h of bright sunshine annually or over 82 % of the time and a wide area in the eastern part experiences in excess of 4,000 h of bright sunshine a year or over 91 % of the time , and the highest values are very close to the theoretical maximum value .
a value of 4,300 h or 98 % of the time would be recorded in upper egypt ( aswan , luxor ) and in the nubian desert ( wadi halfa ) .
the annual average direct solar irradiation is around 2,800 kwh/ ( m2 year ) in the great desert .
the sahara has a huge potential for solar energy production .
the constantly high position of the sun , the extremely low relative humidity , the lack of vegetation and rainfall make the great desert the hottest continuously large area worldwide and certainly the hottest place on earth during summer

In [14]:
# Largest per-example sequence count, then the histogram of those counts
# (position k of the result = number of examples with k sequences).
print(lens.max())
np.bincount(lens)

28


array([    0,     0,  1665,  6015, 12958, 18663, 16891, 12176,  7727,
        4709,  2711,  1677,   859,   636,   352,   223,   166,    60,
          39,    24,     9,     0,     5,    19,     5,     0,     0,
           0,    10])

## Fun with characters

In [None]:
# i_to_c = list(glove_chars)
# c_to_i = {v:k for (k,v) in i_to_c.items()}

In [None]:
# `data` should be in the state it has right after running the cell in which `words` is initialised

In [18]:
# chars = {c for d in data for s in d[1:] for w in s for c in w}

In [46]:
# chars_dev = {c for d in data_dev for s in d[1:3] for w in s for c in w}

In [42]:
# chars.add('<unk>')

In [37]:
# 0 - unk
# 1 - start
# 2 - end
# 3 - not_a_word char (added later, in wikipedia negative samplesF)
# there are no 1s or 2s in data, so these are safe

chars = [unichr(i) for i in xrange(128)]

In [38]:
# index -> char is the list itself; char -> index is its inverse.
i_to_c = chars
c_to_i = dict((ch, pos) for pos, ch in enumerate(chars))

In [24]:
# Character-level encoding of `data`: every word becomes
# [1] + <char indices> + [2]; characters missing from c_to_i map to 0.
# Each entry of data_char is [encoded question, encoded context].
data_char = []

for _, q, x in data:
    encoded_pair = []
    for words_seq in (q, x):
        encoded_pair.append([[1] + [c_to_i.get(c, 0) for c in w] + [2]
                             for w in words_seq])
    data_char.append(encoded_pair)

In [25]:
# Character-level encoding of `data_dev`, same scheme as for the train data:
# every word becomes [1] + <char indices> + [2], unknown characters map to 0.
# The first and last field of each dev example are ignored.
data_dev_char = []

for _, q, x, _ in data_dev:
    encoded_pair = [
        [[1] + [c_to_i.get(c, 0) for c in w] + [2] for w in words_seq]
        for words_seq in (q, x)
    ]
    data_dev_char.append(encoded_pair)

In [41]:
# Inspect fields 1 and 2 of the first dev example (question tokens, context tokens).
data_dev[0][1:3]

[[u'which',
  u'nfl',
  u'team',
  u'represented',
  u'the',
  u'afc',
  u'at',
  u'super',
  u'bowl',
  u'50',
  u'?'],
 [u'super',
  u'bowl',
  u'50',
  u'was',
  u'an',
  u'american',
  u'football',
  u'game',
  u'to',
  u'determine',
  u'the',
  u'champion',
  u'of',
  u'the',
  u'national',
  u'football',
  u'league',
  u'(',
  u'nfl',
  u')',
  u'for',
  u'the',
  u'2015',
  u'season',
  u'.',
  u'the',
  u'american',
  u'football',
  u'conference',
  u'(',
  u'afc',
  u')',
  u'champion',
  u'denver',
  u'broncos',
  u'defeated',
  u'the',
  u'national',
  u'football',
  u'conference',
  u'(',
  u'nfc',
  u')',
  u'champion',
  u'carolina',
  u'panthers',
  u'24\u201310',
  u'to',
  u'earn',
  u'their',
  u'third',
  u'super',
  u'bowl',
  u'title',
  u'.',
  u'the',
  u'game',
  u'was',
  u'played',
  u'on',
  u'february',
  u'7',
  u',',
  u'2016',
  u',',
  u'at',
  u'levi',
  u"'s",
  u'stadium',
  u'in',
  u'the',
  u'san',
  u'francisco',
  u'bay',
  u'area',
  u'at',
  u'

In [39]:
# The first dev example after character encoding: [question, context], with each
# word stored as [1, <char indices>, 2].
data_dev_char[0]

[[[1, 119, 104, 105, 99, 104, 2],
  [1, 110, 102, 108, 2],
  [1, 116, 101, 97, 109, 2],
  [1, 114, 101, 112, 114, 101, 115, 101, 110, 116, 101, 100, 2],
  [1, 116, 104, 101, 2],
  [1, 97, 102, 99, 2],
  [1, 97, 116, 2],
  [1, 115, 117, 112, 101, 114, 2],
  [1, 98, 111, 119, 108, 2],
  [1, 53, 48, 2],
  [1, 63, 2]],
 [[1, 115, 117, 112, 101, 114, 2],
  [1, 98, 111, 119, 108, 2],
  [1, 53, 48, 2],
  [1, 119, 97, 115, 2],
  [1, 97, 110, 2],
  [1, 97, 109, 101, 114, 105, 99, 97, 110, 2],
  [1, 102, 111, 111, 116, 98, 97, 108, 108, 2],
  [1, 103, 97, 109, 101, 2],
  [1, 116, 111, 2],
  [1, 100, 101, 116, 101, 114, 109, 105, 110, 101, 2],
  [1, 116, 104, 101, 2],
  [1, 99, 104, 97, 109, 112, 105, 111, 110, 2],
  [1, 111, 102, 2],
  [1, 116, 104, 101, 2],
  [1, 110, 97, 116, 105, 111, 110, 97, 108, 2],
  [1, 102, 111, 111, 116, 98, 97, 108, 108, 2],
  [1, 108, 101, 97, 103, 117, 101, 2],
  [1, 40, 2],
  [1, 110, 102, 108, 2],
  [1, 41, 2],
  [1, 102, 111, 114, 2],
  [1, 116, 104, 101, 2],
  [

In [52]:
# Characters listed in ascending order of their index.
sorted_chars = sorted(c_to_i, key=c_to_i.get)

In [55]:
# Write the character table as UTF-8 text, one entry per line.
# NOTE(review): if c_to_i still holds the 128 ASCII characters, the table
# contains '\n' and '\r' themselves, so reading this file back line by line
# would misalign the indices — confirm how the file is consumed.
with io.open('/pio/data/data/squad/train_charlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(unicode(ch + '\n') for ch in sorted_chars)

In [42]:
# Pickle streams are binary data: open the target in binary mode.
with open('/pio/data/data/squad/train_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_char, f)

In [43]:
# Pickle streams are binary data: open the target in binary mode.
with open('/pio/data/data/squad/dev_char_ascii.pkl', 'wb') as f:
    pickle.dump(data_dev_char, f)

# SQuAD data with glove dictionary

### add unk to glove

In [77]:
# Load the GloVe matrix and its word list, then make sure entry 0 is '<unk>',
# represented by the mean of all word vectors.
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Keep the first whitespace-separated token of each line.
        glove_words.append(line.split()[0])

# The save cell further down writes the augmented data back to these same two
# files, so only prepend '<unk>' when it is not there yet. Otherwise re-running
# this cell would stack a second '<unk>' row and shift every index by one.
if not glove_words or glove_words[0] != '<unk>':
    glove_words.insert(0, '<unk>')
    glove_vec = np.vstack([glove_vec.mean(axis=0), glove_vec])

In [78]:
# index -> word is the word list itself; word -> index is its inverse.
glove_i_to_w = glove_words
glove_w_to_i = dict((word, pos) for pos, word in enumerate(glove_words))

In [85]:
# Persist the GloVe matrix and its word list (one word per line, UTF-8).
# NOTE(review): np.save appends '.npy', so both targets are the very files the
# load cell reads — the originals are overwritten in place.
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(unicode(word + '\n') for word in glove_words)

### make train and dev set with glove dict

#### train

In [100]:
# train.pkl is a plain pickle (it is written with pickle.dump), not an .npy
# archive, so read it with pickle directly instead of relying on np.load's
# pickle fallback, which newer NumPy versions refuse unless allow_pickle=True.
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    train_set = pickle.load(f)

In [101]:
# Contexts are originally stored split into sentences; this undoes that.
# Field 1 of an example is [question, sentence_1, ..., sentence_n]: the
# concatenated sentences are appended as a new last field, and field 1 is
# reduced to the question alone.
# NOTE: this mutates train_set in place, so the cell must run exactly once.
for example in train_set:
    sequences = example[1]
    example.append(list(chain.from_iterable(sequences[1:])))
    example[1] = sequences[0]

In [103]:
# Re-index every token of fields 1.. (question and context) from the train
# vocabulary to the GloVe vocabulary: train index -> word -> GloVe index.
# Words missing from GloVe map to 0, the '<unk>' entry.
# The sequences are iterated directly instead of re-slicing train_set[di][1:]
# on every single token access.
# NOTE: the update is in place, so the cell must run exactly once.
for example in train_set:
    for seq in example[1:]:
        for pos, word_idx in enumerate(seq):
            seq[pos] = glove_w_to_i.get(i_to_w[word_idx], 0)

In [105]:
# Pickle streams are binary data: open the target in binary mode.
with open('/pio/data/data/squad/train_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(train_set, f)

#### dev

In [133]:
# done above

In [136]:
# The file is a plain pickle (written with pickle.dump), so read it with pickle
# directly instead of relying on np.load's pickle fallback, which newer NumPy
# versions refuse unless allow_pickle=True.
with open('/pio/data/data/squad/train_with_glove_vocab.pkl', 'rb') as f:
    train_set = pickle.load(f)

In [138]:
# Read the dev set with pickle directly instead of relying on np.load's pickle
# fallback, which newer NumPy versions refuse unless allow_pickle=True.
# (The same path is written with pickle.dump further down; presumably the
# existing file was produced the same way.)
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'rb') as f:
    dev_set = pickle.load(f)

In [139]:
# Contexts are originally stored split into sentences; this undoes that.
# The first element of field 1 is kept as the new field 1, and the remaining
# elements are concatenated into one list appended as a new last field.
# NOTE: this mutates dev_set in place, so the cell must run exactly once.
for example in dev_set:
    sequences = example[1]
    example.append(list(chain.from_iterable(sequences[1:])))
    example[1] = sequences[0]

In [141]:
# Pickle streams are binary data: open the target in binary mode.
# NOTE(review): this overwrites the very file dev_set was loaded from, so
# re-running the load + flatten cells afterwards would flatten already
# flattened data.
with open('/pio/data/data/squad/dev_with_glove_vocab.pkl', 'wb') as f:
    pickle.dump(dev_set, f)

### characters

In [None]:
# `data` should be in the state it has right after running the cell in which `words` is initialised

In [20]:
# Alphabet of the GloVe vocabulary: every distinct character occurring in any
# GloVe word, sorted, with the '<unk_char>' placeholder at index 0.
glove_chars = ['<unk_char>'] + sorted(set(chain.from_iterable(glove_w_to_i)))

In [25]:
# index -> char is the list itself; char -> index is its inverse.
glove_i_to_c = glove_chars
glove_c_to_i = dict((ch, pos) for pos, ch in enumerate(glove_chars))

In [22]:
# Write the GloVe alphabet as UTF-8 text, one entry per line, in index order
# (the first line is the multi-character '<unk_char>' placeholder).
with io.open('/pio/data/data/glove_vec/6B/glove.6B.charlist.txt', 'w', encoding='utf-8') as f:
    f.writelines(unicode(ch + '\n') for ch in glove_chars)

In [29]:
# Character-level encoding of `data` with the GloVe alphabet: every word
# becomes the list of its character indices; characters missing from
# glove_c_to_i map to 0. Each entry is [encoded question, encoded context].
data_char = []

for _, q, x in data:
    data_char.append([
        [[glove_c_to_i.get(c, 0) for c in w] for w in words_seq]
        for words_seq in (q, x)
    ])

In [30]:
# Character-level encoding of `data_dev` with the GloVe alphabet: every word
# becomes the list of its character indices; characters missing from
# glove_c_to_i map to 0. The first and last field of each example are ignored.
data_dev_char = []

for _, q, x, _ in data_dev:
    data_dev_char.append([
        [[glove_c_to_i.get(c, 0) for c in w] for w in words_seq]
        for words_seq in (q, x)
    ])

In [33]:
# Pickle streams are binary data: open the target in binary mode.
with open('/pio/data/data/squad/train_char_with_glove_alphabet.pkl', 'wb') as f:
    pickle.dump(data_char, f)

In [34]:
# Pickle streams are binary data: open the target in binary mode.
with open('/pio/data/data/squad/dev_char_with_glove_alphabet.pkl', 'wb') as f:
    pickle.dump(data_dev_char, f)