In [1]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain

### Read data

In [6]:
with io.open('/pio/data/data/squad/train-v1.1.json', 'r', encoding='utf-8') as f:
    train = json.load(f)

In [2]:
with io.open('/pio/data/data/squad/dev-v1.1.json', 'r', encoding='utf-8') as f:
    dev = json.load(f)

### Data structure

In [4]:
dev['data'][0]['paragraphs'][0]['qas'][0]['answers']

[{u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'},
 {u'answer_start': 177, u'text': u'Denver Broncos'}]

In [95]:
len(dev['data'][0]['paragraphs'][0]['qas'][0])

3

In [96]:
dev['data'][0]['paragraphs'][0]['qas'][0].keys()

[u'question', u'id', u'answers']

In [190]:
' '.join(nltk.word_tokenize(train['data'][10]['paragraphs'][60]['context'])).split(' . ')

[u"The State Council declared a three-day period of national mourning for the quake victims starting from May 19 , 2008 ; the PRC 's National Flag and Regional Flags of Hong Kong and Macau Special Administrative Regions flown at half mast",
 u'It was the first time that a national mourning period had been declared for something other than the death of a state leader , and many have called it the biggest display of mourning since the death of Mao Zedong',
 u'At 14:28 CST on May 19 , 2008 , a week after the earthquake , the Chinese public held a moment of silence',
 u'People stood silent for three minutes while air defense , police and fire sirens , and the horns of vehicles , vessels and trains sounded',
 u"Cars and trucks on Beijing 's roads also came to a halt",
 u"People spontaneously burst into cheering `` Zhongguo jiayou ! '' ( Let 's go , China ! ) and `` Sichuan jiayou '' ( Let 's go , Sichuan ! ) afterwards ."]

# Glove

### Save glove vectors as npz

In [37]:
glove_vec = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_vec.append(np.matrix(str(' '.join(line.split()[1:]))))
        
glove_vec = np.vstack(glove_vec).astype(np.float32)

In [38]:
glove_vec.shape

(400000, 300)

In [40]:
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

### Glove words

In [10]:
# create a glove wordlist

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_words.append(line.split()[0])
        
with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as f:
    for w in glove_words:
        f.write(unicode(w + '\n'))

# Preprocess training set

### Grab all the question-answer pairs and create a wordlist

In [7]:
words = set()
data = []
lower = lambda x: x.lower()

for par in train['data']:
    title = par['title']
    
    for con in par['paragraphs']:
        context = con['context']
        context_tok = map(lower, nltk.word_tokenize(context))
        words |= set(context_tok)
        
        for q in con['qas']:
            question = q['question']
            question_tok = map(lower, nltk.word_tokenize(question))
            words |= set(question_tok)
            
            Id = q['id']
            
            answers = []
            
            for ans in q['answers']:
                text = ans['text']
                text_tok = map(lower, nltk.word_tokenize(text))
                ans_start = ans['answer_start']
                
                answers.append((ans_start, text_tok))
                
            data.append([answers, question_tok, context_tok])
            
words.add('<unk>')

In [5]:
data[27]

[[(92, [u'1854'])],
 [u'in',
  u'what',
  u'year',
  u'was',
  u'a',
  u'master',
  u'of',
  u'arts',
  u'course',
  u'first',
  u'offered',
  u'at',
  u'notre',
  u'dame',
  u'?'],
 [u'the',
  u'university',
  u'first',
  u'offered',
  u'graduate',
  u'degrees',
  u',',
  u'in',
  u'the',
  u'form',
  u'of',
  u'a',
  u'master',
  u'of',
  u'arts',
  u'(',
  u'ma',
  u')',
  u',',
  u'in',
  u'the',
  u'1854\u20131855',
  u'academic',
  u'year',
  u'.',
  u'the',
  u'program',
  u'expanded',
  u'to',
  u'include',
  u'master',
  u'of',
  u'laws',
  u'(',
  u'll.m',
  u'.',
  u')',
  u'and',
  u'master',
  u'of',
  u'civil',
  u'engineering',
  u'in',
  u'its',
  u'early',
  u'stages',
  u'of',
  u'growth',
  u',',
  u'before',
  u'a',
  u'formal',
  u'graduate',
  u'school',
  u'education',
  u'was',
  u'developed',
  u'with',
  u'a',
  u'thesis',
  u'not',
  u'required',
  u'to',
  u'receive',
  u'the',
  u'degrees',
  u'.',
  u'this',
  u'changed',
  u'in',
  u'1924',
  u'with',
  u

In [6]:
print len(data), len(words)

87599 102802


### Turn words into numbers

In [8]:
i_to_w = dict(enumerate(words))
w_to_i = {v:k for (k,v) in i_to_w.items()}

In [10]:
def split_on_dot(s):
    res = [[]]
    for w in s:
        res[-1].append(w)
        if w == u'.':
            res.append([])
    return res if res[-1] else res[:-1]

def words_to_num(s):
    return map(lambda x: w_to_i.get(x, w_to_i['<unk>']), s)

In [10]:
for i in xrange(len(data)):
    data[i][2] = split_on_dot(data[i][2])

In [11]:
data_num = []

for a, q, c in data:
    answers = []
    for ans in a:
        answers.append((ans[0], words_to_num(ans[1])))        
    data_num.append([answers, words_to_num(q), map(words_to_num, c)])

In [12]:
data_num = [[l[0], [l[1]] + l[2]] for l in data_num]
data_num = [[[t[1] for t in l[0]], l[1]] for l in data_num]

In [13]:
# There are some broken answers, because of the tokenizer (I count words instead of characters)

k = 0
for a, q in data_num:
    for w in a[0]:
        if w not in list(chain(*q[1:])):
            k += 1
k

1028

### Find answer indices on words, not characters

In [14]:
inds = []

for a, q in data_num:
    ans = []
    tot_q = list(chain(*q[1:]))
    for x in a:
        for i in xrange(len(tot_q)):
            if x == tot_q[i:i+len(x)]:
                ans.append(list(xrange(i, i + len(x))))
                break
    inds.append(ans)

In [15]:
for i in xrange(len(data_num)):
    data_num[i][0] = inds[i]

In [102]:
data_dev[0][2]

[u'super',
 u'bowl',
 u'50',
 u'was',
 u'an',
 u'american',
 u'football',
 u'game',
 u'to',
 u'determine',
 u'the',
 u'champion',
 u'of',
 u'the',
 u'national',
 u'football',
 u'league',
 u'(',
 u'nfl',
 u')',
 u'for',
 u'the',
 u'2015',
 u'season',
 u'.',
 u'the',
 u'american',
 u'football',
 u'conference',
 u'(',
 u'afc',
 u')',
 u'champion',
 u'denver',
 u'broncos',
 u'defeated',
 u'the',
 u'national',
 u'football',
 u'conference',
 u'(',
 u'nfc',
 u')',
 u'champion',
 u'carolina',
 u'panthers',
 u'24\u201310',
 u'to',
 u'earn',
 u'their',
 u'third',
 u'super',
 u'bowl',
 u'title',
 u'.',
 u'the',
 u'game',
 u'was',
 u'played',
 u'on',
 u'february',
 u'7',
 u',',
 u'2016',
 u',',
 u'at',
 u'levi',
 u"'s",
 u'stadium',
 u'in',
 u'the',
 u'san',
 u'francisco',
 u'bay',
 u'area',
 u'at',
 u'santa',
 u'clara',
 u',',
 u'california',
 u'.',
 u'as',
 u'this',
 u'was',
 u'the',
 u'50th',
 u'super',
 u'bowl',
 u',',
 u'the',
 u'league',
 u'emphasized',
 u'the',
 u'``',
 u'golden',
 u'annive

In [14]:
data_dev[0][2]

[[u'super',
  u'bowl',
  u'50',
  u'was',
  u'an',
  u'american',
  u'football',
  u'game',
  u'to',
  u'determine',
  u'the',
  u'champion',
  u'of',
  u'the',
  u'national',
  u'football',
  u'league',
  u'(',
  u'nfl',
  u')',
  u'for',
  u'the',
  u'2015',
  u'season',
  u'.'],
 [u'the',
  u'american',
  u'football',
  u'conference',
  u'(',
  u'afc',
  u')',
  u'champion',
  u'denver',
  u'broncos',
  u'defeated',
  u'the',
  u'national',
  u'football',
  u'conference',
  u'(',
  u'nfc',
  u')',
  u'champion',
  u'carolina',
  u'panthers',
  '<unk>',
  u'to',
  u'earn',
  u'their',
  u'third',
  u'super',
  u'bowl',
  u'title',
  u'.'],
 [u'the',
  u'game',
  u'was',
  u'played',
  u'on',
  u'february',
  u'7',
  u',',
  u'2016',
  u',',
  u'at',
  u'levi',
  u"'s",
  u'stadium',
  u'in',
  u'the',
  u'san',
  u'francisco',
  u'bay',
  u'area',
  u'at',
  u'santa',
  u'clara',
  u',',
  u'california',
  u'.'],
 [u'as',
  u'this',
  u'was',
  u'the',
  u'50th',
  u'super',
  u'bowl

# Predictions

In [12]:
preds = np.load('dev_with_training_vocab_predictions_charemb.npz')['arr_0']

In [17]:
with io.open('dev_with_training_vocab_predictions_charemb.txt', 'w', encoding='utf-8') as f:
    f.write(u'{')
    for i in xrange(len(data_dev)):
        ans = ' '.join(data_dev[i][2][preds[i][0]:preds[i][1] + 1])
        Id = data_dev[i][3]
        f.write(u'"{}": "{}"'.format(Id, ans))
        if i < len(data_dev) - 1:
            f.write(u', ')
    f.write(u'}')

# Preprocess dev set

In [15]:
words_dev = set()
data_dev = []
lower = lambda x: x.lower()

for par in dev['data']:
    title = par['title']
    
    for con in par['paragraphs']:
        context = con['context']
        context_tok = map(lower, nltk.word_tokenize(context))
        words_dev |= set(context_tok)
        
        for q in con['qas']:
            question = q['question']
            question_tok = map(lower, nltk.word_tokenize(question))
            words_dev |= set(question_tok)
            
            Id = q['id']
            
            answers = []
            
            for ans in q['answers']:
                text = ans['text']
                text_tok = map(lower, nltk.word_tokenize(text))
                ans_start = ans['answer_start']
                
                answers.append((ans_start, text_tok))
                
            data_dev.append([answers, question_tok, context_tok, Id])
            
words_dev.add('<unk>')

In [4]:
data_dev[120]

[[(102, [u'sun', u'life', u'stadium']),
  (102, [u'sun', u'life', u'stadium']),
  (102, [u'sun', u'life', u'stadium'])],
 [u'what',
  u'venue',
  u'in',
  u'miami',
  u'was',
  u'a',
  u'candidate',
  u'for',
  u'the',
  u'site',
  u'of',
  u'super',
  u'bowl',
  u'50',
  u'?'],
 [u'the',
  u'league',
  u'eventually',
  u'narrowed',
  u'the',
  u'bids',
  u'to',
  u'three',
  u'sites',
  u':',
  u'new',
  u'orleans',
  u"'",
  u'mercedes-benz',
  u'superdome',
  u',',
  u'miami',
  u"'s",
  u'sun',
  u'life',
  u'stadium',
  u',',
  u'and',
  u'the',
  u'san',
  u'francisco',
  u'bay',
  u'area',
  u"'s",
  u'levi',
  u"'s",
  u'stadium',
  u'.'],
 u'56beb03c3aeaaa14008c920b']

In [52]:
print len(data_dev), len(words_dev)

10570 26453


In [20]:
len(words_dev - words)

6854

In [16]:
for i in xrange(len(data_dev)):
    data_dev[i][2] = [w if w in words else '<unk>' for w in data_dev[i][2]]

In [11]:
for i in xrange(len(data_dev)):
    data_dev[i][2] = split_on_dot(data_dev[i][2])

In [55]:
data_num_dev = []

for a, q, c in data_dev:
    answers = []
    for ans in a:
        answers.append((ans[0], words_to_num(ans[1])))        
    data_num_dev.append([answers, words_to_num(q), map(words_to_num, c)])

In [57]:
data_num_dev = [[l[0], [l[1]] + l[2]] for l in data_num_dev]
data_num_dev = [[[t[1] for t in l[0]], l[1]] for l in data_num_dev]

In [60]:
inds = []

for a, q in data_num_dev:
    ans = []
    tot_q = list(chain(*q[1:]))
    for x in a:
        for i in xrange(len(tot_q)):
            if x == tot_q[i:i+len(x)]:
                ans.append(list(xrange(i, i + len(x))))
                break
    inds.append(ans)
    
for i in xrange(len(data_num_dev)):
    data_num_dev[i][0] = inds[i]

In [49]:
# data_num_dev = [[d[0]] + map(words_to_num, d[1:]) for d in data_dev]

In [65]:
with open('/pio/data/data/squad/dev_with_training_vocab.pkl', 'w') as f:
    pickle.dump(data_num_dev, f)

### Get Glove vectors for words in data

In [69]:
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

glove_words = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        glove_words.append(line.split()[0])

In [63]:
glove_i_to_w = dict(enumerate(glove_words))
glove_w_to_i = {v:k for (k,v) in glove_i_to_w.items()}

In [68]:
embs = np.zeros((len(words), 300), dtype=np.float32)
embs.shape

(102802, 300)

In [77]:
known_inds = [i for i in xrange(len(words)) if i_to_w[i] in glove_w_to_i]
len(known_inds)

73351

In [94]:
s = set(known_inds)
unknown_inds = [i for i in xrange(len(words)) if i not in s]

In [78]:
embs[known_inds] = glove_vec[[glove_w_to_i[i_to_w[i]] for i in known_inds]]

In [96]:
embs[unknown_inds] = L.init.Normal()((len(unknown_inds), 300))

### Save processed data

In [165]:
sorted_words = map(lambda x: x[0], sorted(w_to_i.items(), key=lambda x: x[1]))

In [166]:
with io.open('/pio/data/data/squad/wordlist.txt', 'w', encoding='utf-8') as f:
    for w in sorted_words:
        f.write(unicode(w + '\n'))

In [170]:
# This file has a lot of redundant parts, context is repeated for each question.
# It only slows down the initial loading.

with open('/pio/data/data/squad/train.pkl', 'w') as f:
    pickle.dump(data_num, f)

In [2]:
data = np.load('/pio/data/data/squad/train.pkl')

In [7]:
w_to_i = {}
idx = 0

with io.open('/pio/data/data/squad/wordlist.txt', 'r', encoding='utf-8') as f:
    for line in f:
        w_to_i[line[:-1]] = idx
        idx += 1
        
i_to_w = {v:k for (k,v) in w_to_i.items()}

In [21]:
i_to_w[19557]

u'it'

In [13]:
lens = np.array(map(lambda x: len(x[1]), data))

In [27]:
def show_data(idx):
    for s in data[idx][1]:
        print ' '.join([i_to_w[x] for x in s])

In [28]:
show_data(60023)

what is the largest hottest continuously large area worldwide ?
the sky is usually clear above the desert and the sunshine duration is extremely high everywhere in the sahara .
most of the desert enjoys more than 3,600 h of bright sunshine annually or over 82 % of the time and a wide area in the eastern part experiences in excess of 4,000 h of bright sunshine a year or over 91 % of the time , and the highest values are very close to the theoretical maximum value .
a value of 4,300 h or 98 % of the time would be recorded in upper egypt ( aswan , luxor ) and in the nubian desert ( wadi halfa ) .
the annual average direct solar irradiation is around 2,800 kwh/ ( m2 year ) in the great desert .
the sahara has a huge potential for solar energy production .
the constantly high position of the sun , the extremely low relative humidity , the lack of vegetation and rainfall make the great desert the hottest continuously large area worldwide and certainly the hottest place on earth during summer

In [14]:
print max(lens)
np.bincount(lens)

28


array([    0,     0,  1665,  6015, 12958, 18663, 16891, 12176,  7727,
        4709,  2711,  1677,   859,   636,   352,   223,   166,    60,
          39,    24,     9,     0,     5,    19,     5,     0,     0,
           0,    10])

In [10]:
list(chain(*data[0][1:]))

[u'to',
 u'whom',
 u'did',
 u'the',
 u'virgin',
 u'mary',
 u'allegedly',
 u'appear',
 u'in',
 u'1858',
 u'in',
 u'lourdes',
 u'france',
 u'?',
 u'architecturally',
 u',',
 u'the',
 u'school',
 u'has',
 u'a',
 u'catholic',
 u'character',
 u'.',
 u'atop',
 u'the',
 u'main',
 u'building',
 u"'s",
 u'gold',
 u'dome',
 u'is',
 u'a',
 u'golden',
 u'statue',
 u'of',
 u'the',
 u'virgin',
 u'mary',
 u'.',
 u'immediately',
 u'in',
 u'front',
 u'of',
 u'the',
 u'main',
 u'building',
 u'and',
 u'facing',
 u'it',
 u',',
 u'is',
 u'a',
 u'copper',
 u'statue',
 u'of',
 u'christ',
 u'with',
 u'arms',
 u'upraised',
 u'with',
 u'the',
 u'legend',
 u'``',
 u'venite',
 u'ad',
 u'me',
 u'omnes',
 u"''",
 u'.',
 u'next',
 u'to',
 u'the',
 u'main',
 u'building',
 u'is',
 u'the',
 u'basilica',
 u'of',
 u'the',
 u'sacred',
 u'heart',
 u'.',
 u'immediately',
 u'behind',
 u'the',
 u'basilica',
 u'is',
 u'the',
 u'grotto',
 u',',
 u'a',
 u'marian',
 u'place',
 u'of',
 u'prayer',
 u'and',
 u'reflection',
 u'.',
 u

## Fun with characters

In [None]:
# data powinno być bezpośrednio po wykonaniu okienka, w którym jest inicjowane words

In [11]:
chars = {c for d in data for s in d[1:] for w in s for c in w}

In [46]:
chars_dev = {c for d in data_dev for s in d[1:3] for w in s for c in w}

In [42]:
chars.add('<unk>')

In [43]:
i_to_c = dict(enumerate(chars))
c_to_i = {v:k for (k,v) in i_to_c.items()}

In [30]:
data_char = []

for _, q, x in data:
    q_char = [[c_to_i[c] for c in w] for w in q]
    x_char = [[c_to_i[c] for c in w] for w in x]
    data_char.append([q_char, x_char])

In [44]:
data_dev_char = []

for _, q, x, _ in data_dev:
    q_char = [[c_to_i.get(c, c_to_i['<unk>']) for c in w] for w in q]
    x_char = [[c_to_i.get(c, c_to_i['<unk>']) for c in w] for w in x]
    data_dev_char.append([q_char, x_char])

In [48]:
data_dev[0][1:3]

[[u'which',
  u'nfl',
  u'team',
  u'represented',
  u'the',
  u'afc',
  u'at',
  u'super',
  u'bowl',
  u'50',
  u'?'],
 [u'super',
  u'bowl',
  u'50',
  u'was',
  u'an',
  u'american',
  u'football',
  u'game',
  u'to',
  u'determine',
  u'the',
  u'champion',
  u'of',
  u'the',
  u'national',
  u'football',
  u'league',
  u'(',
  u'nfl',
  u')',
  u'for',
  u'the',
  u'2015',
  u'season',
  u'.',
  u'the',
  u'american',
  u'football',
  u'conference',
  u'(',
  u'afc',
  u')',
  u'champion',
  u'denver',
  u'broncos',
  u'defeated',
  u'the',
  u'national',
  u'football',
  u'conference',
  u'(',
  u'nfc',
  u')',
  u'champion',
  u'carolina',
  u'panthers',
  u'24\u201310',
  u'to',
  u'earn',
  u'their',
  u'third',
  u'super',
  u'bowl',
  u'title',
  u'.',
  u'the',
  u'game',
  u'was',
  u'played',
  u'on',
  u'february',
  u'7',
  u',',
  u'2016',
  u',',
  u'at',
  u'levi',
  u"'s",
  u'stadium',
  u'in',
  u'the',
  u'san',
  u'francisco',
  u'bay',
  u'area',
  u'at',
  u'

In [49]:
data_dev_char[0]

[[[470, 679, 919, 754, 679],
  [96, 223, 386],
  [1015, 1241, 304, 612],
  [541, 1241, 75, 541, 1241, 766, 1241, 96, 1015, 1241, 1005],
  [1015, 679, 1241],
  [304, 223, 754],
  [304, 1015],
  [766, 1248, 75, 1241, 541],
  [666, 1088, 470, 386],
  [1209, 30],
  [76]],
 [[766, 1248, 75, 1241, 541],
  [666, 1088, 470, 386],
  [1209, 30],
  [470, 304, 766],
  [304, 96],
  [304, 612, 1241, 541, 919, 754, 304, 96],
  [223, 1088, 1088, 1015, 666, 304, 386, 386],
  [460, 304, 612, 1241],
  [1015, 1088],
  [1005, 1241, 1015, 1241, 541, 612, 919, 96, 1241],
  [1015, 679, 1241],
  [754, 679, 304, 612, 75, 919, 1088, 96],
  [1088, 223],
  [1015, 679, 1241],
  [96, 304, 1015, 919, 1088, 96, 304, 386],
  [223, 1088, 1088, 1015, 666, 304, 386, 386],
  [386, 1241, 304, 460, 1248, 1241],
  [641],
  [96, 223, 386],
  [876],
  [223, 1088, 541],
  [1015, 679, 1241],
  [504, 30, 270, 1209],
  [766, 1241, 304, 766, 1088, 96],
  [801],
  [1015, 679, 1241],
  [304, 612, 1241, 541, 919, 754, 304, 96],
  [223,

In [52]:
sorted_chars = map(lambda x: x[0], sorted(c_to_i.items(), key=lambda x: x[1]))

In [55]:
with io.open('/pio/data/data/squad/train_charlist.txt', 'w', encoding='utf-8') as f:
    for w in sorted_chars:
        f.write(unicode(w + '\n'))

In [56]:
with open('/pio/data/data/squad/train_char.pkl', 'w') as f:
    pickle.dump(data_char, f)

In [57]:
with open('/pio/data/data/squad/dev_char_with_training_charlist.pkl', 'w') as f:
    pickle.dump(data_dev_char, f)