In [1]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain

### Read data

In [2]:
# Parse the SQuAD v1.1 training file; the handle is closed once parsing is done.
with io.open('/pio/data/data/squad/train-v1.1.json', mode='r', encoding='utf-8') as train_file:
    train = json.load(train_file)

In [191]:
# Parse the SQuAD v1.1 development file; the handle is closed once parsing is done.
with io.open('/pio/data/data/squad/dev-v1.1.json', mode='r', encoding='utf-8') as dev_file:
    dev = json.load(dev_file)

### Data structure

In [3]:
train['data'][0]['paragraphs'][0]['qas'][0]['answers']

[{u'answer_start': 515, u'text': u'Saint Bernadette Soubirous'}]

In [190]:
' '.join(nltk.word_tokenize(train['data'][10]['paragraphs'][60]['context'])).split(' . ')

[u"The State Council declared a three-day period of national mourning for the quake victims starting from May 19 , 2008 ; the PRC 's National Flag and Regional Flags of Hong Kong and Macau Special Administrative Regions flown at half mast",
 u'It was the first time that a national mourning period had been declared for something other than the death of a state leader , and many have called it the biggest display of mourning since the death of Mao Zedong',
 u'At 14:28 CST on May 19 , 2008 , a week after the earthquake , the Chinese public held a moment of silence',
 u'People stood silent for three minutes while air defense , police and fire sirens , and the horns of vehicles , vessels and trains sounded',
 u"Cars and trucks on Beijing 's roads also came to a halt",
 u"People spontaneously burst into cheering `` Zhongguo jiayou ! '' ( Let 's go , China ! ) and `` Sichuan jiayou '' ( Let 's go , Sichuan ! ) afterwards ."]

### Save GloVe vectors as npy

In [37]:
# Read the numeric part of every GloVe line into one float32 row.
# Each line is "<word> <v1> <v2> ..."; the word (field 0) is skipped here.
glove_vec = []

with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Convert the fields with float() instead of round-tripping through
        # np.matrix(str(...)): same values, but no string re-parsing and the
        # result is a plain ndarray rather than an np.matrix.
        glove_vec.append(np.array([float(v) for v in line.split()[1:]], dtype=np.float32))

# Stack the rows into a single (n_words, n_dims) float32 array.
glove_vec = np.vstack(glove_vec)

In [38]:
glove_vec.shape

(400000, 300)

In [40]:
# np.save appends ".npy" to the name, so this writes glove.6B.300d.npy
# (the file that is read back with np.load further down).
np.save('/pio/data/data/glove_vec/6B/glove.6B.300d', glove_vec)

### Glove words

In [10]:
# Build the GloVe vocabulary: the first whitespace-separated field of every
# line of the vector file, kept in file order.
with io.open('/pio/data/data/glove_vec/6B/glove.6B.300d.txt', 'r', encoding='utf-8') as vec_file:
    glove_words = [row.split()[0] for row in vec_file]

# Store the vocabulary one word per line.
with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'w', encoding='utf-8') as list_file:
    list_file.writelines(unicode(word + '\n') for word in glove_words)

In [15]:
len(glove_words)

400000

In [45]:
data[0]

[[(515, [u'saint', u'bernadette', u'soubirous'])],
 [u'to',
  u'whom',
  u'did',
  u'the',
  u'virgin',
  u'mary',
  u'allegedly',
  u'appear',
  u'in',
  u'1858',
  u'in',
  u'lourdes',
  u'france',
  u'?'],
 [[u'architecturally',
   u',',
   u'the',
   u'school',
   u'has',
   u'a',
   u'catholic',
   u'character',
   u'.'],
  [u'atop',
   u'the',
   u'main',
   u'building',
   u"'s",
   u'gold',
   u'dome',
   u'is',
   u'a',
   u'golden',
   u'statue',
   u'of',
   u'the',
   u'virgin',
   u'mary',
   u'.'],
  [u'immediately',
   u'in',
   u'front',
   u'of',
   u'the',
   u'main',
   u'building',
   u'and',
   u'facing',
   u'it',
   u',',
   u'is',
   u'a',
   u'copper',
   u'statue',
   u'of',
   u'christ',
   u'with',
   u'arms',
   u'upraised',
   u'with',
   u'the',
   u'legend',
   u'``',
   u'venite',
   u'ad',
   u'me',
   u'omnes',
   u"''",
   u'.'],
  [u'next',
   u'to',
   u'the',
   u'main',
   u'building',
   u'is',
   u'the',
   u'basilica',
   u'of',
   u'the',
   

In [50]:
# Distinct tokens of the first answer of every example
# (example[0] is the answer list, each answer is a (start, tokens) pair).
answer_words = set()
for example in data:
    answer_words.update(example[0][0][1])

In [53]:
len(answer_words)

42848

In [52]:
len(answer_words - set(glove_words))

6659

In [39]:
# Show the last 100 data words that are absent from the GloVe vocabulary.
for token in list(words - set(glove_words))[-100:]:
    print(token)

代田法
vergiliana
tampa/st
samarran
cove-st.
:35–36
ddirectly
aerodome
neminem
27,573
knon
sytle
1889–1894
brance
morphologist
v-pet
re-targeted
kirants
1,269,765
stain-resistant
5151
king—implied
11+
יִשְׂרָאֵל‎
post-divorce
əˈlɛktrik
hamović
geexbox
leaa
bing-based
71–91
csad
statenvertaling
“all
británico
refromed
all-in-all
church’s
隶书
poety
al-irsyad
finger-muscle
1976–77
nàng
wouldn’t
mandarines
league—had
i/ˈbɒstən/
μουσηγέτης
faqīh
western-funded
quitensis
conímbriga
supermalls
writers.”
government—as
riwaaydo
265,517
achenial
iparque
temnospondyli
circulaires
2,361.6
'haut
magnesium-doped
/ælps/
90,731
rice/millet
loan-translation
baladhuri
pedanius
dönitz—would—on
phytogeographically
pre-arthropod
eritrean–ethiopian
neo-grec
'la
9,902
β-chitin
sin-itiro
4,671
livability.com
copers
vendée—a
2240:1982
πάνορμος
⟨ʰp⟩
minacs
'time
obscurations
1664-1729
tsimshianic
jingning
p'ohang-dong
tax-reform
'intellectual
post-idol
watson-wentworth
hamovic
2014—on


In [28]:
len(words)

102802

### Grab all the question-answer pairs and create a wordlist

In [4]:
# Walk the SQuAD structure (article -> paragraph -> question) and build one
# record per question:
#     [answers, question_tokens, context_tokens]
# where
#     answers          is a list of (answer_start, answer_tokens) tuples,
#     question_tokens  are the lowercased NLTK tokens of the question,
#     context_tokens   are the lowercased NLTK tokens of the paragraph; the
#                      same list object is shared by every question of that
#                      paragraph.
# `words` collects every context and question token (answer tokens are not
# added separately).
words = set()
data = []
lower = lambda x: x.lower()

for article in train['data']:
    for paragraph in article['paragraphs']:
        # Build real lists instead of relying on map() returning one: a lazy
        # map object would be exhausted by set() below and stored empty.
        context_tok = [lower(tok) for tok in nltk.word_tokenize(paragraph['context'])]
        words |= set(context_tok)

        for qa in paragraph['qas']:
            question_tok = [lower(tok) for tok in nltk.word_tokenize(qa['question'])]
            words |= set(question_tok)

            answers = [
                (ans['answer_start'],
                 [lower(tok) for tok in nltk.word_tokenize(ans['text'])])
                for ans in qa['answers']
            ]

            data.append([answers, question_tok, context_tok])

# Placeholder entry used for tokens that are not in the vocabulary.
words.add('<unk>')

In [5]:
data[27]

[[(92, [u'1854'])],
 [u'in',
  u'what',
  u'year',
  u'was',
  u'a',
  u'master',
  u'of',
  u'arts',
  u'course',
  u'first',
  u'offered',
  u'at',
  u'notre',
  u'dame',
  u'?'],
 [u'the',
  u'university',
  u'first',
  u'offered',
  u'graduate',
  u'degrees',
  u',',
  u'in',
  u'the',
  u'form',
  u'of',
  u'a',
  u'master',
  u'of',
  u'arts',
  u'(',
  u'ma',
  u')',
  u',',
  u'in',
  u'the',
  u'1854\u20131855',
  u'academic',
  u'year',
  u'.',
  u'the',
  u'program',
  u'expanded',
  u'to',
  u'include',
  u'master',
  u'of',
  u'laws',
  u'(',
  u'll.m',
  u'.',
  u')',
  u'and',
  u'master',
  u'of',
  u'civil',
  u'engineering',
  u'in',
  u'its',
  u'early',
  u'stages',
  u'of',
  u'growth',
  u',',
  u'before',
  u'a',
  u'formal',
  u'graduate',
  u'school',
  u'education',
  u'was',
  u'developed',
  u'with',
  u'a',
  u'thesis',
  u'not',
  u'required',
  u'to',
  u'receive',
  u'the',
  u'degrees',
  u'.',
  u'this',
  u'changed',
  u'in',
  u'1924',
  u'with',
  u

In [47]:
# Number of question records and vocabulary size, separated by a space.
print('%d %d' % (len(data), len(words)))

87599 102802


In [109]:
# Print the first record that carries more than one answer, if there is one.
for d in data:
    if len(d[0]) <= 1:
        continue
    print(d)
    break

### Turn words into numbers

In [6]:
# Give every vocabulary word an integer id (its position while iterating over
# `words`) and keep the lookup in both directions.
i_to_w = {}
w_to_i = {}
for word_id, word in enumerate(words):
    i_to_w[word_id] = word
    w_to_i[word] = word_id

In [7]:
def split_on_dot(s):
    """Cut a token sequence into chunks that each end with a '.' token.

    Every u'.' token closes the current chunk (and stays in it). Tokens after
    the last u'.' form a final chunk; no empty trailing chunk is returned.
    An empty input gives an empty list.
    """
    chunks = []
    current = []
    for token in s:
        current.append(token)
        if token == u'.':
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks

def words_to_num(s):
    """Translate each token of `s` to its id in `w_to_i`.

    Tokens missing from `w_to_i` get the id of '<unk>'.
    """
    return [w_to_i.get(token, w_to_i['<unk>']) for token in s]

In [8]:
# Replace the flat context token list of every record with its '.'-delimited
# chunks. NOTE: running this cell a second time would re-split the already
# split contexts.
for record in data:
    record[2] = split_on_dot(record[2])

In [15]:
# Numeric copy of `data`: same [answers, question, context] layout, with every
# token replaced by its vocabulary id. answer_start values are carried over.
data_num = [
    [
        [(start, words_to_num(tokens)) for start, tokens in a],
        words_to_num(q),
        [words_to_num(sentence) for sentence in c],
    ]
    for a, q, c in data
]

In [9]:
# Some answers don't work because of the tokenizer: count the answers that
# contain at least one token mapped to '<unk>'.
bugged_answers = sum(
    1
    for answers, _question, _context in data_num
    for _start, token_ids in answers
    if w_to_i['<unk>'] in token_ids
)
bugged_answers

80

In [17]:
# Reshape every record from [answers, question, context] to
# [answer_token_lists, [question] + context_sentences]:
# the answer_start offsets are dropped and the question becomes element 0 of
# the sequence list. This cell expects the three-element layout, so it cannot
# be run twice in a row.
data_num = [
    [[tokens for _start, tokens in a], [q] + c]
    for a, q, c in data_num
]

In [21]:
# Show how the token with the escaped U+2013 character renders.
print(u'1854\u20131855')

1854–1855


In [20]:
data[27]

[[(92, [u'1854'])],
 [u'in',
  u'what',
  u'year',
  u'was',
  u'a',
  u'master',
  u'of',
  u'arts',
  u'course',
  u'first',
  u'offered',
  u'at',
  u'notre',
  u'dame',
  u'?'],
 [[u'the',
   u'university',
   u'first',
   u'offered',
   u'graduate',
   u'degrees',
   u',',
   u'in',
   u'the',
   u'form',
   u'of',
   u'a',
   u'master',
   u'of',
   u'arts',
   u'(',
   u'ma',
   u')',
   u',',
   u'in',
   u'the',
   u'1854\u20131855',
   u'academic',
   u'year',
   u'.'],
  [u'the',
   u'program',
   u'expanded',
   u'to',
   u'include',
   u'master',
   u'of',
   u'laws',
   u'(',
   u'll.m',
   u'.'],
  [u')',
   u'and',
   u'master',
   u'of',
   u'civil',
   u'engineering',
   u'in',
   u'its',
   u'early',
   u'stages',
   u'of',
   u'growth',
   u',',
   u'before',
   u'a',
   u'formal',
   u'graduate',
   u'school',
   u'education',
   u'was',
   u'developed',
   u'with',
   u'a',
   u'thesis',
   u'not',
   u'required',
   u'to',
   u'receive',
   u'the',
   u'degrees',

In [19]:
i_to_w[55804]

u'1854'

In [15]:
# There are more broken answers, because I tag words instead of characters.
# Count the tokens of each example's first answer that do not occur anywhere in
# its context (q[0] is the question, q[1:] are the context sentences).

k = 0
for a, q in data_num:
    # Build the context id set once per example; it was previously rebuilt as
    # a list and scanned linearly for every single answer token.
    context_ids = set(chain.from_iterable(q[1:]))
    k += sum(1 for w in a[0] if w not in context_ids)
k

1028

### Find answer indices on words, not characters

In [12]:
# For every example, locate each answer inside the flattened context
# (q[1:], i.e. everything after the question) and record the token positions
# of its FIRST occurrence. Answers that never match contribute nothing, so an
# example's span list can be shorter than its answer list.
inds = []

for answer_list, sequences in data_num:
    context = list(chain.from_iterable(sequences[1:]))
    spans = []
    for answer in answer_list:
        width = len(answer)
        start = next(
            (pos for pos in xrange(len(context)) if context[pos:pos + width] == answer),
            None,
        )
        if start is not None:
            spans.append(list(xrange(start, start + width)))
    inds.append(spans)

In [13]:
# Swap the answer token lists of every example for the token-position spans
# computed in `inds`.
for example, spans in zip(data_num, inds):
    example[0] = spans

In [14]:
data_num[27]

[[],
 [[94338,
   88056,
   4219,
   80700,
   43315,
   300,
   23764,
   20373,
   90669,
   7347,
   71687,
   14138,
   87486,
   82100,
   83077],
  [67711,
   5699,
   7347,
   71687,
   35050,
   79376,
   44968,
   94338,
   67711,
   71053,
   23764,
   43315,
   300,
   23764,
   20373,
   87309,
   22408,
   60426,
   44968,
   94338,
   67711,
   81578,
   573,
   4219,
   492],
  [67711, 23904, 53352, 78406, 11829, 300, 23764, 26661, 87309, 63399, 492],
  [60426,
   49698,
   300,
   23764,
   91096,
   72713,
   94338,
   91506,
   53403,
   45511,
   23764,
   25328,
   44968,
   57708,
   43315,
   79013,
   35050,
   60695,
   6191,
   80700,
   88197,
   1485,
   43315,
   73766,
   55379,
   18193,
   78406,
   40865,
   67711,
   79376,
   492],
  [40921,
   99595,
   94338,
   10684,
   1485,
   79013,
   23245,
   88197,
   64574,
   35050,
   79376,
   44968,
   78403,
   45485,
   47527,
   87309,
   66899,
   60426,
   79376,
   492],
  [10347, 35451, 23764, 67

### Get Glove vectors for words in data

In [69]:
# Reload the GloVe matrix saved earlier together with its word list
# (one word per line, first whitespace-separated field of each line).
glove_vec = np.load('/pio/data/data/glove_vec/6B/glove.6B.300d.npy')

with io.open('/pio/data/data/glove_vec/6B/glove.6B.wordlist.txt', 'r', encoding='utf-8') as list_file:
    glove_words = [row.split()[0] for row in list_file]

In [63]:
# Position <-> word lookups for the GloVe vocabulary.
glove_i_to_w = {}
glove_w_to_i = {}
for position, glove_word in enumerate(glove_words):
    glove_i_to_w[position] = glove_word
    glove_w_to_i[glove_word] = position

In [68]:
# One float32 row of 300 zeros per vocabulary word.
embs = np.zeros(shape=(len(words), 300), dtype=np.float32)
embs.shape

(102802, 300)

In [77]:
# Vocabulary ids whose word also exists in the GloVe word list.
known_inds = []
for word_id in xrange(len(words)):
    if i_to_w[word_id] in glove_w_to_i:
        known_inds.append(word_id)
len(known_inds)

73351

In [94]:
# Vocabulary ids without a GloVe vector, in ascending order.
s = set(known_inds)
unknown_inds = sorted(set(xrange(len(words))) - s)

In [78]:
# Copy the GloVe row of every known word into that word's row of `embs`.
glove_rows = [glove_w_to_i[i_to_w[word_id]] for word_id in known_inds]
embs[known_inds] = glove_vec[glove_rows]

In [96]:
# Fill the rows of words without a GloVe vector with values produced by
# L.init.Normal() for a (len(unknown_inds), 300) shape.
# NOTE(review): `L` is not imported or defined anywhere in the visible notebook;
# presumably a neural-network library alias from another session — confirm and
# add the import to the first cell, otherwise this cell fails on a fresh kernel.
embs[unknown_inds] = L.init.Normal()((len(unknown_inds), 300))

### Save processed data

In [165]:
# Vocabulary words ordered by their integer id.
sorted_words = sorted(w_to_i, key=w_to_i.get)

In [166]:
# Write the vocabulary one word per line; the line number (from 0) is the word id.
with io.open('/pio/data/data/squad/wordlist.txt', 'w', encoding='utf-8') as wordlist_file:
    wordlist_file.writelines(unicode(word + '\n') for word in sorted_words)

In [170]:
# This file has a lot of redundant parts: the context is repeated for each
# question. It only slows down the initial loading.

# Pickle data is a byte stream, so the file must be opened in binary mode
# ('wb'); text mode can corrupt it on platforms that translate newlines.
with open('/pio/data/data/squad/train.pkl', 'wb') as f:
    pickle.dump(data_num, f)

In [2]:
# train.pkl is a plain pickle written with pickle.dump, not an .npy/.npz file,
# so read it with pickle directly. np.load only handled it through an implicit
# pickle fallback that newer NumPy refuses unless allow_pickle=True.
# Only unpickle files you created yourself: pickle can run arbitrary code.
with open('/pio/data/data/squad/train.pkl', 'rb') as f:
    data = pickle.load(f)

In [7]:
# Rebuild the word <-> id lookups from wordlist.txt: the id of a word is its
# zero-based line number. The last character of every line (the newline
# written by the export cell) is cut off.
with io.open('/pio/data/data/squad/wordlist.txt', 'r', encoding='utf-8') as wordlist_file:
    vocab = [line[:-1] for line in wordlist_file]

w_to_i = dict(zip(vocab, range(len(vocab))))
idx = len(vocab)

i_to_w = dict((word_id, word) for word, word_id in w_to_i.items())

In [21]:
i_to_w[19557]

u'it'

In [13]:
# Number of token sequences (element 1) in every example.
lens = np.array([len(example[1]) for example in data])

In [27]:
def show_data(idx):
    """Print example `idx` of `data` as text, one token sequence per line.

    Each id in data[idx][1] is mapped back to its word through `i_to_w` and the
    words of a sequence are joined with single spaces.
    """
    sequences = data[idx][1]
    for token_ids in sequences:
        line_words = [i_to_w[token_id] for token_id in token_ids]
        print(' '.join(line_words))

In [28]:
show_data(60023)

what is the largest hottest continuously large area worldwide ?
the sky is usually clear above the desert and the sunshine duration is extremely high everywhere in the sahara .
most of the desert enjoys more than 3,600 h of bright sunshine annually or over 82 % of the time and a wide area in the eastern part experiences in excess of 4,000 h of bright sunshine a year or over 91 % of the time , and the highest values are very close to the theoretical maximum value .
a value of 4,300 h or 98 % of the time would be recorded in upper egypt ( aswan , luxor ) and in the nubian desert ( wadi halfa ) .
the annual average direct solar irradiation is around 2,800 kwh/ ( m2 year ) in the great desert .
the sahara has a huge potential for solar energy production .
the constantly high position of the sun , the extremely low relative humidity , the lack of vegetation and rainfall make the great desert the hottest continuously large area worldwide and certainly the hottest place on earth during summer

In [14]:
# Largest sequence count, then how many examples have each sequence count.
print(lens.max())
np.bincount(lens)

28


array([    0,     0,  1665,  6015, 12958, 18663, 16891, 12176,  7727,
        4709,  2711,  1677,   859,   636,   352,   223,   166,    60,
          39,    24,     9,     0,     5,    19,     5,     0,     0,
           0,    10])