## Example for DRQA data processing

In [20]:
import re
import spacy
def clean_spaces(text):
    """Return ``text`` with every whitespace character replaced by a plain space.

    The substitution is one-for-one (runs of whitespace are not collapsed),
    so the result has the same length as the input.
    """
    return re.sub(r'\s', ' ', text)

In [50]:
# Load spaCy's English pipeline, asking it to skip the parser. The cells below
# read token text/offsets, POS tags, entity types and lemmas from the result.
# NOTE(review): the 'en' shortcut and the `parser=False` keyword look like the
# legacy spaCy call form; newer releases use a full model name and
# `disable=[...]` instead — confirm against the installed spaCy version.
nlp = spacy.load('en', parser=False)

In [22]:
import unicodedata
def normalize_text(text):
    """Return ``text`` converted to Unicode NFD (canonical decomposition) form."""
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

In [87]:
context = 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
context

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [52]:
question = 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [53]:
import collections
# Run spaCy over the whitespace-cleaned question and context.
q_doc = nlp(clean_spaces(question))
c_doc = nlp(clean_spaces(context))

# Token texts in NFD form, plus lower-cased copies of the same lists.
question_tokens = [normalize_text(tok.text) for tok in q_doc]
context_tokens = [normalize_text(tok.text) for tok in c_doc]
question_tokens_lower = [tok.lower() for tok in question_tokens]
context_tokens_lower = [tok.lower() for tok in context_tokens]

# (start, end) character offsets of every context token; end = start + token length.
context_token_span = [(tok.idx, tok.idx + len(tok.text)) for tok in c_doc]
context_tags = [tok.tag_ for tok in c_doc]  # POS tag of each token
context_ents = [tok.ent_type_ for tok in c_doc]  # NER label of each token

# Lemmas of the question tokens (lemma_: cats -> cat). When the lemma is the
# '-PRON-' placeholder (me/it/you ...), the lower-cased token text is used instead.
question_lemma = {
    tok.text.lower() if tok.lemma_ == '-PRON-' else tok.lemma_
    for tok in q_doc
}

# Per-context-token flags: does the token occur in the question
# exactly / case-insensitively / by lemma?
question_tokens_set = set(question_tokens)
question_tokens_lower_set = set(question_tokens_lower)
match_origin = [tok in question_tokens_set for tok in context_tokens]
match_lower = [tok in question_tokens_lower_set for tok in context_tokens_lower]
match_lemma = [
    (tok.text.lower() if tok.lemma_ == '-PRON-' else tok.lemma_) in question_lemma
    for tok in c_doc
]

# Term frequency of each (lower-cased) token within the context.
counter_ = collections.Counter(context_tokens_lower)
total = len(context_tokens_lower)
context_tf = [counter_[tok] / total for tok in context_tokens_lower]

# One (match_origin, match_lower, match_lemma, tf) tuple per context token.
context_features = list(zip(match_origin, match_lower, match_lemma, context_tf))

In [55]:
# for example: pos tagging for context
context_tags

['RB',
 ',',
 'DT',
 'NN',
 'VBZ',
 'DT',
 'JJ',
 'NN',
 '.',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'POS',
 'NN',
 'NN',
 'VBZ',
 'DT',
 'JJ',
 'NN',
 'IN',
 'DT',
 'NNP',
 'NNP',
 '.',
 'RB',
 'IN',
 'NN',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'CC',
 'VBG',
 'PRP',
 ',',
 'VBZ',
 'DT',
 'NN',
 'NN',
 'IN',
 'NNP',
 'IN',
 'NNS',
 'JJ',
 'IN',
 'DT',
 'NN',
 '``',
 'NNP',
 'NNP',
 'PRP',
 'NNP',
 "''",
 '.',
 'RB',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'VBZ',
 'DT',
 'NNP',
 'IN',
 'DT',
 'NNP',
 'NNP',
 '.',
 'RB',
 'IN',
 'DT',
 'NN',
 'VBZ',
 'DT',
 'NNP',
 ',',
 'DT',
 'NNP',
 'NN',
 'IN',
 'NN',
 'CC',
 'NN',
 '.',
 'PRP',
 'VBZ',
 'DT',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NNP',
 ',',
 'NNP',
 'WRB',
 'DT',
 'NNP',
 'NNP',
 'RB',
 'VBD',
 'IN',
 'NNP',
 'NNP',
 'NNP',
 'IN',
 'CD',
 '.',
 'IN',
 'DT',
 'NN',
 'IN',
 'DT',
 'JJ',
 'NN',
 '-LRB-',
 'CC',
 'IN',
 'DT',
 'JJ',
 'NN',
 'WDT',
 'VBZ',
 'IN',
 'CD',
 'NNS',
 'CC',
 'DT',
 'NNP',
 'NNP',
 '-RRB-',
 ',',
 'VBZ',
 'DT',
 'JJ',
 ',',
 'JJ',
 'N

In [26]:
context_features
# a new feature-engineering step: somehow make the probability of the answer larger

[(False, False, False, 0.007042253521126761),
 (False, False, False, 0.04225352112676056),
 (True, True, True, 0.1056338028169014),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (True, True, True, 0.1056338028169014),
 (False, False, False, 0.028169014084507043),
 (False, False, False, 0.02112676056338028),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.014084507042253521),
 (False, False, False, 0.014084507042253521),
 (False, False, False, 0.04225352112676056),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.02112676056338028),
 (False, False, False, 0.056338028169014086),
 (True, True, True, 0.1056338028169014),
 (True, Tr

In [56]:
def index_answer(row):
    """Convert an answer's character offsets into token indices.

    The last four items of ``row`` are read positionally:

    * ``row[-4]`` -- sequence of ``(start, end)`` pairs, one per token
    * ``row[-3]`` -- not read here; dropped from the result
    * ``row[-2]`` -- answer start offset
    * ``row[-1]`` -- answer end offset

    Returns ``row[:-3]`` extended with ``(start_token, end_token)``: the index
    of the token whose start equals the answer start and the index of the
    token whose end equals the answer end. If either offset does not fall
    exactly on a token boundary, or there are no tokens at all, the pair is
    ``(None, None)``.

    ``row`` must support ``row[:-3] + tuple`` (i.e. be a tuple).
    """
    token_span = row[-4]
    answer_start, answer_end = row[-2], row[-1]
    kept = row[:-3]

    # With no tokens, zip(*token_span) yields nothing and the two-way unpack
    # below would raise ValueError outside the try; treat this the same as an
    # answer that cannot be aligned.
    if not token_span:
        return kept + (None, None)

    starts, ends = zip(*token_span)
    try:
        return kept + (starts.index(answer_start), ends.index(answer_end))
    except ValueError:
        # tuple.index raises ValueError when the offset is not a token boundary.
        return kept + (None, None)

In [57]:
token_span = context_token_span

In [58]:
starts, ends = zip(*token_span)

In [59]:
# Answer text and its character offsets. The end offset is start + length,
# the same (start, start + len) convention used for context_token_span.
# 515 is hard-coded; the lookups in the following cells find it among the
# token start offsets.
answer = 'Saint Bernadette Soubirous'
answer_start = 515
answer_end = answer_start + len(answer)

In [60]:
starts.index(answer_start)

102

In [61]:
ends.index(answer_end)

104

In [64]:
# count how many times each POS tag occurs in the context
counter_tag = collections.Counter(context_tags)

In [99]:
counter_tag

Counter({"''": 1,
         ',': 6,
         '-LRB-': 1,
         '-RRB-': 1,
         '.': 7,
         'CC': 4,
         'CD': 2,
         'DT': 22,
         'IN': 20,
         'JJ': 7,
         'NN': 20,
         'NNP': 27,
         'NNS': 2,
         'POS': 1,
         'PRP': 3,
         'RB': 5,
         'VBD': 1,
         'VBG': 1,
         'VBZ': 8,
         'WDT': 1,
         'WRB': 1,
         '``': 1})

In [100]:
# tags ordered from most to least frequent; ties keep first-seen order
vocab_tag = [tag for tag, _count in counter_tag.most_common()]

In [101]:
# map each tag to its position in vocab_tag
tag2id = {tag: idx for idx, tag in enumerate(vocab_tag)}

In [102]:
# the most frequent tag gets the smallest index
tag2id

{"''": 16,
 ',': 7,
 '-LRB-': 19,
 '-RRB-': 21,
 '.': 6,
 'CC': 9,
 'CD': 12,
 'DT': 1,
 'IN': 3,
 'JJ': 5,
 'NN': 2,
 'NNP': 0,
 'NNS': 11,
 'POS': 13,
 'PRP': 10,
 'RB': 8,
 'VBD': 18,
 'VBG': 14,
 'VBZ': 4,
 'WDT': 20,
 'WRB': 17,
 '``': 15}

In [3]:
import msgpack
# Read the preprocessed metadata file. Later cells index it with the keys
# 'embedding', 'vocab', 'vocab_tag' and 'vocab_ent'.
# NOTE(review): the `encoding=` keyword was removed in msgpack 1.0, where
# `raw=False` is the way to get str objects back — confirm the installed version.
with open('SQuAD/meta.msgpack', 'rb') as f:
    meta = msgpack.load(f, encoding='utf8')

In [4]:
import torch
# Build a tensor from the nested list stored under meta['embedding']
# (one row per vocabulary entry; its size is shown in the next cell).
embedding = torch.Tensor(meta['embedding'])

In [5]:
embedding.size()

torch.Size([91187, 300])

In [6]:
# number of pos tag given by spacy 
len(meta['vocab_tag'])

50

In [7]:
# number of NER tag given by spacy
len(meta['vocab_ent'])

19

In [47]:
# pos tag
meta['vocab_tag']

['NN',
 'IN',
 'NNP',
 'DT',
 'JJ',
 'NNS',
 ',',
 '.',
 'CC',
 'VBD',
 'RB',
 'VBN',
 'CD',
 'VB',
 'VBZ',
 'VBG',
 'PRP',
 'VBP',
 'TO',
 '-RRB-',
 'HYPH',
 '-LRB-',
 'PRP$',
 'POS',
 'WDT',
 "''",
 '``',
 'MD',
 'NNPS',
 ':',
 'JJR',
 'JJS',
 'WRB',
 'WP',
 'RP',
 'RBR',
 'RBS',
 'EX',
 '$',
 'SYM',
 'FW',
 'NFP',
 'AFX',
 'PDT',
 'WP$',
 'UH',
 'LS',
 'SP',
 'XX',
 'ADD']

In [144]:
meta['vocab_ent']

['',
 'ORG',
 'DATE',
 'PERSON',
 'GPE',
 'CARDINAL',
 'NORP',
 'LOC',
 'WORK_OF_ART',
 'PERCENT',
 'EVENT',
 'ORDINAL',
 'MONEY',
 'FAC',
 'QUANTITY',
 'LAW',
 'TIME',
 'LANGUAGE',
 'PRODUCT']

In [13]:
# Read the preprocessed examples; the cells below inspect data['train'].
# NOTE(review): the `encoding=` keyword was removed in msgpack 1.0, where
# `raw=False` is the way to get str objects back — confirm the installed version.
with open('SQuAD/data.msgpack', 'rb') as f:
    data = msgpack.load(f, encoding='utf8')

In [14]:
# train: id, context_id, context_features, tag_id, ent_id,
#        question_id, context, context_token_span, answer_start, answer_end
data['train'][0]

['5733be284776f41900661182',
 [53946,
  18,
  3,
  137,
  47,
  12,
  523,
  991,
  216,
  69332,
  3,
  5448,
  2422,
  13,
  1676,
  10685,
  9,
  12,
  6226,
  4716,
  5,
  3,
  3696,
  711,
  216,
  26397,
  6,
  2070,
  5,
  3,
  5448,
  2422,
  17,
  6409,
  50,
  18,
  9,
  12,
  758,
  4716,
  5,
  2329,
  28,
  3114,
  68972,
  28,
  3,
  5904,
  39,
  79622,
  16884,
  7744,
  66049,
  39,
  216,
  23707,
  7,
  3,
  5448,
  2422,
  9,
  3,
  4483,
  5,
  3,
  9842,
  4840,
  216,
  26397,
  1584,
  3,
  10535,
  9,
  3,
  18336,
  18,
  12,
  12033,
  122,
  5,
  5267,
  17,
  9167,
  216,
  2263,
  9,
  12,
  12551,
  5,
  3,
  66120,
  45,
  18335,
  18,
  197,
  110,
  3,
  3696,
  711,
  44044,
  1911,
  7,
  642,
  48558,
  49020,
  6,
  8898,
  216,
  217,
  3,
  140,
  5,
  3,
  146,
  2053,
  920,
  17,
  6,
  12,
  1503,
  392,
  22,
  2942,
  243,
  801,
  6547,
  17,
  3,
  4660,
  14014,
  825,
  18,
  9,
  12,
  2737,
  18,
  205,
  2739,
  4716,
  5,
  711,
  2

In [15]:
# context
data['train'][0][6]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [16]:
# for example: context token ids
data['train'][0][1]

[53946,
 18,
 3,
 137,
 47,
 12,
 523,
 991,
 216,
 69332,
 3,
 5448,
 2422,
 13,
 1676,
 10685,
 9,
 12,
 6226,
 4716,
 5,
 3,
 3696,
 711,
 216,
 26397,
 6,
 2070,
 5,
 3,
 5448,
 2422,
 17,
 6409,
 50,
 18,
 9,
 12,
 758,
 4716,
 5,
 2329,
 28,
 3114,
 68972,
 28,
 3,
 5904,
 39,
 79622,
 16884,
 7744,
 66049,
 39,
 216,
 23707,
 7,
 3,
 5448,
 2422,
 9,
 3,
 4483,
 5,
 3,
 9842,
 4840,
 216,
 26397,
 1584,
 3,
 10535,
 9,
 3,
 18336,
 18,
 12,
 12033,
 122,
 5,
 5267,
 17,
 9167,
 216,
 2263,
 9,
 12,
 12551,
 5,
 3,
 66120,
 45,
 18335,
 18,
 197,
 110,
 3,
 3696,
 711,
 44044,
 1911,
 7,
 642,
 48558,
 49020,
 6,
 8898,
 216,
 217,
 3,
 140,
 5,
 3,
 146,
 2053,
 920,
 17,
 6,
 12,
 1503,
 392,
 22,
 2942,
 243,
 801,
 6547,
 17,
 3,
 4660,
 14014,
 825,
 18,
 9,
 12,
 2737,
 18,
 205,
 2739,
 4716,
 5,
 711,
 216]

In [117]:
# for example pos tag_id
data['train'][0][3]

[10,
 6,
 3,
 0,
 14,
 3,
 4,
 0,
 7,
 1,
 3,
 2,
 2,
 23,
 0,
 0,
 14,
 3,
 4,
 0,
 1,
 3,
 2,
 2,
 7,
 10,
 1,
 0,
 1,
 3,
 2,
 2,
 8,
 15,
 16,
 6,
 14,
 3,
 0,
 0,
 1,
 2,
 1,
 5,
 4,
 1,
 3,
 0,
 26,
 2,
 2,
 16,
 2,
 25,
 7,
 10,
 1,
 3,
 2,
 2,
 14,
 3,
 2,
 1,
 3,
 2,
 2,
 7,
 10,
 1,
 3,
 0,
 14,
 3,
 2,
 6,
 3,
 2,
 0,
 1,
 0,
 8,
 0,
 7,
 16,
 14,
 3,
 0,
 1,
 3,
 0,
 1,
 2,
 6,
 2,
 32,
 3,
 2,
 2,
 10,
 9,
 1,
 2,
 2,
 2,
 1,
 12,
 7,
 1,
 3,
 0,
 1,
 3,
 4,
 0,
 21,
 8,
 1,
 3,
 4,
 0,
 24,
 14,
 1,
 12,
 5,
 8,
 3,
 2,
 2,
 19,
 6,
 14,
 3,
 4,
 6,
 4,
 0,
 0,
 1,
 2,
 7]

In [10]:
meta['vocab'][53946]

'Architecturally'

In [11]:
meta['vocab_tag'][10]

'RB'

In [17]:
meta['embedding'][10]

[-0.068894,
 0.38769,
 -0.2612,
 -0.13737,
 -0.2154,
 0.16583,
 -0.057185,
 -0.1818,
 -0.091393,
 3.0152,
 0.0080077,
 -0.14678,
 0.59703,
 0.13855,
 -0.38471,
 -0.18226,
 -0.048115,
 0.19229,
 -0.39827,
 -0.069427,
 0.44313,
 0.039526,
 0.12246,
 -0.0095958,
 -0.27647,
 0.052025,
 -0.20317,
 -0.24843,
 0.18102,
 -0.146,
 -0.18892,
 0.29503,
 0.033175,
 0.017062,
 -0.050974,
 -0.10416,
 -0.083443,
 -0.036962,
 -0.31562,
 -0.25156,
 -0.078766,
 0.3288,
 0.0047974,
 -0.17029,
 0.25234,
 -0.043896,
 -0.3515,
 0.044745,
 0.30536,
 0.0063232,
 -0.13202,
 -0.0034639,
 0.23588,
 0.080575,
 0.32261,
 -0.11584,
 -0.079779,
 -0.21162,
 0.032221,
 -0.023761,
 -0.060977,
 -0.24384,
 0.092764,
 0.59417,
 0.050349,
 -0.076585,
 0.024472,
 0.063355,
 0.52163,
 0.14607,
 0.39275,
 0.40531,
 0.1003,
 -0.042093,
 0.22654,
 0.12735,
 -0.0407,
 -0.21567,
 -0.073999,
 0.20608,
 0.052384,
 -0.053651,
 -0.068521,
 -0.19867,
 0.032769,
 -0.2159,
 0.24419,
 -0.88759,
 0.079682,
 0.1556,
 -0.17208,
 -0.13209,
 

In [18]:
len(meta['embedding'][10])

300