## Example for DRQA data processing

In [1]:
import re

import numpy as np
import spacy
import tqdm
def clean_spaces(text):
    """Replace every whitespace character in ``text`` with a plain space.

    The substitution is one character for one character (runs of whitespace
    are not collapsed), so the result has the same length as the input.
    """
    whitespace = re.compile(r'\s')
    return whitespace.sub(' ', text)

In [152]:
nlp = spacy.load('en', parser=False)

In [59]:
c_doc = nlp('who')
[w.tag_ for w in c_doc]

['WP']

import unicodedata
def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFD."""
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

In [4]:
context = 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
context

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [5]:
question = 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [6]:
import collections
# Run spaCy over the whitespace-cleaned question and context.
q_doc = nlp(clean_spaces(question))
c_doc = nlp(clean_spaces(context))
# Token texts in NFD form, plus lowercased copies for case-insensitive matching.
question_tokens = [normalize_text(w.text) for w in q_doc]
context_tokens = [normalize_text(w.text) for w in c_doc]
question_tokens_lower = [w.lower() for w in question_tokens]
context_tokens_lower = [w.lower() for w in context_tokens]
context_token_span = [(w.idx, w.idx + len(w.text)) for w in c_doc] # (start, end) character offsets of each token
context_tags = [w.tag_ for w in c_doc] # POS tagging
context_ents = [w.ent_type_ for w in c_doc] # NER tagging

question_lemma = {w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower() for w in q_doc}
# Tokens whose lemma is '-PRON-' (pronouns such as me/it/you) are represented
# by their lowercased text instead of the placeholder.
# lemma_ : cats -> cat

# Three exact-match flags per context token: same text, same lowercased text,
# same lemma as some question token.
question_tokens_set = set(question_tokens)
question_tokens_lower_set = set(question_tokens_lower)
match_origin = [w in question_tokens_set for w in context_tokens]
match_lower = [w in question_tokens_lower_set for w in context_tokens_lower]
match_lemma = [(w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()) in question_lemma for w in c_doc]
# term frequency in document: occurrences of the lowercased token / number of context tokens
counter_ = collections.Counter(context_tokens_lower)
total = len(context_tokens_lower)
context_tf = [counter_[w] / total for w in context_tokens_lower]
# One (match_origin, match_lower, match_lemma, tf) tuple per context token.
context_features = list(zip(match_origin, match_lower, match_lemma, context_tf))

In [121]:
# for example
context_tags

['RB',
 ',',
 'DT',
 'NN',
 'VBZ',
 'DT',
 'JJ',
 'NN',
 '.',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'POS',
 'NN',
 'NN',
 'VBZ',
 'DT',
 'JJ',
 'NN',
 'IN',
 'DT',
 'NNP',
 'NNP',
 '.',
 'RB',
 'IN',
 'NN',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'CC',
 'VBG',
 'PRP',
 ',',
 'VBZ',
 'DT',
 'NN',
 'NN',
 'IN',
 'NNP',
 'IN',
 'NNS',
 'JJ',
 'IN',
 'DT',
 'NN',
 '``',
 'NNP',
 'NNP',
 'PRP',
 'NNP',
 "''",
 '.',
 'RB',
 'IN',
 'DT',
 'NNP',
 'NNP',
 'VBZ',
 'DT',
 'NNP',
 'IN',
 'DT',
 'NNP',
 'NNP',
 '.',
 'RB',
 'IN',
 'DT',
 'NN',
 'VBZ',
 'DT',
 'NNP',
 ',',
 'DT',
 'NNP',
 'NN',
 'IN',
 'NN',
 'CC',
 'NN',
 '.',
 'PRP',
 'VBZ',
 'DT',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NNP',
 ',',
 'NNP',
 'WRB',
 'DT',
 'NNP',
 'NNP',
 'RB',
 'VBD',
 'IN',
 'NNP',
 'NNP',
 'NNP',
 'IN',
 'CD',
 '.',
 'IN',
 'DT',
 'NN',
 'IN',
 'DT',
 'JJ',
 'NN',
 '-LRB-',
 'CC',
 'IN',
 'DT',
 'JJ',
 'NN',
 'WDT',
 'VBZ',
 'IN',
 'CD',
 'NNS',
 'CC',
 'DT',
 'NNP',
 'NNP',
 '-RRB-',
 ',',
 'VBZ',
 'DT',
 'JJ',
 ',',
 'JJ',
 'N

In [8]:
context_features
# feature engineering: extra per-token features that should somehow make the probability of the answer larger

[(False, False, False, 0.007042253521126761),
 (False, False, False, 0.04225352112676056),
 (True, True, True, 0.1056338028169014),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (True, True, True, 0.1056338028169014),
 (False, False, False, 0.028169014084507043),
 (False, False, False, 0.02112676056338028),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.014084507042253521),
 (False, False, False, 0.014084507042253521),
 (False, False, False, 0.04225352112676056),
 (False, False, False, 0.04929577464788732),
 (False, False, False, 0.007042253521126761),
 (False, False, False, 0.02112676056338028),
 (False, False, False, 0.056338028169014086),
 (True, True, True, 0.1056338028169014),
 (True, Tr

In [9]:
def index_answer(row):
    """Convert the character-level answer offsets of a row into token indices.

    ``row[-4]`` is read as the list of ``(start, end)`` character spans of the
    context tokens, and ``row[-2]`` / ``row[-1]`` as the answer's start / end
    character offsets.

    Returns:
        tuple: ``row[:-3]`` followed by the index of the token whose span
        starts at the answer start and the index of the token whose span ends
        at the answer end.  Both indices are ``None`` when either offset does
        not coincide with a token boundary, or when there are no token spans.
    """
    token_span = row[-4]
    answer_start = row[-2]
    answer_end = row[-1]
    try:
        # Unpacking an empty span list raises ValueError as well, so it is
        # done inside the ``try`` and reported like any other answer that
        # cannot be aligned to tokens.
        starts, ends = zip(*token_span)
        return row[:-3] + (starts.index(answer_start), ends.index(answer_end))
    except ValueError:
        return row[:-3] + (None, None)

In [10]:
token_span = context_token_span

In [11]:
starts, ends = zip(*token_span)

In [12]:
answer = 'Saint Bernadette Soubirous'
answer_start = 515
answer_end = answer_start + len(answer)

In [13]:
starts.index(answer_start)

102

In [14]:
ends.index(answer_end)

104

In [15]:
# pos tagging count for context
counter_tag = collections.Counter(w for w in context_tags) #context_tags

In [16]:
counter_tag

Counter({"''": 1,
         ',': 6,
         '-LRB-': 1,
         '-RRB-': 1,
         '.': 7,
         'CC': 4,
         'CD': 2,
         'DT': 22,
         'IN': 20,
         'JJ': 7,
         'NN': 20,
         'NNP': 27,
         'NNS': 2,
         'POS': 1,
         'PRP': 3,
         'RB': 5,
         'VBD': 1,
         'VBG': 1,
         'VBZ': 8,
         'WDT': 1,
         'WRB': 1,
         '``': 1})

In [17]:
vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True)

In [18]:
tag2id = {w: i for i, w in enumerate(vocab_tag)}

In [19]:
# largest count with small index number
tag2id

{"''": 16,
 ',': 7,
 '-LRB-': 19,
 '-RRB-': 21,
 '.': 6,
 'CC': 9,
 'CD': 12,
 'DT': 1,
 'IN': 3,
 'JJ': 5,
 'NN': 2,
 'NNP': 0,
 'NNS': 11,
 'POS': 13,
 'PRP': 10,
 'RB': 8,
 'VBD': 18,
 'VBG': 14,
 'VBZ': 4,
 'WDT': 20,
 'WRB': 17,
 '``': 15}

In [378]:
import msgpack
with open('SQuAD/meta.msgpack', 'rb') as f:
    meta = msgpack.load(f, encoding='utf8')

In [413]:
import torch
embedding = torch.Tensor(meta['char_embeddings'])

In [414]:
embedding.size()

torch.Size([91187, 100])

In [26]:
# number of pos tag given by spacy 
len(meta['vocab_tag'])

50

In [27]:
# number of NER tag given by spacy
len(meta['vocab_ent'])

19

In [28]:
# pos tag
meta['vocab_tag']

['NN',
 'IN',
 'NNP',
 'DT',
 'JJ',
 'NNS',
 ',',
 '.',
 'CC',
 'VBD',
 'RB',
 'VBN',
 'CD',
 'VB',
 'VBZ',
 'VBG',
 'PRP',
 'VBP',
 'TO',
 '-RRB-',
 'HYPH',
 '-LRB-',
 'PRP$',
 'POS',
 'WDT',
 "''",
 '``',
 'MD',
 'NNPS',
 ':',
 'JJR',
 'JJS',
 'WRB',
 'WP',
 'RP',
 'RBR',
 'RBS',
 'EX',
 '$',
 'SYM',
 'FW',
 'NFP',
 'AFX',
 'PDT',
 'WP$',
 'UH',
 'LS',
 'SP',
 'XX',
 'ADD']

In [30]:
meta['vocab_ent']

['',
 'ORG',
 'DATE',
 'PERSON',
 'GPE',
 'CARDINAL',
 'NORP',
 'LOC',
 'WORK_OF_ART',
 'PERCENT',
 'EVENT',
 'ORDINAL',
 'MONEY',
 'FAC',
 'QUANTITY',
 'LAW',
 'TIME',
 'LANGUAGE',
 'PRODUCT']

In [407]:
with open('SQuAD/data.msgpack', 'rb') as f:
    data = msgpack.load(f, encoding='utf8')

In [408]:
# train: id, context_id, context_features, tag_id, ent_id,
#        question_id, context, context_token_span, answer_start, answer_end
data['train'][0]

['5733be284776f41900661182',
 [55739,
  18,
  3,
  137,
  47,
  12,
  523,
  991,
  216,
  78437,
  3,
  5448,
  2422,
  13,
  1676,
  10685,
  9,
  12,
  6226,
  4716,
  5,
  3,
  3696,
  711,
  216,
  26397,
  6,
  2070,
  5,
  3,
  5448,
  2422,
  17,
  6409,
  50,
  18,
  9,
  12,
  758,
  4716,
  5,
  2329,
  28,
  3114,
  78498,
  28,
  3,
  5904,
  39,
  73457,
  16884,
  7744,
  78000,
  39,
  216,
  23707,
  7,
  3,
  5448,
  2422,
  9,
  3,
  4483,
  5,
  3,
  9842,
  4840,
  216,
  26397,
  1584,
  3,
  10535,
  9,
  3,
  18336,
  18,
  12,
  12033,
  122,
  5,
  5267,
  17,
  9167,
  216,
  2263,
  9,
  12,
  12551,
  5,
  3,
  63227,
  45,
  18335,
  18,
  197,
  110,
  3,
  3696,
  711,
  43956,
  1911,
  7,
  642,
  49069,
  49084,
  6,
  8898,
  216,
  217,
  3,
  140,
  5,
  3,
  146,
  2053,
  920,
  17,
  6,
  12,
  1503,
  392,
  22,
  2942,
  243,
  801,
  6547,
  17,
  3,
  4660,
  14014,
  825,
  18,
  9,
  12,
  2737,
  18,
  205,
  2739,
  4716,
  5,
  711,
  2

In [33]:
# context
data['train'][0][6]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [None]:
# for example, context ids
data['train'][0][1]

In [35]:
# for example pos tag_id
data['train'][0][3]

[10,
 6,
 3,
 0,
 14,
 3,
 4,
 0,
 7,
 1,
 3,
 2,
 2,
 23,
 0,
 0,
 14,
 3,
 4,
 0,
 1,
 3,
 2,
 2,
 7,
 10,
 1,
 0,
 1,
 3,
 2,
 2,
 8,
 15,
 16,
 6,
 14,
 3,
 0,
 0,
 1,
 2,
 1,
 5,
 4,
 1,
 3,
 0,
 26,
 2,
 2,
 16,
 2,
 25,
 7,
 10,
 1,
 3,
 2,
 2,
 14,
 3,
 2,
 1,
 3,
 2,
 2,
 7,
 10,
 1,
 3,
 0,
 14,
 3,
 2,
 6,
 3,
 2,
 0,
 1,
 0,
 8,
 0,
 7,
 16,
 14,
 3,
 0,
 1,
 3,
 0,
 1,
 2,
 6,
 2,
 32,
 3,
 2,
 2,
 10,
 9,
 1,
 2,
 2,
 2,
 1,
 12,
 7,
 1,
 3,
 0,
 1,
 3,
 4,
 0,
 21,
 8,
 1,
 3,
 4,
 0,
 24,
 14,
 1,
 12,
 5,
 8,
 3,
 2,
 2,
 19,
 6,
 14,
 3,
 4,
 6,
 4,
 0,
 0,
 1,
 2,
 7]

In [36]:
meta['vocab'][53946]

'Architecturally'

In [37]:
meta['vocab_tag'][10]

'RB'

In [38]:
meta['embedding'][10]

[-0.068894,
 0.38769,
 -0.2612,
 -0.13737,
 -0.2154,
 0.16583,
 -0.057185,
 -0.1818,
 -0.091393,
 3.0152,
 0.0080077,
 -0.14678,
 0.59703,
 0.13855,
 -0.38471,
 -0.18226,
 -0.048115,
 0.19229,
 -0.39827,
 -0.069427,
 0.44313,
 0.039526,
 0.12246,
 -0.0095958,
 -0.27647,
 0.052025,
 -0.20317,
 -0.24843,
 0.18102,
 -0.146,
 -0.18892,
 0.29503,
 0.033175,
 0.017062,
 -0.050974,
 -0.10416,
 -0.083443,
 -0.036962,
 -0.31562,
 -0.25156,
 -0.078766,
 0.3288,
 0.0047974,
 -0.17029,
 0.25234,
 -0.043896,
 -0.3515,
 0.044745,
 0.30536,
 0.0063232,
 -0.13202,
 -0.0034639,
 0.23588,
 0.080575,
 0.32261,
 -0.11584,
 -0.079779,
 -0.21162,
 0.032221,
 -0.023761,
 -0.060977,
 -0.24384,
 0.092764,
 0.59417,
 0.050349,
 -0.076585,
 0.024472,
 0.063355,
 0.52163,
 0.14607,
 0.39275,
 0.40531,
 0.1003,
 -0.042093,
 0.22654,
 0.12735,
 -0.0407,
 -0.21567,
 -0.073999,
 0.20608,
 0.052384,
 -0.053651,
 -0.068521,
 -0.19867,
 0.032769,
 -0.2159,
 0.24419,
 -0.88759,
 0.079682,
 0.1556,
 -0.17208,
 -0.13209,
 

In [422]:
 len(meta['vocab']) == len(meta['char_embeddings']) ==91187

True

### Character level embedding

In [None]:
pwd

In [None]:
size = 874474  # same value as CharEmbedding.size; not read by the exploratory cells below
d_emb = 100  # number of trailing float fields parsed from each n-gram line
seen = set()
fin_name = 'char/charNgram.txt'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(fin_name, 'r', encoding='utf-8') as ftxt:
    content = ftxt.read()
    lines = content.splitlines()
    batch = []

In [None]:
# Turn every n-gram line into a (key, vector) pair, keeping only the first
# occurrence of each key.
for line in lines:
    elems = line.rstrip().split()
    # The last d_emb fields are the vector; the fields before them, re-joined
    # with single spaces, form the key.
    vec = list(map(float, elems[-d_emb:]))
    word = ' '.join(elems[:-d_emb])
    if word not in seen:
        seen.add(word)
        batch.append((word, vec))

In [None]:
def ngrams(sentence, n):
    """Slide a window of width ``n`` over ``sentence``.

    Returns:
        list: every contiguous length-``n`` slice of ``sentence``, ordered by
        starting position (empty when ``n`` exceeds ``len(sentence)``).
    """
    windows = []
    last_start = len(sentence) - n
    for start in range(last_start + 1):
        windows.append(sentence[start:start + n])
    return windows

In [None]:
def emb(w, default='zero', lookup=None):
    """Embed a word as the mean of the vectors of its known character n-grams.

    The word's characters are framed by ``'#BEGIN#'`` and ``'#END#'``; every
    2-, 3- and 4-gram of that sequence is turned into a key of the form
    ``'<n>gram-<joined characters>'`` and passed to ``lookup``.

    Args:
        w (str): the word to embed.
        default (str): only ``'zero'`` is accepted.
        lookup (callable): maps an n-gram key to its vector, or returns
            ``None`` for an unknown key (for example ``dict(batch).get``).
            It has to be supplied: this is a module-level function, so there
            is no ``self.lookup`` to fall back on.

    Returns:
        list: the float32 mean of all vectors found, or ``d_emb`` zeros when
        no n-gram is known.

    Raises:
        TypeError: if ``lookup`` is not given.
    """
    assert default == 'zero', 'only zero default is supported for character embeddings'
    if lookup is None:
        raise TypeError('emb() needs a `lookup` callable mapping an n-gram key to a vector or None')
    chars = ['#BEGIN#'] + list(w) + ['#END#']
    match = {}
    for i in [2, 3, 4]:
        for g in ngrams(chars, i):
            g = '{}gram-{}'.format(i, ''.join(g))
            e = lookup(g)
            if e is not None:
                match[g] = np.array(e, np.float32)
    if match:
        return (sum(match.values()) / len(match)).tolist()
    # No known n-gram: fall back to the all-zero vector.
    return np.zeros(d_emb, dtype=np.float32).tolist()

In [None]:
chars = ['#BEGIN#'] + list('cat') + ['#END#']

In [None]:
chars

In [None]:
embs = np.zeros(d_emb, dtype=np.float32)
embs

In [None]:
match = {}
for i in [2, 3, 4]:
    grams = ngrams(chars, i)
    for g in grams:
        g = '{}gram-{}'.format(i, ''.join(g))
        print(g)
        #e = lookup(g)
        #if e is not None:
        #    match[g] = np.array(e, np.float32)

# prepro.py

In [391]:
trn_file = 'SQuAD/train-v1.1.json'
import json

def flatten_json(data_file, mode):
    """Flatten a SQuAD-style JSON file into one tuple per question.

    Args:
        data_file (str): path to a JSON file whose top-level ``'data'`` list
            holds articles, each with ``'paragraphs'`` that carry a
            ``'context'`` and a list of ``'qas'``.
        mode (str): ``'train'`` keeps the first answer together with its
            character offsets; any other value (used for dev data) keeps the
            list of all answer texts.

    Returns:
        list: for ``'train'``, tuples
        ``(id, context, question, answer, answer_start, answer_end)``;
        otherwise tuples ``(id, context, question, answers)``.
    """
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default text encoding.
    with open(data_file, encoding='utf-8') as f:
        data = json.load(f)['data']
    rows = []
    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                id_, question, answers = qa['id'], qa['question'], qa['answers']
                if mode == 'train':
                    answer = answers[0]['text']  # in training data there's only one answer
                    answer_start = answers[0]['answer_start']  # character offset into context
                    answer_end = answer_start + len(answer)  # exclusive character offset
                    rows.append((id_, context, question, answer, answer_start, answer_end))
                else:  # mode == 'dev'
                    answers = [a['text'] for a in answers]
                    rows.append((id_, context, question, answers))
    return rows


train = flatten_json(trn_file, 'train')

In [392]:
train[1]

('5733be284776f4190066117f',
 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'What is in front of the Notre Dame Main Building?',
 'a copper statue of Christ',
 188,
 213)

In [393]:
def clean_spaces(text):
    """Replace every whitespace character in ``text`` with a plain space.

    The substitution is one character for one character (runs of whitespace
    are not collapsed), so the result has the same length as the input.
    """
    whitespace = re.compile(r'\s')
    return whitespace.sub(' ', text)


def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFD."""
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed


nlp = None


def init():
    """Load the spaCy ``'en'`` pipeline into the module-level ``nlp``.

    Meant to be called once in each process.  The pipeline is loaded with
    ``parser=False`` (``disable=['parser']`` is the other spelling the
    original note mentions): no syntactic information is needed here, and
    per that note spaCy loads and runs much faster without the parser.
    """
    global nlp
    pipeline = spacy.load('en', parser=False)
    nlp = pipeline


def annotate(row, cased=True):
    """Tokenize one example and compute per-token features for its context.

    Uses the module-level ``nlp`` pipeline, which must be loaded beforehand.

    Args:
        row (tuple): ``(id, context, question, ...)``; everything after the
            first three items is appended unchanged to the result.
        cased (bool): when ``False`` the returned context and question tokens
            are the lowercased ones.  Defaults to ``True`` (original casing),
            which is what the previously hard-wired ``if not True:`` always
            produced.

    Returns:
        tuple: ``(id, context_tokens, context_features, context_tags,
        context_ents, question_tokens, context, context_token_span)`` followed
        by ``row[3:]``.  Each entry of ``context_features`` is
        ``(match_origin, match_lower, match_lemma, tf)`` for one context token.
    """
    id_, context, question = row[:3]
    q_doc = nlp(clean_spaces(question))
    c_doc = nlp(clean_spaces(context))
    question_tokens = [normalize_text(w.text) for w in q_doc]
    context_tokens = [normalize_text(w.text) for w in c_doc]
    question_tokens_lower = [w.lower() for w in question_tokens]
    context_tokens_lower = [w.lower() for w in context_tokens]
    # (start, end) character offsets of every context token
    context_token_span = [(w.idx, w.idx + len(w.text)) for w in c_doc]
    context_tags = [w.tag_ for w in c_doc]  # POS tagging
    context_ents = [w.ent_type_ for w in c_doc]  # NER tagging

    def lemma_of(w):
        # Tokens whose lemma is '-PRON-' (pronouns such as me/it/you) are
        # represented by their lowercased text; otherwise lemma_ : cats -> cat
        return w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()

    question_lemma = {lemma_of(w) for w in q_doc}
    question_tokens_set = set(question_tokens)
    question_tokens_lower_set = set(question_tokens_lower)
    # exact-match features referring to the paper
    match_origin = [w in question_tokens_set for w in context_tokens]
    match_lower = [w in question_tokens_lower_set for w in context_tokens_lower]
    match_lemma = [lemma_of(w) in question_lemma for w in c_doc]
    # term frequency in document
    counter_ = collections.Counter(context_tokens_lower)
    total = len(context_tokens_lower)
    context_tf = [counter_[w] / total for w in context_tokens_lower]
    context_features = list(zip(match_origin, match_lower, match_lemma, context_tf))
    if not cased:
        context_tokens = context_tokens_lower
        question_tokens = question_tokens_lower
    return (id_, context_tokens, context_features, context_tags, context_ents,
            question_tokens, context, context_token_span) + row[3:]


def index_answer(row):
    """Convert the character-level answer offsets of a row into token indices.

    ``row[-4]`` is read as the list of ``(start, end)`` character spans of the
    context tokens (``context_token_span``), and ``row[-2]`` / ``row[-1]`` as
    the answer's start / end character offsets.

    Returns:
        tuple: ``row[:-3]`` followed by the index of the token whose span
        starts at the answer start and the index of the token whose span ends
        at the answer end.  Both indices are ``None`` when either offset does
        not coincide with a token boundary, or when there are no token spans.
    """
    token_span = row[-4]  # context_token_span
    answer_start = row[-2]
    answer_end = row[-1]
    try:
        # Unpacking an empty span list raises ValueError as well, so it is
        # done inside the ``try`` and reported like any other answer that
        # cannot be aligned to tokens.
        starts, ends = zip(*token_span)
        return row[:-3] + (starts.index(answer_start), ends.index(answer_end))
    except ValueError:
        return row[:-3] + (None, None)

In [394]:
train[1]

('5733be284776f4190066117f',
 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'What is in front of the Notre Dame Main Building?',
 'a copper statue of Christ',
 188,
 213)

In [395]:
nlp = spacy.load('en', parser=False)
train_ann = annotate(train[1])

In [399]:
train_ann

('5733be284776f4190066117f',
 ['Architecturally',
  ',',
  'the',
  'school',
  'has',
  'a',
  'Catholic',
  'character',
  '.',
  'Atop',
  'the',
  'Main',
  'Building',
  "'s",
  'gold',
  'dome',
  'is',
  'a',
  'golden',
  'statue',
  'of',
  'the',
  'Virgin',
  'Mary',
  '.',
  'Immediately',
  'in',
  'front',
  'of',
  'the',
  'Main',
  'Building',
  'and',
  'facing',
  'it',
  ',',
  'is',
  'a',
  'copper',
  'statue',
  'of',
  'Christ',
  'with',
  'arms',
  'upraised',
  'with',
  'the',
  'legend',
  '"',
  'Venite',
  'Ad',
  'Me',
  'Omnes',
  '"',
  '.',
  'Next',
  'to',
  'the',
  'Main',
  'Building',
  'is',
  'the',
  'Basilica',
  'of',
  'the',
  'Sacred',
  'Heart',
  '.',
  'Immediately',
  'behind',
  'the',
  'basilica',
  'is',
  'the',
  'Grotto',
  ',',
  'a',
  'Marian',
  'place',
  'of',
  'prayer',
  'and',
  'reflection',
  '.',
  'It',
  'is',
  'a',
  'replica',
  'of',
  'the',
  'grotto',
  'at',
  'Lourdes',
  ',',
  'France',
  'where',
  

In [397]:
train = index_answer(train_ann)

In [137]:
# Collect the NFD-normalized first field (the token) of every line in the
# GloVe file.
wv_vocab = set()
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default text encoding.
with open('glove/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        token = normalize_text(line.rstrip().split(' ')[0])
        wv_vocab.add(token)

In [161]:
len(wv_vocab)

2195960

In [171]:
def build_vocab(questions, contexts, sort_all=True):
    """Build the word vocabulary from tokenized questions and contexts.

    Only tokens present in the module-level ``wv_vocab`` are kept.

    Args:
        questions (list): list of token lists.
        contexts (list): list of token lists.
        sort_all (bool): when ``True`` (default) the vocabulary is sorted by
            the combined question + context frequency.  When ``False`` the
            question tokens come first (sorted by their frequency in
            questions), followed by the tokens that occur only in contexts
            (sorted by combined frequency).

    Returns:
        tuple: ``(vocab, counter)`` where ``vocab`` is the token list with
        ``"<PAD>"`` at index 0 and ``"<UNK>"`` at index 1, and ``counter``
        holds the combined token counts (including tokens not in ``vocab``).
    """
    if sort_all:
        counter = collections.Counter(w for doc in questions + contexts for w in doc)
        vocab = sorted([t for t in counter if t in wv_vocab], key=counter.get, reverse=True)
    else:
        counter_q = collections.Counter(w for doc in questions for w in doc)
        counter_c = collections.Counter(w for doc in contexts for w in doc)
        counter = counter_c + counter_q
        vocab = sorted([t for t in counter_q if t in wv_vocab], key=counter_q.get, reverse=True)
        vocab += sorted([t for t in counter_c.keys() - counter_q.keys() if t in wv_vocab],
                        key=counter.get, reverse=True)
    vocab.insert(0, "<PAD>")  # in question_id and context_id, the 0 means padding
    vocab.insert(1, "<UNK>")
    return vocab, counter

In [231]:
counter = collections.Counter(w for doc in [row[5]] + [row[1]] for w in doc)

In [232]:
vocab = sorted([t for t in counter if t in wv_vocab], key=counter.get, reverse=True)

In [240]:
total = sum(counter.values())
matched = sum(counter[t] for t in vocab)
matched == total

True

In [209]:
row = train
full = train

In [359]:
# Build the word, POS-tag and entity-type vocabularies from this single
# annotated row, then map every entry to its position in the vocabulary.
vocab, counter = build_vocab([row[5]], [row[1]])
counter_tag = collections.Counter(row[3])  # context_tags
# most frequent first, so frequent tags get small ids
vocab_tag = [tag for tag, _ in counter_tag.most_common()]
counter_ent = collections.Counter(row[4])
vocab_ent = [ent for ent, _ in counter_ent.most_common()]
w2id = dict(zip(vocab, range(len(vocab))))
tag2id = dict(zip(vocab_tag, range(len(vocab_tag))))
ent2id = dict(zip(vocab_ent, range(len(vocab_ent))))

In [361]:
len(vocab)

87

In [362]:
def to_id(row, unk_id=1):
    """Replace the tokens, tags and entity labels of an annotated row by ids.

    Reads the module-level ``w2id``, ``tag2id`` and ``ent2id`` mappings.
    Words missing from ``w2id`` become ``unk_id``; tags and entity labels
    are looked up directly.

    Returns:
        tuple: ``(id, context_ids, context_features, tag_ids, ent_ids,
        question_ids)`` followed by ``row[6:]``.
    """
    (example_id, context_tokens, context_features,
     context_tags, context_ents, question_tokens) = row[:6]

    def word_ids(tokens):
        return [w2id.get(token, unk_id) for token in tokens]

    question_ids = word_ids(question_tokens)
    context_ids = word_ids(context_tokens)
    tag_ids = [tag2id[tag] for tag in context_tags]
    ent_ids = [ent2id[ent] for ent in context_ents]
    head = (example_id, context_ids, context_features, tag_ids, ent_ids, question_ids)
    return head + row[6:]

In [266]:
train_ann_id = to_id(train, unk_id=1)

In [None]:
# Fill one 300-d row per vocabulary entry from the GloVe file.  Several file
# tokens can map to the same vocabulary entry after normalize_text, so vectors
# are summed and divided by the number of contributing lines.
vocab_size = len(vocab)
embeddings = np.zeros((vocab_size, 300))
embed_counts = np.zeros(vocab_size)
embed_counts[:2] = 1  # PADDING & UNK
wv_file = 'glove/glove.840B.300d.txt'
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default text encoding.
with open(wv_file, encoding='utf-8') as f:
    for line in f:
        elems = line.rstrip().split(' ')
        token = normalize_text(elems[0])
        if token in w2id:
            word_id = w2id[token]
            embed_counts[word_id] += 1
            embeddings[word_id] += [float(v) for v in elems[1:]]
embeddings /= embed_counts.reshape((-1, 1))

In [275]:
print(len(embeddings))
print(len(embeddings[1]))

87
300


## Add character embeddings

In [315]:
def ngrams(sentence, n):
    """Slide a window of width ``n`` over ``sentence``.

    Returns:
        list: every contiguous length-``n`` slice of ``sentence``, ordered by
        starting position (empty when ``n`` exceeds ``len(sentence)``).
    """
    windows = []
    last_start = len(sentence) - n
    for start in range(last_start + 1):
        windows.append(sentence[start:start + n])
    return windows

In [349]:
from embeddings.embedding import Embedding
class CharEmbedding(Embedding):
    """Character n-gram embeddings stored through the ``Embedding`` base class.

    A word is embedded as the mean of the vectors of its known 2-, 3- and
    4-grams.  Storage and lookup (``initialize_db``, ``path``, ``clear``,
    ``lookup``, ``insert_batch``, ``len``) are the inherited helpers.
    """

    size = 874474  # the store is reloaded from the text file when it holds fewer entries
    d_emb = 100  # number of trailing float fields parsed from each n-gram line

    def __init__(self):
        self.db = self.initialize_db(self.path('char/kazuma.db'))
        # Rebuild the store from the text file if it looks incomplete.
        if len(self) < self.size:
            self.clear()
            self.load_word2emb()

    def emb(self, w, default='zero'):
        """Return the embedding of word ``w`` as a list of floats.

        The characters of ``w`` are framed by ``'#BEGIN#'`` / ``'#END#'``;
        each 2-, 3- and 4-gram is looked up under the key
        ``'<n>gram-<joined characters>'``.  The result is the float32 mean of
        the vectors found, or ``d_emb`` zeros when none is known.
        """
        assert default == 'zero', 'only zero default is supported for character embeddings'
        chars = ['#BEGIN#'] + list(w) + ['#END#']
        embs = np.zeros(self.d_emb, dtype=np.float32)
        match = {}
        for i in [2, 3, 4]:
            grams = ngrams(chars, i)
            for g in grams:
                g = '{}gram-{}'.format(i, ''.join(g))
                e = self.lookup(g)
                if e is not None:
                    match[g] = np.array(e, np.float32)
        if match:
            embs = sum(match.values()) / len(match)
        return embs.tolist()

    def load_word2emb(self, batch_size=1000):
        """Read ``char/charNgram.txt`` and insert its vectors in batches.

        Each line holds an n-gram key followed by ``d_emb`` floats; only the
        first occurrence of a key is inserted.  Entries are handed to
        ``insert_batch`` in groups of ``batch_size``.
        """
        seen = set()
        fin_name = 'char/charNgram.txt'
        # Decode explicitly as UTF-8 so reading does not depend on the
        # platform's default text encoding.
        with open(fin_name, 'r', encoding='utf-8') as ftxt:
            content = ftxt.read()
            lines = content.splitlines()
            batch = []
            for line in lines:
                elems = line.rstrip().split()
                # Use the class attribute: a bare ``d_emb`` would resolve to
                # an unrelated module-level name (or fail with NameError).
                vec = [float(n) for n in elems[-self.d_emb:]]
                word = ' '.join(elems[:-self.d_emb])
                if word in seen:
                    continue
                seen.add(word)
                batch.append((word, vec))
                if len(batch) == batch_size:
                    self.insert_batch(batch)
                    batch.clear()
            if batch:
                self.insert_batch(batch)
charembedding = CharEmbedding()

In [366]:
# Build one 100-d character-level row per vocabulary entry by asking
# charembedding for the embedding of each token.
vocab_size = len(vocab)
char_embeddings = np.zeros((vocab_size, 100))
char_embed_counts = np.zeros(vocab_size)
char_embed_counts[:2] = 1  # PADDING & UNK
# NOTE(review): "<PAD>" and "<UNK>" are themselves keys of w2id when vocab
# comes from build_vocab, so their counts go from 1 to 2 below and their rows
# end up as half the n-gram embedding of the literal marker strings rather
# than zeros - confirm this is intended.
for token in w2id:
    word_id = w2id[token]
    char_embed_counts[word_id] += 1
    char_embeddings[word_id] += charembedding.emb(token) 
# Every other id is visited exactly once, so this division only rescales rows 0 and 1.
char_embeddings /= char_embed_counts.reshape((-1, 1))    

In [445]:
len(char_embeddings)

87

In [446]:
len(embeddings)

87

In [452]:
glove_char_embedding = np.concatenate((embeddings, char_embeddings), axis=1)

In [454]:
len(glove_char_embedding[1])

400

In [455]:
aa = glove_char_embedding.tolist()

In [451]:
a = np.array([[1, 2, 1], [3, 4, 1]])
b = np.array([[5, 6], [1,2]])
np.concatenate((a, b), axis=1)

array([[1, 2, 1, 5, 6],
       [3, 4, 1, 1, 2]])