## Example for DRQA data processing

In [1]:
import re
import json
import spacy
import msgpack
import unicodedata
import numpy as np
import argparse
import collections
import multiprocessing
from multiprocessing import Pool
from tqdm import tqdm
import logging


In [2]:
import re
import spacy
import tqdm
def clean_spaces(text):
    """Replace every single whitespace character of ``text`` with a plain space.

    Runs of whitespace are not collapsed: each character is replaced one-for-one,
    so the length of the string does not change.
    """
    return re.sub(r'\s', ' ', text)

In [3]:
def iob_np_tag(tag_list):
    '''
    Derive IOB-style noun-phrase tags from a sequence of POS tags.

    A tag counts as nominal when it contains the substring 'NN'. The first tag of a
    run of consecutive nominal tags becomes 'b_np', the following ones 'i_np', and
    every other position is 'o_np'.

    @in: a list of POS tags
    @out: iob_np tags (a list of the same length as the input)
    '''
    iob_np = []
    # Track the previous position explicitly instead of peeking at iob_np[i-1],
    # which silently wrapped around to the last element when i == 0.
    inside_np = False
    for tag in tag_list:
        if 'NN' in tag:
            iob_np.append('i_np' if inside_np else 'b_np')
            inside_np = True
        else:
            iob_np.append('o_np')
            inside_np = False
    return iob_np

def iob_ner_tag(tag_list):
    '''
    Derive IOB-style entity tags from a sequence of NER tags.

    A position belongs to an entity when its tag is non-empty. The first position of
    a run of consecutive entity positions becomes 'b_ner', the following ones 'i_ner',
    and every other position is 'o_ner'.

    NOTE: only emptiness is checked, so two adjacent entities of different types
    (e.g. 'ORG' directly followed by 'DATE') are tagged as one run.

    @in: a list of ner tags ('' for tokens outside any entity)
    @out: iob_ner tags (a list of the same length as the input)
    '''
    iob_ner = []
    # Track the previous position explicitly instead of peeking at iob_ner[i-1],
    # which silently wrapped around to the last element when i == 0.
    inside_ner = False
    for tag in tag_list:
        if len(tag) != 0:
            iob_ner.append('i_ner' if inside_ner else 'b_ner')
            inside_ner = True
        else:
            iob_ner.append('o_ner')
            inside_ner = False
    return iob_ner

In [4]:
def normalize_text(text):
    """Return ``text`` in Unicode normalization form NFD."""
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

In [5]:
# Load the spaCy 'en' model into the notebook-wide `nlp` object used by later cells.
# (The former `global nlp` line was dropped: at module level the statement is a no-op.)
nlp = spacy.load('en', parser=False)

In [6]:
c_doc = nlp('Microsoft Corporation is a technology company founded in 1975. This corporation develops computer software.')
[w.tag_ for w in c_doc]

['NNP',
 'NNP',
 'VBZ',
 'DT',
 'NN',
 'NN',
 'VBN',
 'IN',
 'CD',
 '.',
 'DT',
 'NN',
 'VBZ',
 'NN',
 'NN',
 '.']

In [43]:
[w.ent_type_ for w in c_doc]

['ORG', 'ORG', '', '', '', '', '', '', 'DATE', '', '', '', '', '', '', '']

import unicodedata
def normalize_text(text):
    """Return ``text`` in Unicode normalization form NFD."""
    nfd_text = unicodedata.normalize('NFD', text)
    return nfd_text

In [25]:
context = 'Architecturally, the building has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
context

'Architecturally, the building has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [26]:
question = 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [27]:
import collections
# Worked example: build the per-token features for one (question, context) pair.
# Run both texts through spaCy after replacing whitespace characters with spaces.
q_doc = nlp(clean_spaces(question))
c_doc = nlp(clean_spaces(context))
# NFD-normalized token texts, plus lower-cased copies for case-insensitive matching.
question_tokens = [normalize_text(w.text) for w in q_doc]
context_tokens = [normalize_text(w.text) for w in c_doc]
question_tokens_lower = [w.lower() for w in question_tokens]
context_tokens_lower = [w.lower() for w in context_tokens]
context_token_span = [(w.idx, w.idx + len(w.text)) for w in c_doc] # (start, end) character offsets of each token
context_tags = [w.tag_ for w in c_doc] # POS tag of each context token
context_ents = [w.ent_type_ for w in c_doc] # NER type of each context token ('' when none)
context_iob_np = iob_np_tag(context_tags)
context_iob_ner = iob_ner_tag(context_ents)

# Set of question lemmas; when the lemma is the '-PRON-' placeholder the
# lower-cased token text is used instead.
question_lemma = {w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower() for w in q_doc}
# '-PRON-' is the lemma reported for pronouns such as me/it/you
# lemma_ example: cats -> cat

# Three boolean match features per context token: does the token occur in the
# question verbatim, lower-cased, or by lemma?
question_tokens_set = set(question_tokens)
question_tokens_lower_set = set(question_tokens_lower)
match_origin = [w in question_tokens_set for w in context_tokens]
match_lower = [w in question_tokens_lower_set for w in context_tokens_lower]
match_lemma = [(w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()) in question_lemma for w in c_doc]
# Term frequency: count of the lower-cased token divided by the number of context tokens.
counter_ = collections.Counter(context_tokens_lower)
total = len(context_tokens_lower)
context_tf = [counter_[w] / total for w in context_tokens_lower]
# One (match_origin, match_lower, match_lemma, tf) tuple per context token.
context_features = list(zip(match_origin, match_lower, match_lemma, context_tf))

Catholic Main Building Main Building Christ Venite Ad Me Omnes Main Building Basilica Sacred Heart Grotto Marian Lourdes France Mary Saint Bernadette Soubirous 1858 end main drive 3 Dome Mary

In [133]:
#ner_context_lemma + ner_context

In [134]:
# Entity tokens listed here are not used to seed the entity word list.
stop_words = ['a', 'an', 'the', 'of', 'for', '\'s', ]
def part_ner_tag(tag_list, context_list):
    '''
    Tag every context token that is an entity token, or that equals an entity token
    or the lemma of one, with 'i_ner'; all other tokens get 'o_ner'.

    Relies on the module-level spaCy pipeline `nlp` and on `stop_words`.

    @in: tag_list     - a list of ner tags ('' for tokens outside any entity)
         context_list - the context tokens, aligned with tag_list
    @out: (part_ner, ner_context_all)
          part_ner        - part of ner tags, one per token
          ner_context_all - lemmas of the collected entity tokens followed by the
                            entity tokens themselves
    '''
    ner_context = []
    part_ner = ['o_ner'] * len(tag_list)
    # Pass 1: collect entity tokens that are not stop words.
    for i, tag in enumerate(tag_list):
        if len(tag) != 0 and context_list[i] not in stop_words:
            part_ner[i] = 'i_ner'
            ner_context.append(context_list[i])

    # combine lemma to ner_context list
    ner_context_ = nlp(' '.join(ner_context))
    ner_context_lemma = [w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower() for w in ner_context_]
    ner_context_all = ner_context_lemma + ner_context

    # Pass 2: mark every token that matches a collected word or lemma. A set keeps
    # this linear in the number of tokens; the returned list is left untouched.
    ner_vocab = set(ner_context_all)
    for j, token in enumerate(context_list):
        if token in ner_vocab:
            part_ner[j] = 'i_ner'
    return part_ner, ner_context_all

In [137]:
part_ner ,ner_context_all= part_ner_tag(context_ents, context_tokens)

In [138]:
import pandas as pd
df = pd.DataFrame(np.column_stack([context_ents, context_tokens, part_ner]))
print (df.to_string())

               0                1      2
0                 Architecturally  o_ner
1                               ,  o_ner
2                             the  o_ner
3                        building  i_ner
4                             has  o_ner
5                               a  o_ner
6           NORP         Catholic  i_ner
7                       character  o_ner
8                               .  o_ner
9                            Atop  o_ner
10           FAC              the  o_ner
11           FAC             Main  i_ner
12           FAC         Building  i_ner
13           FAC               's  o_ner
14                           gold  o_ner
15                           dome  i_ner
16                             is  o_ner
17                              a  o_ner
18                         golden  o_ner
19                         statue  o_ner
20                             of  o_ner
21                            the  o_ner
22                         Virgin  o_ner
23              

In [105]:
# NOTE: the second positional argument of pd.DataFrame is `index`, so the IOB tags
# become the row labels and the raw entity types form the single column.
df = pd.DataFrame(context_ents, context_iob_ner)
df

Unnamed: 0,0
o_ner,
o_ner,
o_ner,
o_ner,
o_ner,
o_ner,
b_ner,NORP
o_ner,
o_ner,
o_ner,


In [9]:
def index_answer(row):
    """Convert the character-level answer span of ``row`` into token indices.

    ``row`` is a tuple whose last four items are read as
    ``(token_span, <unused>, answer_start, answer_end)``, where ``token_span`` is a
    sequence of ``(start, end)`` character offsets, one per token.

    Returns ``row[:-3]`` extended with the index of the token that starts at
    ``answer_start`` and the index of the token that ends at ``answer_end``.
    Both are ``None`` when either offset does not fall on a token boundary, or
    when ``token_span`` is empty.
    """
    token_span = row[-4]
    answer_start, answer_end = row[-2], row[-1]
    try:
        # Unpacking an empty span list raises ValueError too; keeping it inside the
        # try makes that case yield (None, None) instead of crashing.
        starts, ends = zip(*token_span)
        return row[:-3] + (starts.index(answer_start), ends.index(answer_end))
    except ValueError:
        return row[:-3] + (None, None)

In [10]:
token_span = context_token_span

In [11]:
starts, ends = zip(*token_span)

In [12]:
answer = 'Saint Bernadette Soubirous'
answer_start = 515
answer_end = answer_start + len(answer)

In [13]:
starts.index(answer_start)

102

In [14]:
ends.index(answer_end)

104

In [15]:
# pos tagging count for context
counter_tag = collections.Counter(w for w in context_tags) #context_tags

In [16]:
counter_tag

Counter({"''": 1,
         ',': 6,
         '-LRB-': 1,
         '-RRB-': 1,
         '.': 7,
         'CC': 4,
         'CD': 2,
         'DT': 22,
         'IN': 20,
         'JJ': 7,
         'NN': 20,
         'NNP': 27,
         'NNS': 2,
         'POS': 1,
         'PRP': 3,
         'RB': 5,
         'VBD': 1,
         'VBG': 1,
         'VBZ': 8,
         'WDT': 1,
         'WRB': 1,
         '``': 1})

In [17]:
vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True)

In [18]:
tag2id = {w: i for i, w in enumerate(vocab_tag)}

In [19]:
# largest count with small index number
tag2id

{"''": 16,
 ',': 7,
 '-LRB-': 19,
 '-RRB-': 21,
 '.': 6,
 'CC': 9,
 'CD': 12,
 'DT': 1,
 'IN': 3,
 'JJ': 5,
 'NN': 2,
 'NNP': 0,
 'NNS': 11,
 'POS': 13,
 'PRP': 10,
 'RB': 8,
 'VBD': 18,
 'VBG': 14,
 'VBZ': 4,
 'WDT': 20,
 'WRB': 17,
 '``': 15}

In [143]:
import msgpack
# Load `meta` from SQuAD/meta.msgpack; later cells read keys such as 'vocab',
# 'vocab_tag', 'vocab_ent' and 'char_embedding' from it.
# NOTE(review): the `encoding` keyword was deprecated and then removed in msgpack 1.0
# (replaced by `raw=False`) -- confirm the installed msgpack version before re-running.
with open('SQuAD/meta.msgpack', 'rb') as f:
    meta = msgpack.load(f, encoding='utf8')

In [147]:
import torch
embedding = torch.Tensor(meta['char_embedding'])

In [148]:
embedding.size()

torch.Size([87603, 100])

In [152]:
meta['vocab_part_ner']

['o_ner', 'i_ner']

In [26]:
# number of pos tag given by spacy 
len(meta['vocab_tag'])

50

In [27]:
# number of NER tag given by spacy
len(meta['vocab_ent'])

19

In [190]:
# pos tag
#meta['vocab_tag']

In [191]:
#meta['vocab_ent']

In [33]:
# context
data['train'][0][6]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [194]:
# for example, the context ids
#data['train'][0][6]

In [193]:
# for example pos tag_id
import pandas as pd
df = pd.DataFrame(data['train'][0][3], data['train'][0][5])
#df

In [36]:
meta['vocab'][53946]

'Architecturally'

In [37]:
meta['vocab_tag'][10]

'RB'

In [192]:
#meta['embedding'][10]

In [422]:
 len(meta['vocab']) == len(meta['char_embeddings']) ==91187

True

### Character level embedding

In [None]:
pwd

In [None]:
# Read the character n-gram embedding file into memory, one string per line.
size = 874474  # NOTE(review): presumably the expected number of n-gram entries -- not used in this cell
d_emb = 100  # number of trailing fields on each line that form the vector
seen = set()  # n-grams already collected by the parsing loop
fin_name = 'char/charNgram.txt'
with open(fin_name, 'r') as ftxt:
    # The whole file is read at once and then split into lines.
    content = ftxt.read()
    lines = content.splitlines()
    batch = []  # filled with (n-gram, vector) pairs by the parsing loop

In [None]:
# Parse every line into an (n-gram, vector) pair, keeping only the first
# occurrence of each n-gram.
for line in lines:
    elems = line.rstrip().split()
    # The last d_emb fields are the vector; whatever precedes them is the n-gram,
    # re-joined with single spaces in case it was split on whitespace.
    vec = [float(n) for n in elems[-d_emb:]]
    word = ' '.join(elems[:-d_emb])
    if word in seen:
        continue
    seen.add(word)
    batch.append((word, vec))

In [None]:
def ngrams(sentence, n):
    """
    Slide a window of width ``n`` over ``sentence``.

    Returns:
        list: every contiguous slice of length ``n``, in order; empty when the
        sequence is shorter than ``n``.
    """
    window_count = len(sentence) - n + 1
    return [sentence[start:start + n] for start in range(window_count)]

In [None]:
def emb(w, default='zero', lookup=None):
    """
    Average the vectors of the known character 2-, 3- and 4-grams of the word ``w``.

    The word is wrapped in '#BEGIN#' / '#END#' markers before the n-grams are taken,
    and each n-gram is looked up under the key '<n>gram-<joined chars>'.

    Args:
        w: the word to embed.
        default: only 'zero' is supported; a word with no known n-gram yields the
            zero vector of length ``d_emb``.
        lookup: callable taking an n-gram key and returning its vector, or None when
            the key is unknown. When omitted, a lookup is built from the
            (n-gram, vector) pairs in the module-level ``batch`` list.

    Returns:
        list: the averaged vector as a list of floats.
    """
    assert default == 'zero', 'only zero default is supported for character embeddings'
    if lookup is None:
        # The body used to call `self.lookup`, but this is a plain function, so every
        # call failed with NameError. Building the dict scans all of `batch`; pass
        # `lookup` explicitly when embedding many words.
        lookup = dict(batch).get
    chars = ['#BEGIN#'] + list(w) + ['#END#']
    embs = np.zeros(d_emb, dtype=np.float32)
    match = {}
    for i in [2, 3, 4]:
        for g in ngrams(chars, i):
            key = '{}gram-{}'.format(i, ''.join(g))
            e = lookup(key)
            if e is not None:
                # Keyed by n-gram, so a repeated n-gram is counted once.
                match[key] = np.array(e, np.float32)
    if match:
        embs = sum(match.values()) / len(match)
    return embs.tolist()

In [None]:
chars = ['#BEGIN#'] + list('cat') + ['#END#']

In [None]:
chars

In [None]:
embs = np.zeros(d_emb, dtype=np.float32)
embs

In [None]:
# Print the lookup key of every 2-, 3- and 4-gram of `chars`. The embedding lookup
# itself is commented out, so `match` stays empty in this cell.
match = {}
for i in [2, 3, 4]:
    grams = ngrams(chars, i)
    for g in grams:
        g = '{}gram-{}'.format(i, ''.join(g))
        print(g)
        #e = lookup(g)
        #if e is not None:
        #    match[g] = np.array(e, np.float32)

# prepro.py

In [171]:
trn_file = 'SQuAD/train-v1.1.json'
import json

def flatten_json(data_file, mode):
    """Flatten every article of a SQuAD-style JSON file into one row per question.

    Args:
        data_file: path of a JSON file whose top-level 'data' list holds articles,
            each with 'paragraphs' that carry a 'context' and a list of 'qas'.
        mode: 'train' yields rows with a single answer and its character span;
            any other value is treated as 'dev' and yields all answer texts.

    Returns:
        list of tuples:
            train: (id, context, question, answer, answer_start, answer_end)
            dev:   (id, context, question, [answer_text, ...])
    """
    # JSON text is UTF-8; name the encoding so non-ASCII contexts decode the same
    # way regardless of the platform's default locale.
    with open(data_file, encoding='utf-8') as f:
        data = json.load(f)['data']
    rows = []
    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                id_, question, answers = qa['id'], qa['question'], qa['answers']
                if mode == 'train':
                    answer = answers[0]['text']  # in training data there's only one answer
                    answer_start = answers[0]['answer_start']  # character offset into context
                    answer_end = answer_start + len(answer)  # exclusive character offset
                    rows.append((id_, context, question, answer, answer_start, answer_end))
                else:  # mode == 'dev'
                    answers = [a['text'] for a in answers]
                    rows.append((id_, context, question, answers))
    return rows


train = flatten_json(trn_file, 'train')

In [172]:
train[8]

('5733bf84d058e614000b61bd',
 "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to 

In [137]:
# Collect the NFD-normalized first field (the token) of every line of the GloVe file.
wv_vocab = set()
# GloVe vectors are distributed as UTF-8 text; name the encoding so the read does
# not depend on the platform's default locale.
with open('glove/glove.840B.300d.txt', encoding='utf8') as f:
    for line in f:
        token = normalize_text(line.rstrip().split(' ')[0])
        wv_vocab.add(token)

In [161]:
len(wv_vocab)

2195960

In [171]:
def build_vocab(questions, contexts, sort_all=True):
    """
    Build the vocabulary of tokens that also occur in the module-level ``wv_vocab``.

    Args:
        questions: list of tokenized questions (each a list of tokens).
        contexts: list of tokenized contexts (each a list of tokens).
        sort_all: when True (the default, and the previously hard-coded behaviour)
            tokens are sorted by their frequency over questions and contexts
            together; when False, question tokens come first sorted by question
            frequency, followed by the context-only tokens sorted by overall
            frequency.

    Returns:
        (vocab, counter): ``vocab`` is the token list with "<PAD>" at index 0 and
        "<UNK>" at index 1; ``counter`` is the overall token frequency Counter.
    """
    if sort_all:
        counter = collections.Counter(w for doc in questions + contexts for w in doc)
        vocab = sorted([t for t in counter if t in wv_vocab], key=counter.get, reverse=True)
    else:
        counter_q = collections.Counter(w for doc in questions for w in doc)
        counter_c = collections.Counter(w for doc in contexts for w in doc)
        counter = counter_c + counter_q
        vocab = sorted([t for t in counter_q if t in wv_vocab], key=counter_q.get, reverse=True)
        vocab += sorted([t for t in counter_c.keys() - counter_q.keys() if t in wv_vocab],
                        key=counter.get, reverse=True)
    vocab.insert(0, "<PAD>")  # in question_id and context_id, the 0 means padding
    vocab.insert(1, "<UNK>")
    return vocab, counter

In [231]:
counter = collections.Counter(w for doc in [row[5]] + [row[1]] for w in doc)

In [232]:
vocab = sorted([t for t in counter if t in wv_vocab], key=counter.get, reverse=True)

In [240]:
total = sum(counter.values())
matched = sum(counter[t] for t in vocab)
matched == total

True

In [209]:
row = train
full = train

In [359]:
# Build the word / POS-tag / entity vocabularies and their item -> id maps.
# NOTE(review): the indices used here (1 = context tokens, 3 = POS tags, 4 = entity
# tags, 5 = question tokens) match the tuple returned by `annotate`, but the only
# visible assignment is `row = train`; presumably `row` was re-bound to one annotated
# row in a cell that is no longer in the notebook -- verify before re-running.
vocab, counter = build_vocab([row[5]], [row[1]])
counter_tag = collections.Counter(w for w in row[3]) # frequency of each context POS tag
vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True) # most frequent tag first
counter_ent = collections.Counter(w for w in row[4])
vocab_ent = sorted(counter_ent, key=counter_ent.get, reverse=True)
w2id = {w: i for i, w in enumerate(vocab)}
tag2id = {w: i for i, w in enumerate(vocab_tag)} # the more frequent the tag, the smaller its id
ent2id = {w: i for i, w in enumerate(vocab_ent)}

In [361]:
len(vocab)

87

In [362]:
def to_id(row, unk_id=1):
    """Replace the tokens and tags of an annotated ``row`` by their integer ids.

    Words are mapped through the module-level ``w2id`` (falling back to ``unk_id``
    for unknown words); POS tags and entity tags are mapped through ``tag2id`` and
    ``ent2id``. Item 0, the context features (item 2) and everything from item 6
    onwards are passed through unchanged.
    """
    def _word_ids(tokens):
        return [w2id.get(token, unk_id) for token in tokens]

    # Same evaluation order as before: question, context, tags, entities.
    question_ids = _word_ids(row[5])
    context_ids = _word_ids(row[1])
    tag_ids = [tag2id[tag] for tag in row[3]]
    ent_ids = [ent2id[ent] for ent in row[4]]
    head = (row[0], context_ids, row[2], tag_ids, ent_ids, question_ids)
    return head + row[6:]

In [266]:
train_ann_id = to_id(train, unk_id=1)

In [None]:
# Build the (vocab_size, 300) GloVe matrix aligned with `vocab` / `w2id`.
vocab_size = len(vocab)
embeddings = np.zeros((vocab_size, 300))
embed_counts = np.zeros(vocab_size)
embed_counts[:2] = 1  # PADDING & UNK: rows stay zero, a count of 1 avoids dividing by zero
wv_file = 'glove/glove.840B.300d.txt'
# GloVe vectors are distributed as UTF-8 text; name the encoding so the read does
# not depend on the platform's default locale.
with open(wv_file, encoding='utf8') as f:
    for line in f:
        elems = line.rstrip().split(' ')
        token = normalize_text(elems[0])
        if token in w2id:
            word_id = w2id[token]
            # Vectors of file tokens that normalize to the same vocabulary entry
            # are summed here and averaged by the division below.
            embed_counts[word_id] += 1
            embeddings[word_id] += [float(v) for v in elems[1:]]
embeddings /= embed_counts.reshape((-1, 1))

In [275]:
print(len(embeddings))
print(len(embeddings[1]))

87
300


## Add character-level embeddings

In [315]:
def ngrams(sentence, n):
    """
    Slide a window of width ``n`` over ``sentence``.

    Returns:
        list: every contiguous slice of length ``n``, in order; empty when the
        sequence is shorter than ``n``.
    """
    last_start = len(sentence) - n
    return [sentence[begin:begin + n] for begin in range(last_start + 1)]

In [349]:
from embeddings.embedding import Embedding
class CharEmbedding(Embedding):
    """
    Character n-gram word embeddings stored through the ``Embedding`` base class.

    A word vector is the average of the vectors of its known character 2-, 3- and
    4-grams. NOTE(review): ``initialize_db``, ``path``, ``clear``, ``lookup``,
    ``insert_batch`` and ``__len__`` are presumably inherited from ``Embedding`` --
    they are not defined in this notebook.
    """

    size = 874474  # the table is (re)loaded when it holds fewer entries than this
    d_emb = 100    # number of trailing fields on each file line that form the vector

    def __init__(self):
        """Open the database and fill it from the text file when it is incomplete."""
        self.db = self.initialize_db(self.path('char/kazuma.db'))
        if len(self) < self.size:
            self.clear()
            self.load_word2emb()

    def emb(self, w, default='zero'):
        """
        Return the embedding of the word ``w`` as a list of floats.

        The word is wrapped in '#BEGIN#' / '#END#' markers, every 2-, 3- and 4-gram
        is looked up under the key '<n>gram-<joined chars>', and the vectors that
        were found are averaged. A word with no known n-gram yields the zero vector
        of length ``d_emb``.
        """
        assert default == 'zero', 'only zero default is supported for character embeddings'
        chars = ['#BEGIN#'] + list(w) + ['#END#']
        embs = np.zeros(self.d_emb, dtype=np.float32)
        match = {}
        for i in [2, 3, 4]:
            grams = ngrams(chars, i)
            for g in grams:
                g = '{}gram-{}'.format(i, ''.join(g))
                e = self.lookup(g)
                if e is not None:
                    # Keyed by n-gram, so a repeated n-gram is counted once.
                    match[g] = np.array(e, np.float32)
        if match:
            embs = sum(match.values()) / len(match)
        return embs.tolist()

    def load_word2emb(self, batch_size=1000):
        """
        Parse 'char/charNgram.txt' and insert its (n-gram, vector) pairs in batches
        of ``batch_size``, keeping only the first occurrence of each n-gram.
        """
        seen = set()
        fin_name = 'char/charNgram.txt'
        # Name the encoding so non-ASCII n-grams decode the same way on every platform.
        with open(fin_name, 'r', encoding='utf8') as ftxt:
            content = ftxt.read()
            lines = content.splitlines()
            batch = []
            for line in lines:
                elems = line.rstrip().split()
                # Use the class attribute: the bare name `d_emb` only resolved through
                # a global left over from an earlier notebook cell.
                vec = [float(n) for n in elems[-self.d_emb:]]
                word = ' '.join(elems[:-self.d_emb])
                if word in seen:
                    continue
                seen.add(word)
                batch.append((word, vec))
                if len(batch) == batch_size:
                    self.insert_batch(batch)
                    batch.clear()
            # Flush the last, partially filled batch.
            if batch:
                self.insert_batch(batch)
charembedding = CharEmbedding()

In [366]:
# Build the (vocab_size, 100) character-embedding matrix aligned with `vocab` / `w2id`.
vocab_size = len(vocab)
char_embeddings = np.zeros((vocab_size, 100))
char_embed_counts = np.zeros(vocab_size)
char_embed_counts[:2] = 1  # PADDING & UNK
for token in w2id:
    word_id = w2id[token]
    if word_id < 2:
        # "<PAD>" and "<UNK>" are entries of w2id too. Previously they were embedded
        # as literal strings and then divided by a count of 2; keep their rows at
        # zero instead, matching the GloVe matrix.
        continue
    char_embed_counts[word_id] += 1
    char_embeddings[word_id] += charembedding.emb(token)
char_embeddings /= char_embed_counts.reshape((-1, 1))

In [445]:
len(char_embeddings)

87

In [446]:
len(embeddings)

87

In [452]:
glove_char_embedding = np.concatenate((embeddings, char_embeddings), axis=1)

In [454]:
len(glove_char_embedding[1])

400

In [455]:
aa = glove_char_embedding.tolist()

In [139]:
a = np.array([[1, 2, 1], [3, 4, 1]])
b = np.array([[5, 6], [1,2]])
np.concatenate((a, b), axis=1)

array([[1, 2, 1, 5, 6],
       [3, 4, 1, 1, 2]])

In [142]:
a

array([[1, 2, 1],
       [3, 4, 1]])

### Test the newly added features

In [200]:
def clean_spaces(text):
    """Replace every single whitespace character of ``text`` with a plain space.

    Each whitespace character is replaced one-for-one, so runs are not collapsed.
    """
    spaced = re.sub(r'\s', ' ', text)
    return spaced

def normalize_text(text):
    """Return ``text`` in Unicode normalization form NFD."""
    normal_form = 'NFD'
    return unicodedata.normalize(normal_form, text)

nlp = None

def init():
    """Initialize spaCy in each process by loading 'en' into the global ``nlp``.

    Author's notes, kept from the original:
      * 'en': noun chunks are "base noun phrases" - flat phrases that have a noun
        as their head.
      * parser=False or disable=['parser']: none of the syntactic information is
        needed, and leaving it out makes spaCy load and run much faster.
    """
    global nlp
    nlp = spacy.load('en', parser=False)

def annotate(row, wv_cased=True):
    """Tokenize one flattened example with spaCy and compute its per-token features.

    Uses the module-level spaCy pipeline ``nlp``.

    Args:
        row: tuple starting with ``(id_, context, question)``; any further items
            are appended unchanged to the result.
        wv_cased: when True (the default, and the previously hard-coded behaviour)
            the returned tokens keep their case; when False the lower-cased tokens
            are returned instead.

    Returns:
        tuple: ``(id_, context_tokens, context_features, context_tags, context_ents,
        question_tokens, context, context_token_span) + row[3:]`` where
        ``context_features`` holds one ``(match_origin, match_lower, match_lemma, tf)``
        tuple per context token.
    """
    def _lemma(w):
        # '-PRON-' is the placeholder lemma reported for pronouns (me/it/you);
        # fall back to the lower-cased text. lemma_ example: cats -> cat
        return w.lemma_ if w.lemma_ != '-PRON-' else w.text.lower()

    id_, context, question = row[:3]
    q_doc = nlp(clean_spaces(question))
    c_doc = nlp(clean_spaces(context))
    question_tokens = [normalize_text(w.text) for w in q_doc]
    context_tokens = [normalize_text(w.text) for w in c_doc]
    question_tokens_lower = [w.lower() for w in question_tokens]
    context_tokens_lower = [w.lower() for w in context_tokens]
    # (start, end) character offsets of each context token
    context_token_span = [(w.idx, w.idx + len(w.text)) for w in c_doc]
    context_tags = [w.tag_ for w in c_doc]  # POS tagging
    context_ents = [w.ent_type_ for w in c_doc]  # NER tagging

    question_lemma = {_lemma(w) for w in q_doc}

    # exact match features: verbatim, lower-cased and lemma match against the question
    question_tokens_set = set(question_tokens)
    question_tokens_lower_set = set(question_tokens_lower)
    match_origin = [w in question_tokens_set for w in context_tokens]
    match_lower = [w in question_tokens_lower_set for w in context_tokens_lower]
    match_lemma = [_lemma(w) in question_lemma for w in c_doc]
    # term frequency in document
    counter_ = collections.Counter(context_tokens_lower)
    total = len(context_tokens_lower)
    context_tf = [counter_[w] / total for w in context_tokens_lower]
    context_features = list(zip(match_origin, match_lower, match_lemma, context_tf))
    if not wv_cased:
        context_tokens = context_tokens_lower
        question_tokens = question_tokens_lower
    return (id_, context_tokens, context_features, context_tags, context_ents,
            question_tokens, context, context_token_span) + row[3:]
def index_answer(row):
    """Convert the character-level answer span of ``row`` into token indices.

    ``row`` is a tuple whose last four items are read as
    ``(context_token_span, <unused>, answer_start, answer_end)``, where
    ``context_token_span`` is a sequence of ``(start, end)`` character offsets,
    one per token.

    Returns ``row[:-3]`` extended with the index of the token that starts at
    ``answer_start`` and the index of the token that ends at ``answer_end``.
    Both are ``None`` when either offset does not fall on a token boundary, or
    when the span list is empty.
    """
    token_span = row[-4]  # context_token_span
    answer_start, answer_end = row[-2], row[-1]
    try:
        # Unpacking an empty span list raises ValueError too; keeping it inside the
        # try makes that case yield (None, None) instead of crashing.
        starts, ends = zip(*token_span)
        return row[:-3] + (starts.index(answer_start), ends.index(answer_end))
    except ValueError:
        return row[:-3] + (None, None)
nlp = spacy.load('en', parser=False)
train_ann = annotate(train[8])

In [257]:
train_ann[5]

['How',
 'many',
 'student',
 'news',
 'papers',
 'are',
 'found',
 'at',
 'Notre',
 'Dame',
 '?']

In [258]:
# Load `data` from SQuAD/data.msgpack; the following cells index it as data['train'][i][j].
# NOTE(review): the `encoding` keyword was deprecated and then removed in msgpack 1.0
# (replaced by `raw=False`) -- confirm the installed msgpack version before re-running.
with open('SQuAD/data.msgpack', 'rb') as f:
    data = msgpack.load(f, encoding='utf8')

In [259]:
# train: id, context_id, context_features, tag_id, ent_id, iob_np, iob_ner, part_ner, q tag_id, q ent_id, q iob_np, q iob_ner,
#        question_id, context, context_token_span, answer_start, answer_end
import pandas as pd
df = pd.DataFrame(np.column_stack([data['train'][8][6],data['train'][8][7], train_ann[1]]))
print(df.to_string())

     0  1              2
0    0  0             As
1    0  0             at
2    0  0           most
3    0  0          other
4    0  0   universities
5    0  0              ,
6    1  1          Notre
7    2  1           Dame
8    2  0             's
9    0  0       students
10   0  0            run
11   0  0              a
12   0  0         number
13   0  0             of
14   0  0           news
15   0  0          media
16   0  0        outlets
17   0  0              .
18   0  0            The
19   1  1           nine
20   0  0        student
21   0  0              -
22   0  0            run
23   0  0        outlets
24   0  0        include
25   1  1          three
26   0  0     newspapers
27   0  0              ,
28   0  0           both
29   0  0              a
30   0  0          radio
31   0  0            and
32   0  0     television
33   0  0        station
34   0  0              ,
35   0  0            and
36   0  0        several
37   0  0      magazines
38   0  0            and


In [260]:
import pandas as pd
df = pd.DataFrame(np.column_stack([data['train'][8][8],data['train'][8][9], data['train'][8][10],data['train'][8][11],train_ann[5]]))
print(df.to_string())

     0  1  2  3        4
0   12  0  0  0      How
1    5  0  0  0     many
2    0  0  1  0  student
3    0  0  2  0     news
4    8  0  2  0   papers
5   13  0  0  0      are
6   10  0  0  0    found
7    1  0  0  0       at
8    2  2  1  1    Notre
9    2  2  2  2     Dame
10   3  0  0  0        ?


In [254]:
# Re-load `meta` from SQuAD/meta.msgpack (the next cell reads meta['vocab_q_tag']).
# NOTE(review): the `encoding` keyword was deprecated and then removed in msgpack 1.0
# (replaced by `raw=False`) -- confirm the installed msgpack version before re-running.
with open('SQuAD/meta.msgpack', 'rb') as f:
    meta = msgpack.load(f, encoding='utf8')

In [256]:
len(meta['vocab_q_tag'])

50

In [278]:
a = torch.randn(4,4)
ee = torch.randn(4,6)
b = torch.randn(4,5)
c = torch.randn(4,2)

In [279]:
list_ = [a,ee]
list_.append(b)
list_.append(c)

In [280]:
list_

[
 -1.2702 -1.8569  0.4600  0.1064
  0.4058  0.1211 -1.3250  0.2720
  0.2532  1.7568 -0.8371  0.2591
 -0.4002  1.0083 -0.9354 -0.4041
 [torch.FloatTensor of size 4x4], 
 -0.9446 -0.4591  1.5962  1.0960 -0.4257  0.3042
 -0.7830 -0.7529  0.6747  0.9685 -1.6344 -0.3820
 -0.5750 -0.3853 -1.4120  2.1387  0.3732  0.9374
 -2.4007  1.2158 -0.8062 -0.1280 -0.6776 -0.5533
 [torch.FloatTensor of size 4x6], 
 -0.1084  1.8311 -1.9461 -0.3505 -0.5678
 -0.3808 -1.8617  1.3735  0.9692  0.3530
  0.2152 -0.6362  0.4172 -1.3567  1.0117
 -0.8054  0.6637  0.5527  0.5326 -1.5344
 [torch.FloatTensor of size 4x5], 
 -0.9352 -0.1562
 -0.6677 -0.1769
  0.5419 -1.4153
 -0.6163 -0.2632
 [torch.FloatTensor of size 4x2]]

In [281]:
drnn_input = torch.cat(list_, 1)

In [282]:
drnn_input.size()

torch.Size([4, 17])

In [274]:
print('x2_pos is {}'.format(drnn_input.size()))

x2_pos is torch.Size([4, 11])


In [332]:
ex = [1,1,1,1,None, None,1,1,1,1,1]

In [339]:
input = []

In [343]:
input = [2*e for e in ex[0:4]]

In [345]:
input+ ex[4:6] + [2*e for e in ex[-5:]]

[2, 2, 2, 2, None, None, 2, 2, 2, 2, 2]

In [364]:
ex

[1, 1, 1, 1, None, None, 1, 1, 1, 1, 1]

In [380]:
empty = torch.FloatTensor()
len(empty) ==0

True

In [381]:
from torch.autograd import Variable
[Variable(drnn_input)] + [Variable(empty)] + [2,3,4]

[Variable containing:
 
 Columns 0 to 9 
 -1.2702 -1.8569  0.4600  0.1064 -0.9446 -0.4591  1.5962  1.0960 -0.4257  0.3042
  0.4058  0.1211 -1.3250  0.2720 -0.7830 -0.7529  0.6747  0.9685 -1.6344 -0.3820
  0.2532  1.7568 -0.8371  0.2591 -0.5750 -0.3853 -1.4120  2.1387  0.3732  0.9374
 -0.4002  1.0083 -0.9354 -0.4041 -2.4007  1.2158 -0.8062 -0.1280 -0.6776 -0.5533
 
 Columns 10 to 16 
 -0.1084  1.8311 -1.9461 -0.3505 -0.5678 -0.9352 -0.1562
 -0.3808 -1.8617  1.3735  0.9692  0.3530 -0.6677 -0.1769
  0.2152 -0.6362  0.4172 -1.3567  1.0117  0.5419 -1.4153
 -0.8054  0.6637  0.5527  0.5326 -1.5344 -0.6163 -0.2632
 [torch.FloatTensor of size 4x17],
 Variable containing:[torch.FloatTensor with no dimension],
 2,
 3,
 4]