In [1]:
import json
from tqdm import tqdm
import spacy
import numpy as np
import re
import unicodedata

# Two-way lookup between the digit strings '0'..'10' and their English words.
# The digit -> word entries are inserted first, then the word -> digit entries.
_NUMBER_WORDS = ('zero', 'one', 'two', 'three', 'four', 'five',
                 'six', 'seven', 'eight', 'nine', 'ten')
DigitsMapper = {str(value): word for value, word in enumerate(_NUMBER_WORDS)}
DigitsMapper.update({word: str(value) for value, word in enumerate(_NUMBER_WORDS)})

def normal_query(query, document):
    """Align the digit/number-word spelling of query tokens with the document.

    A query token that appears in ``DigitsMapper`` but not in ``document`` is
    replaced by its mapped form ('3' <-> 'three') when that mapped form does
    occur in ``document``. Every other token is passed through unchanged.

    Returns a new list with one entry per token of ``query``.
    """
    def _align(word):
        counterpart = DigitsMapper.get(word)
        # Keep the token unless it is a known digit/number word that is
        # missing from the document while its counterpart is present.
        if counterpart is None or word in document or counterpart not in document:
            return word
        return counterpart

    return [_align(word) for word in query]


def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFD."""
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

def token_extend(reg_rules):
    """Return the whole text matched by ``reg_rules`` with one space on each side.

    ``reg_rules`` is a regex match object; intended as an ``re.sub`` callback.
    """
    matched = reg_rules.group(0)
    return ''.join((' ', matched, ' '))

def reform_text(text):
    """Pad selected punctuation with spaces and collapse whitespace.

    Hyphens/dashes (ASCII '-' and U+2010..U+2015), the currency signs
    ¢ ¥ € £, and the characters % [ ] : ( ) / are each surrounded by a space
    so that a whitespace-aware tokenizer emits them as separate tokens.
    Leading/trailing spaces and newlines are then stripped and every run of
    whitespace is collapsed to a single space.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string.
    """
    # Raw strings keep the backslashes for the regex engine; in a normal
    # literal, sequences such as "\[" and "\s" are invalid string escapes.
    # The "\uXXXX" escapes are resolved by ``re`` itself.
    text = re.sub(
        r'-|¢|¥|€|£|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/',
        r' \g<0> ',  # the whole match, padded with one space on each side
        text)
    text = text.strip(' \n')
    text = re.sub(r'\s+', ' ', text)
    return text

# Module-level spaCy pipeline, created once and reused by word_tokenize.
nlp = spacy.blank("en")

def word_tokenize(sent):
    """Return the text of every token the ``nlp`` pipeline produces for ``sent``."""
    return [tok.text for tok in nlp(sent)]

def convert_idx(text, tokens):
    """Locate each token in ``text`` and return its character span.

    Tokens are searched left to right; the search for each token starts where
    the previous token ended, so repeated tokens map to successive
    occurrences.

    Args:
        text: The string the tokens were taken from.
        tokens: Token strings, in order of appearance in ``text``.

    Returns:
        A list of ``(start, end)`` character offsets (end exclusive), one per
        token.

    Raises:
        ValueError: If a token does not occur at or after the end of the
            previous token.
    """
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        if start < 0:
            # Carry the diagnostic in the exception instead of printing it
            # and raising an empty Exception.
            raise ValueError("Token {} cannot be found".format(token))
        cursor = start + len(token)
        spans.append((start, cursor))
    return spans

def process_file(filename, data_type, word_counter, char_counter):
    """Tokenize a SQuAD-style JSON file into model and evaluation records.

    The file is expected to have the layout
    ``data[] -> paragraphs[] -> {context, qas[] -> {question, id, answers[]}}``
    where each answer has ``text`` and ``answer_start`` (a character offset
    into ``context``).

    Args:
        filename: Path of the JSON file to read.
        data_type: Label used only in the progress message.
        word_counter: Counter-like mapping, updated in place with token
            frequencies.
        char_counter: Counter-like mapping, updated in place with character
            frequencies.

    Returns:
        A pair ``(examples, eval_examples)``. ``examples`` is a list with one
        dict per question (token/char lists for context and question, the
        answer start/end token indices ``y1s``/``y2s``, and a running 1-based
        ``id``). ``eval_examples`` maps ``str(id)`` to the context text, the
        token character spans, the answer texts and the original question id.

    Raises:
        IndexError: If an answer's character range overlaps no context token.
        Exceptions raised by ``convert_idx`` for unlocatable tokens propagate.
    """
    print("Generating {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding; the handle is only needed while parsing.
    with open(filename, "r", encoding="utf-8") as fh:
        source = json.load(fh)
    for article in tqdm(source["data"]):
        for para in article["paragraphs"]:
            # Both replacements are two characters for two characters, so the
            # answers' character offsets into the context stay valid.
            context = para["context"].replace("''", '" ').replace("``", '" ')
            context_tokens = word_tokenize(reform_text(context))
            # Spans are located in `context` itself (not the reformed text)
            # so they are comparable with `answer_start`.
            spans = convert_idx(context, context_tokens)
            context_tokens = [normalize_text(t) for t in context_tokens]
            context_chars = [list(token) for token in context_tokens]
            # The context is part of one example per question, so its tokens
            # and characters are counted once per question.
            for token in context_tokens:
                word_counter[token] += len(para["qas"])
                for char in token:
                    char_counter[char] += len(para["qas"])
            for qa in para["qas"]:
                total += 1
                ques = qa["question"].replace("''", '" ').replace("``", '" ')
                ques_tokens = word_tokenize(reform_text(ques))
                ques_tokens = [normalize_text(t) for t in ques_tokens]
                ques_tokens = normal_query(ques_tokens, context_tokens)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                y1s, y2s = [], []
                answer_texts = []
                for answer in qa["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer['answer_start']
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    # Indices of every token whose character span overlaps
                    # the answer's [answer_start, answer_end) range.
                    answer_span = []
                    for idx, span in enumerate(spans):
                        if not (answer_end <= span[0] or answer_start >= span[1]):
                            answer_span.append(idx)
                    y1, y2 = answer_span[0], answer_span[-1]
                    y1s.append(y1)
                    y2s.append(y2)
                example = {"context_tokens": context_tokens, "context_chars": context_chars,
                           "ques_tokens": ques_tokens,
                           "ques_chars": ques_chars, "y1s": y1s, "y2s": y2s, "id": total}
                examples.append(example)
                eval_examples[str(total)] = {
                    "context": context, "spans": spans, "answers": answer_texts, "uuid": qa["id"]}
    print("{} questions in total".format(len(examples)))
    return examples, eval_examples

def get_embedding(counter, data_type, limit=-1, emb_file=None, size=None, vec_size=None):
    """Build an embedding table and index for tokens whose count exceeds ``limit``.

    With ``emb_file`` the vectors are read from a text file whose lines end in
    ``vec_size`` float fields; everything before those fields is joined into
    the token, which is NFD-normalized before lookup in ``counter``.
    Without ``emb_file`` every kept token gets ``vec_size`` independent normal
    draws with scale 0.1.

    Indices 0 and 1 are reserved for "--NULL--" and "--OOV--", both mapped to
    all-zero vectors; the remaining tokens are numbered from 2.

    Returns:
        ``(emb_mat, token2idx_dict, idx2emb_dict)``: the vectors as a list
        ordered by index, the token -> index mapping, and the index -> vector
        mapping.
    """
    print("Generating {} embedding...".format(data_type))
    vectors = {}
    kept_tokens = [tok for tok, freq in counter.items() if freq > limit]

    if emb_file is None:
        assert vec_size is not None
        for tok in kept_tokens:
            vectors[tok] = [np.random.normal(scale=0.1) for _ in range(vec_size)]
        print("{} char tokens have corresponding embedding vector".format(
            len(kept_tokens)))
    else:
        assert size is not None
        assert vec_size is not None
        with open(emb_file, "r", encoding="utf-8") as fh:
            for line in tqdm(fh, total=size):
                fields = line.split()
                # The token may itself contain spaces: everything before the
                # trailing vec_size fields belongs to it.
                word = normalize_text("".join(fields[0:-vec_size]))
                vector = [float(value) for value in fields[-vec_size:]]
                if word in counter and counter[word] > limit:
                    vectors[word] = vector
        print("{} / {} word tokens have corresponding {} embedding vector".format(
            len(vectors), len(kept_tokens), data_type))

    NULL, OOV = "--NULL--", "--OOV--"
    token2idx_dict = {}
    for position, tok in enumerate(vectors.keys(), 2):
        token2idx_dict[tok] = position
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    vectors[NULL] = [0.] * vec_size
    vectors[OOV] = [0.] * vec_size

    idx2emb_dict = {}
    for tok, position in token2idx_dict.items():
        idx2emb_dict[position] = vectors[tok]
    emb_mat = [idx2emb_dict[position] for position in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict, idx2emb_dict

In [2]:
from collections import Counter
import numpy as np
# One pair of frequency tables is shared by both splits, so the counts
# accumulate over train and dev.
word_counter = Counter()
char_counter = Counter()
train_examples, train_eval = process_file(
    'original_data/train-v1.1.json', "train", word_counter, char_counter)
test_examples, test_eval = process_file(
    'original_data/dev-v1.1.json', "dev", word_counter, char_counter)

Generating train examples...


100%|██████████| 442/442 [01:12<00:00,  6.11it/s]


87599 questions in total
Generating dev examples...


100%|██████████| 48/48 [00:08<00:00,  5.48it/s]

10570 questions in total





In [3]:
# Persist the evaluation records of the dev split as JSON.
# Writing the train split is currently disabled:
# with open('dataset/train_eval.json', "w") as fh:
#     json.dump(train_eval, fh)
with open('dataset/test_eval.json','w') as eval_fh:
    json.dump(test_eval, eval_fh)

In [4]:
# Word vectors come from the GloVe text file; `size` is only the line-count
# hint for the progress bar.
# NOTE: the third value returned by get_embedding maps index -> vector,
# despite the id2word/id2char names used here.
word_emb_mat, word2idx_dict, id2word_dict = get_embedding(
    word_counter,
    "word",
    emb_file='original_data/glove.840B.300d.txt',
    size=int(2.2e6),
    vec_size=300,
)
# Character vectors are randomly initialised (no embedding file).
char_emb_mat, char2idx_dict, id2char_dict = get_embedding(
    char_counter, "char", vec_size=64)

  0%|          | 1236/2200000 [00:00<02:57, 12352.94it/s]

Generating word embedding...


100%|█████████▉| 2196017/2200000 [02:49<00:00, 12987.39it/s]


90978 / 105846 word tokens have corresponding word embedding vector
Generating char embedding...
1231 char tokens have corresponding embedding vector


In [5]:
# Copy the word vectors (a list of equal-length float lists) into a dense
# float matrix and store it on disk.
n_word_rows, word_dim = len(word_emb_mat), len(word_emb_mat[0])
word_mat = np.zeros((n_word_rows, word_dim))
word_mat[...] = word_emb_mat
print('word_mat:', word_mat.shape)
np.save('dataset/word_emb_mat.npy', word_mat)

# Same for the character vectors.
n_char_rows, char_dim = len(char_emb_mat), len(char_emb_mat[0])
char_mat = np.zeros((n_char_rows, char_dim))
char_mat[...] = char_emb_mat
print('char_mat:', char_mat.shape)
np.save('dataset/char_emb_mat.npy', char_mat)

word_mat: (90980, 300)
char_mat: (1233, 64)


In [6]:
import pickle
def build_features(config, examples, data_type, out_file, word2idx_dict, char2idx_dict, id2word_dict,
                   is_test=False):
    """Convert tokenized examples into padded index arrays and pickle them.

    Args:
        config: Mapping with the keys 'para_limit', 'ques_limit', 'ans_limit',
            'char_limit', 'test_para_limit' and 'test_ques_limit'.
        examples: Iterable of dicts with 'context_tokens', 'context_chars',
            'ques_tokens', 'ques_chars', 'y1s', 'y2s' and 'id'.
        data_type: Split name; used in messages and in the output file name.
        out_file: Path prefix; the file written is
            ``out_file + data_type + '_total_data.pkl'``.
        word2idx_dict: Token -> index mapping for words.
        char2idx_dict: Character -> index mapping.
        id2word_dict: Unused; kept so existing callers keep working.
        is_test: When true, use the 'test_*' length limits and an answer
            length limit of 100 instead of config['ans_limit'].

    Examples that exceed a length limit are skipped. Examples without any
    answer are skipped as well and reported in the "unanswerable" count.

    Side effects:
        Writes the pickle file described above and prints progress and
        summary lines. Returns None.
    """
    para_limit = config['test_para_limit'] if is_test else config['para_limit']
    ques_limit = config['test_ques_limit'] if is_test else config['ques_limit']
    ans_limit = 100 if is_test else config['ans_limit']
    char_limit = config['char_limit']

    def _exceeds_limits(example):
        # NOTE(review): the answer-length check looks at the FIRST answer,
        # while the labels below are taken from the LAST answer - confirm
        # this asymmetry is intended.
        return (len(example["context_tokens"]) > para_limit
                or len(example["ques_tokens"]) > ques_limit
                or (example["y2s"][0] - example["y1s"][0]) > ans_limit)

    def _get_word(word):
        # Try progressively different casings before falling back to index 1
        # (the value used for unknown tokens).
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(char):
        if char in char2idx_dict:
            return char2idx_dict[char]
        return 1

    def _stack(rows, row_shape, dtype):
        # np.stack cannot handle an empty list; return a zero-row array so a
        # fully filtered split still yields well-formed output.
        if rows:
            return np.stack(rows, axis=0)
        return np.zeros((0,) + tuple(row_shape), dtype=dtype)

    def _object_array(rows):
        # Token lists have different lengths. np.array() on such ragged input
        # raises ValueError on NumPy >= 1.24, so build the 1-D object array
        # (one Python list per element) explicitly.
        arr = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            arr[i] = row
        return arr

    print("Processing {} examples...".format(data_type))
    total = 0
    total_ = 0
    context_idxss = []
    ques_idxss = []
    context_char_idxss = []
    ques_char_idxss = []
    context_strings = []
    ques_strings = []
    y1s = []
    y2s = []
    qids = []
    unans = 0
    for example in tqdm(examples):
        total_ += 1

        # Without an answer there is no label to build (and the limit check
        # would fail on the empty lists), so count and skip.
        if not example['y1s'] or not example['y2s']:
            unans += 1
            continue

        if _exceeds_limits(example):
            continue

        total += 1
        qids.append(int(example['id']))
        # Index 0 is the padding value for all index arrays.
        context_idxs = np.zeros([para_limit], dtype=np.int32)
        context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
        ques_idxs = np.zeros([ques_limit], dtype=np.int32)
        ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)
        y1 = np.zeros([para_limit], dtype=np.float32)
        y2 = np.zeros([para_limit], dtype=np.float32)

        for i, token in enumerate(example["context_tokens"]):
            context_idxs[i] = _get_word(token)

        for i, token in enumerate(example["ques_tokens"]):
            ques_idxs[i] = _get_word(token)

        for i, token in enumerate(example["context_chars"]):
            # Characters beyond char_limit are dropped.
            for j, char in enumerate(token[:char_limit]):
                context_char_idxs[i, j] = _get_char(char)

        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token[:char_limit]):
                ques_char_idxs[i, j] = _get_char(char)

        # One-hot start/end labels over context positions, from the last answer.
        start, end = example["y1s"][-1], example["y2s"][-1]
        y1[start], y2[end] = 1.0, 1.0

        context_idxss.append(context_idxs)
        ques_idxss.append(ques_idxs)
        context_char_idxss.append(context_char_idxs)
        ques_char_idxss.append(ques_char_idxs)
        y1s.append(y1)
        y2s.append(y2)
        context_strings.append(list(example["context_tokens"]))
        ques_strings.append(list(example["ques_tokens"]))

    meta = {'qid': np.array(qids),
            'context_id': _stack(context_idxss, (para_limit,), np.int32),
            'question_id': _stack(ques_idxss, (ques_limit,), np.int32),
            'context_char_id': _stack(context_char_idxss, (para_limit, char_limit), np.int32),
            'question_char_id': _stack(ques_char_idxss, (ques_limit, char_limit), np.int32),
            'y_start': _stack(y1s, (para_limit,), np.float32),
            'y_end': _stack(y2s, (para_limit,), np.float32),
            'context_string': _object_array(context_strings),
            'question_string': _object_array(ques_strings)}

    with open(out_file+data_type+'_total_data.pkl','wb') as f:
        pickle.dump(meta, f)

    print("Built {} / {} instances of features in total".format(total, total_))
    print('unanswerable:',unans)

# Length limits used by build_features; the 'test_*' entries apply only when
# it is called with is_test=True.
config = dict(
    test_para_limit=1000,
    test_ques_limit=50,
    para_limit=400,
    ques_limit=50,
    ans_limit=30,
    char_limit=16,
)

# Both splits are built with the non-test limits (is_test=False).
for split_name, split_examples in (('train', train_examples), ('dev', test_examples)):
    build_features(config, split_examples, split_name, 'dataset/',
                   word2idx_dict, char2idx_dict, id2word_dict, is_test=False)

  0%|          | 158/87599 [00:00<00:55, 1574.00it/s]

Processing train examples...


100%|██████████| 87599/87599 [00:36<00:00, 2393.05it/s]
  3%|▎         | 268/10570 [00:00<00:03, 2661.45it/s]

Built 87341 / 87599 instances of features in total
unanswerable: 0
Processing dev examples...


100%|██████████| 10570/10570 [00:04<00:00, 2508.95it/s]


Built 10471 / 10570 instances of features in total
unanswerable: 0
