In [1]:
import json
from tqdm import tqdm #progress bar
import spacy
import numpy as np
import random

nlp = spacy.blank("en")

def word_tokenize(input_): #returns token version of input
    input_nlp = nlp(input_)
    return [token.text for token in input_nlp]

def convert_idx(text, tokens): #returns spans for tokens within text
    current = 0 #serves as a cursor so that the analysis doesn't go backward in text and saves times
    #but what happens if you jump over a token? /!\
    spans = [] #list of spans (start, end)
    for token in tokens:
        current = text.find(token, current) #start index 
        if current < 0: #find returns - if not found
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token) #the following search starts at the end of the current found token
    return spans

def process_file(filename, data_type, word_counter, char_counter):
    print("Generating {} examples...".format(data_type)) # data type is either train or dev
    examples = [] #list with all the preprocessed SQUAD data about context and questions
    eval_examples = {} #dict with answers
    total = 0 #total number of questions over all articles
    with open(filename, "r") as fh: #filname is SQUAD db
        source = json.load(fh)
        #explores Json structure to preprocess:
        #(data(title, paragraph(context, qas(answers(answer_start, text), question, id))), version)
        for article in tqdm(source["data"]): #1st level
            
            for para in article["paragraphs"]: #2nd level
                #Each paragraph is a context and qas
                #Here, preprocessing the context
                context = para["context"].replace("''", '" ').replace("``", '" ') #syntaxic preprocessing
                context_tokens = word_tokenize(context) #list of all tokens from context
                context_chars = [list(token) for token in context_tokens] #list of list of chars in context
                spans = convert_idx(context, context_tokens) #list of spans for all tokens in context
                for token in context_tokens:
                    word_counter[token] += len(para["qas"]) #collections.Counter() occurence
                    #for each token in context, adds the total number of qas it is related to
                    for char in token:
                        char_counter[char] += len(para["qas"]) #same over characters
                        
                #preprocessing qas, for each context several q-a pairs
                for qa in para["qas"]:
                    if total >10: #added to limit total size
                        break
                    total += 1 #adding one to the total question count
                    ques = qa["question"].replace("''", '" ').replace("``", '" ') #syntaxic
                    ques_tokens = word_tokenize(ques) #list, tokenized questions
                    ques_chars = [list(token) for token in ques_tokens] #list of list of char in context
                    for token in ques_tokens: #for each word in question
                        word_counter[token] += 1 #the word in the question is related to the question, so add 1
                        for char in token:
                            char_counter[char] += 1 #same here
                    y1s, y2s = [], [] #lisf of indices
                    answer_texts = [] #list of all texts
                    #for each answer now
                    for answer in qa["answers"]:
                        answer_text = answer["text"]
                        answer_start = answer['answer_start']
                        answer_end = answer_start + len(answer_text)
                        answer_texts.append(answer_text)
                        answer_span = [] #list of all spans' idx with start and end computed above
                        #for each span now, which account for all tokens in context
                        for idx, span in enumerate(spans):
                            if not (answer_end <= span[0] or answer_start >= span[1]): #if the token is in the answer
                                answer_span.append(idx)
                        #for each answer, store first span idx and last span idx in y1 and y2
                        y1, y2 = answer_span[0], answer_span[-1]
                        y1s.append(y1) #list of indices of spans of words in context that is the first also in the answer
                        y2s.append(y2)
                    #end of context and question preprocessing, all is stored in example
                    example = {"context_tokens": context_tokens, "context_chars": context_chars,
                               "ques_tokens": ques_tokens,
                               "ques_chars": ques_chars, "y1s": y1s, "y2s": y2s, "id": total}
                    examples.append(example) #store all examples in a list
                    #store each question info in a dict that identifies them by them number aka total
                    eval_examples[str(total)] = {
                        "context": context, "spans": spans, "answers": answer_texts, "uuid": qa["id"]} #concatene les traitements
        ########################
        random.shuffle(examples)
        ######################## Why? Does it improve perf? To test!
        print("{} questions in total".format(len(examples)))
    return examples, eval_examples

def get_embedding(counter, data_type, limit=-1, emb_file=None, size=None, vec_size=None):
    print("Generating {} embedding...".format(data_type))
    embedding_dict = {} #dict of all embeddings to speed up process
    filtered_elements = [k for k, v in counter.items() if v > limit]
    #list of words for which the "question related" count is above limit
    if emb_file is not None: #if glove has been provided: for words embedding
        with open(emb_file, "r", encoding="utf-8") as fh: #this is where they use glove
            for line in tqdm(fh, total=size):#for each line in glove, which accounts for a word and its embedding
                array = line.split() #line is a string, array is a list of all elements
                word = "".join(array[0:-vec_size]) #word
                vector = list(map(float, array[-vec_size:])) #embeddings vector
                if word in counter and counter[word] > limit: #this is altready tested in filtered_elements
                    embedding_dict[word] = vector #if word form glove is in the context, then store it in embeddings dict
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    else:
        assert vec_size is not None
        for token in filtered_elements: #all the other elements
            embedding_dict[token] = [np.random.normal(scale=0.1) for _ in range(vec_size)]
            #embedding vector is randomly generated
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))

    NULL = "--NULL--"
    OOV = "--OOV--"
    #dict with token and its position in embedding dict
    token2idx_dict = {token: idx for idx, token in enumerate(embedding_dict.keys(), 1)}
    #initiate idx2token_dict
    idx2token_dict={}
    idx2token_dict[0]=NULL
    idx2token_dict[len(embedding_dict)+1]=OOV
    for k in token2idx_dict:
        idx2token_dict[token2idx_dict[k]]=k #reverse token2idx_dict
    #complete token2idx
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = len(embedding_dict)+1
    #initiate embedding_dict
    embedding_dict[NULL] = [0. for _ in range(vec_size)] #for NULL word, the embedding is empty
    embedding_dict[OOV] = np.random.random((1,vec_size))/2-0.25 #where do these figures come from?
    #create idx2emb_dict with idx and embeddings vector
    idx2emb_dict = {idx: embedding_dict[token] for token, idx in token2idx_dict.items()}
    #emb_mat is a matrix of all embeddings for all indices
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict, idx2token_dict

In [2]:
from collections import Counter #better than pure dict
import numpy as np

word_counter, char_counter = Counter(), Counter()

#they all keep the same counters
train_examples, train_eval = process_file('../../Database/train-v2.0.json', "train", word_counter, char_counter)
dev_examples, dev_eval = process_file('../../Database/dev-v2.0.json', "dev", word_counter, char_counter)
test_examples, test_eval = process_file('../../Database/dev-v2.0.json', "dev", word_counter, char_counter)

Generating train examples...


100%|██████████| 442/442 [00:42<00:00, 10.42it/s]
  3%|▎         | 1/35 [00:00<00:03,  9.37it/s]

11 questions in total
Generating dev examples...


100%|██████████| 35/35 [00:02<00:00, 12.55it/s]
  0%|          | 0/35 [00:00<?, ?it/s]

11 questions in total
Generating dev examples...


100%|██████████| 35/35 [00:02<00:00, 12.51it/s]

11 questions in total





In [7]:
len(dev_eval)

11

In [8]:
# save train_eval and dev_eval, might be used to save RAM!
with open('dataset/train_eval.json', "w") as fh:
    json.dump(train_eval, fh)
with open('dataset/dev_eval.json','w') as fh:
    json.dump(dev_eval,fh)
with open('dataset/test_eval.json','w') as fh:
    json.dump(test_eval,fh)

In [9]:
glove_path = "/home/unchartech001/Local_Resources/glove.6B/glove.6B.300d.txt"
word_emb_mat, word2idx_dict,id2word_dict = get_embedding(
    word_counter, "word", emb_file=glove_path, size=int(2.2e6), vec_size=300)
char_emb_mat, char2idx_dict, id2char_dict = get_embedding(
    char_counter, "char", emb_file=None, size=None, vec_size=200)

  0%|          | 1203/2200000 [00:00<03:02, 12024.48it/s]

Generating word embedding...


 18%|█▊        | 400001/2200000 [00:26<02:00, 14905.20it/s]


43032 / 103636 tokens have corresponding word embedding vector
Generating char embedding...
1417 tokens have corresponding embedding vector


In [10]:
import pandas as pd
df_id2word = []
for k in id2word_dict:
    df_id2word.append([k, id2word_dict[k]]) #first save in a list all pairs of items and indices
df_id2word = pd.DataFrame(df_id2word) #then into dataframe
df_id2word.to_csv('id2word.csv',index=None)

In [11]:
word_size = len(word_emb_mat) #length of embedding matrices
char_input_size = len(char_emb_mat)-1 #idem characters
print(word_size)
print(char_input_size)
word_mat = np.zeros((len(word_emb_mat),len(word_emb_mat[0])))
for i, w in enumerate(word_emb_mat):
    word_mat[i,:] = w
print(word_mat.shape)
np.save('word_emb_mat2.npy', word_mat) #saved as a numpy array and replicates word_emb_mat

43034
1418
(43034, 300)


In [12]:
print(char2idx_dict)
sorted(char_counter.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
print(char_counter)

{'B': 1, 'e': 2, 'y': 3, 'o': 4, 'n': 5, 'c': 6, 'é': 7, 'G': 8, 'i': 9, 's': 10, 'l': 11, 'K': 12, 'w': 13, '-': 14, 'C': 15, 'a': 16, 'r': 17, 't': 18, '(': 19, '/': 20, 'b': 21, 'ː': 22, 'ˈ': 23, 'j': 24, 'ɒ': 25, 'ɪ': 26, 'Y': 27, 'O': 28, 'N': 29, ')': 30, 'S': 31, 'p': 32, 'm': 33, '4': 34, ',': 35, '1': 36, '9': 37, '8': 38, 'A': 39, 'g': 40, 'd': 41, 'u': 42, '.': 43, 'H': 44, 'T': 45, 'x': 46, 'h': 47, 'f': 48, 'v': 49, '0': 50, 'R': 51, '&': 52, 'D': 53, "'": 54, 'M': 55, 'L': 56, '2': 57, '3': 58, '"': 59, 'z': 60, 'W': 61, '?': 62, 'I': 63, ' ': 64, 'k': 65, 'F': 66, 'J': 67, '5': 68, '6': 69, 'à': 70, 'V': 71, 'P': 72, 'Z': 73, 'E': 74, ';': 75, 'q': 76, '7': 77, 'X': 78, 'U': 79, ':': 80, '$': 81, '[': 82, ']': 83, '—': 84, 'Q': 85, '#': 86, '–': 87, '%': 88, 'è': 89, 'ç': 90, 'ʃ': 91, 'ʊ': 92, 'æ': 93, '\u200b': 94, 'ʁ': 95, 'ɑ': 96, '̃': 97, 'ɔ': 98, 'ɛ': 99, 'ń': 100, 'Ż': 101, 'ż': 102, 'ó': 103, 'ü': 104, 'ś': 105, 'ł': 106, 'ò': 107, 'É': 108, '!': 109, 'ô': 110, '£

TypeError: must use keyword argument for key function

In [13]:
import pandas as pd
def get_indexs(exa, word2idx_dict, char2idx_dict, cont_limit=400, ques_limit=50, ans_limit=30, char_limit=16):
    n = len(exa) #total number of questions, >130k if all
    miss_word = 0
    miss_char = 0
    overlimit = 0
    #outputs are:
    cont_index = np.zeros((n, cont_limit)) 
    ques_index = np.zeros((n, ques_limit))
    cont_char_index = np.zeros((n, cont_limit, char_limit))
    ques_char_index = np.zeros((n, ques_limit, char_limit))
    cont_len = np.zeros((n, 1))
    ques_len = np.zeros((n, 1))
    y_start = np.zeros((n, cont_limit))
    y_end = np.zeros((n, cont_limit))
    qid = np.zeros((n))
    
    
    #contexte
    for i in tqdm(range(n-1)):
        qid[i] = int(exa[i]['id'])
        
        contexts = exa[i]['context_tokens']
        cont_len[i,0] = min(cont_limit, len(contexts))
        for j,c in enumerate(contexts):
            if j >= cont_limit:
                break
            if c in word2idx_dict:
                cont_index[i,j] = word2idx_dict[c]
            else:
                miss_word += 1
                cont_index[i,j] = word2idx_dict['--OOV--']
        contexts_char = exa[i]['context_chars']
        for j,c in enumerate(contexts_char):
            if j >= cont_limit:
                break
            for j2,c2 in enumerate(c):
                if j2 >= char_limit:
                    break
                if c2 in char2idx_dict:
                    cont_char_index[i,j,j2] = char2idx_dict[c2]
                else:
                    miss_char += 1
                    cont_char_index[i,j,j2] = char2idx_dict['--OOV--']
        #answer
        try:
            st = exa[i]['y1s'][0]
            ed = exa[i]['y2s'][0]
            if st < cont_limit:
                y_start[i, st] = 1
            if ed < cont_limit:
                if ed-st > ans_limit:
                    y_end[i, st + ans_limit] = 1
                    overlimit += 1
                else:
                    y_end[i, ed] = 1
        except:
            pass
        
        #question
        contexts = exa[i]['ques_tokens']
        ques_len[i, 0] = min(ques_limit, len(contexts))
        for j,c in enumerate(contexts):
            if j >= ques_limit:
                break
            if c in word2idx_dict:
                ques_index[i, j] = word2idx_dict[c]
            else:
                miss_word += 1
                ques_index[i, j] = word2idx_dict['--OOV--']
        contexts_char = exa[i]['ques_chars']
        for j,c in enumerate(contexts_char):
            if j >= ques_limit:
                break
            for j2,c2 in enumerate(c):
                if j2 >= char_limit:
                    break
                if c2 in char2idx_dict:
                    ques_char_index[i, j, j2] = char2idx_dict[c2]
                else:
                    miss_char += 1
                    ques_char_index[i,j,j2] = char2idx_dict['--OOV--']
    print('miss word:', miss_word)
    print('miss char:', miss_char)
    print('over limit:', overlimit)
        
    return cont_index, ques_index, cont_char_index, ques_char_index, cont_len, ques_len, y_start, y_end, qid

In [14]:
#same reapeated 3 times for train dev and test
#1st get indices
#
contw_input, quesw_input, contc_input,\
quesc_input, cont_len, ques_len, y_start,\
y_end, qid = get_indexs(train_examples, word2idx_dict, char2idx_dict)

np.save('dataset/train_contw_input.npy',contw_input)
np.save('dataset/train_quesw_input.npy',quesw_input)
np.save('dataset/train_contc_input.npy',contc_input)
np.save('dataset/train_quesc_input.npy',quesc_input)
np.save('dataset/train_cont_len.npy',cont_len)
np.save('dataset/train_ques_len.npy',ques_len)
np.save('dataset/train_y_start.npy',y_start)
np.save('dataset/train_y_end.npy',y_end)
np.save('dataset/train_qid.npy',qid)

100%|██████████| 10/10 [00:00<00:00, 1664.21it/s]

miss word: 313
miss char: 0
over limit: 0





In [15]:
contw_input, quesw_input, contc_input, quesc_input, cont_len, ques_len, y_start, y_end, qid\
=get_indexs(dev_examples, word2idx_dict, char2idx_dict)

np.save('dataset/dev_contw_input.npy',contw_input)
np.save('dataset/dev_quesw_input.npy',quesw_input)
np.save('dataset/dev_contc_input.npy',contc_input)
np.save('dataset/dev_quesc_input.npy',quesc_input)
np.save('dataset/dev_cont_len.npy',cont_len)
np.save('dataset/dev_ques_len.npy',ques_len)
np.save('dataset/dev_y_start.npy',y_start)
np.save('dataset/dev_y_end.npy',y_end)
np.save('dataset/dev_qid.npy',qid)

100%|██████████| 10/10 [00:00<00:00, 1390.68it/s]

miss word: 399
miss char: 0
over limit: 0





In [16]:
contw_input, quesw_input, contc_input, quesc_input, cont_len, ques_len, y_start, y_end, qid\
=get_indexs(test_examples, word2idx_dict, char2idx_dict)

np.save('dataset/test_contw_input.npy',contw_input)
np.save('dataset/test_quesw_input.npy',quesw_input)
np.save('dataset/test_contc_input.npy',contc_input)
np.save('dataset/test_quesc_input.npy',quesc_input)
np.save('dataset/test_cont_len.npy',cont_len)
np.save('dataset/test_ques_len.npy',ques_len)
np.save('dataset/test_y_start.npy',y_start)
np.save('dataset/test_y_end.npy',y_end)
np.save('dataset/test_qid.npy',qid)

100%|██████████| 10/10 [00:00<00:00, 1414.61it/s]

miss word: 398
miss char: 0
over limit: 0



