This code prepares data for deep learning.

Output format: ([question], [sentence_1], [sentence_2], ..., [sentence_n], [sentence_1_yes/no], [sentence_2_yes/no], ..., [sentence_n_yes/no])


In [15]:
#import libraries

import codecs
import os
import pickle
from random import randint

import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy.linalg import norm

In [16]:
# flags

# Lower-casing switch.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
to_lower = True
# Number of candidate sentences kept per question (read by sort_sentence and prepare_data).
top_n = 3
# Presumably the dimensionality of the GloVe vectors in glove/glove.txt — TODO confirm against the file.
glove_dim = 100

In [64]:
# Pickled inputs: article paragraphs and question/answer records.
in_loc_train_article = "output/train_article.pkl"
in_loc_train_qas = "output/train_qas.pkl"

# Destination of the pickled list of tuples written by prepare_data.
out_loc_qa_tuple = "output/version_3/train_data.pkl"

In [65]:
def _load_frame(pickle_path):
    """Unpickle the object stored at ``pickle_path`` and wrap it in a DataFrame."""
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(pickle_path, 'rb') as handle:
        return pd.DataFrame(pickle.load(handle))


train_articles = _load_frame(in_loc_train_article)
train_qas = _load_frame(in_loc_train_qas)

In [66]:
train_articles.head()

Unnamed: 0,context,paragraph_id,title,title_id
0,"Architecturally, the school has a Catholic cha...",0_0,University_of_Notre_Dame,0
1,"As at most other universities, Notre Dame's st...",0_1,University_of_Notre_Dame,0
2,The university is the major seat of the Congre...,0_2,University_of_Notre_Dame,0
3,The College of Engineering was established in ...,0_3,University_of_Notre_Dame,0
4,All of Notre Dame's undergraduate students are...,0_4,University_of_Notre_Dame,0


In [67]:
train_qas[:20]

Unnamed: 0,answer_start,answer_text,paragraph_id,question,question_id,title_id
0,515,Saint Bernadette Soubirous,0_0,To whom did the Virgin Mary allegedly appear i...,5733be284776f41900661182,0
1,188,a copper statue of Christ,0_0,What is in front of the Notre Dame Main Building?,5733be284776f4190066117f,0
2,279,the Main Building,0_0,The Basilica of the Sacred heart at Notre Dame...,5733be284776f41900661180,0
3,381,a Marian place of prayer and reflection,0_0,What is the Grotto at Notre Dame?,5733be284776f41900661181,0
4,92,a golden statue of the Virgin Mary,0_0,What sits on top of the Main Building at Notre...,5733be284776f4190066117e,0
5,248,September 1876,0_1,When did the Scholastic Magazine of Notre dame...,5733bf84d058e614000b61be,0
6,441,twice,0_1,How often is Notre Dame's the Juggler published?,5733bf84d058e614000b61bf,0
7,598,The Observer,0_1,What is the daily student paper at Notre Dame ...,5733bf84d058e614000b61c0,0
8,126,three,0_1,How many student news papers are found at Notr...,5733bf84d058e614000b61bd,0
9,908,1987,0_1,In what year did the student paper Common Sens...,5733bf84d058e614000b61c1,0


In [68]:
#get embedding matrix

def get_embedding_index(path='glove/glove.txt'):
    """Load GloVe word vectors from a text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    path : str, optional
        Location of the GloVe text file. Defaults to ``'glove/glove.txt'``.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``float32`` numpy array built from the
        remaining fields of its line.
    """
    embeddings_index = {}
    # The context manager closes the file even when a line fails to parse.
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

embeddings_index = get_embedding_index()

In [69]:
# preprocess data

# ENGLISH_STOP_WORDS is publicly exposed by sklearn.feature_extraction.text;
# the sklearn.feature_extraction.stop_words module is no longer importable in
# current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Plain set copy of the stop-word list, consulted by clean_data.
stop = set(ENGLISH_STOP_WORDS)


def clean_data(text, remove_stopwords= True):
    """Normalize a piece of text into a lower-cased, space-separated string.

    Every symbol in the punctuation list is surrounded with spaces so that it
    becomes its own token, the text is split on whitespace, optionally
    filtered against the ``stop`` set, and finally re-joined with single
    spaces and lower-cased.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    remove_stopwords : bool, optional
        When true, tokens whose lower-cased form is in ``stop`` are dropped.

    Returns
    -------
    str
        The normalized text.
    """
    punctuation = "— # $ % ^ & * ( ) + = - [ ] ; , . / { } | : < > ? ~ ; \" \' –".split()

    # ';' is listed twice; the extra padding it causes disappears in the
    # whitespace split below.
    spaced = text
    for mark in punctuation:
        spaced = spaced.replace(mark, " %s " % mark)

    tokens = spaced.split()
    if remove_stopwords:
        tokens = [token for token in tokens if token.lower() not in stop]

    return ' '.join(tokens).lower()

In [70]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_distance(w1,w2):
    """Return the top-left entry of ``cosine_similarity(w1, w2)``.

    Note that, despite the function's name, the value returned is a
    similarity (the result of ``cosine_similarity``), not a distance.
    """
    similarity_matrix = cosine_similarity(w1, w2)
    first_row = similarity_matrix[0]
    return first_row[0]

In [71]:
def vectors(sentence):
    """Build a sentence vector by averaging GloVe word vectors.

    The sentence is normalized with ``clean_data`` (stop words removed,
    lower-cased). The embeddings of the resulting tokens are summed and the
    sum is divided by the total number of tokens. Tokens that are missing from
    ``embeddings_index`` add nothing to the sum but still count in the divisor.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, glove_dim)``.
    """
    # Size the accumulator from the glove_dim flag instead of a literal 100.
    vector = np.zeros((1, glove_dim))

    # clean_data already lower-cases its result, so no extra .lower() is needed.
    words = clean_data(sentence, remove_stopwords=True).split(" ")

    for word in words:
        coefs = embeddings_index.get(word)
        if coefs is None:
            # Out-of-vocabulary token: skip it. Only a missing key is
            # tolerated, so genuine errors (e.g. a shape mismatch) surface.
            continue
        vector = vector + coefs

    # str.split(" ") always yields at least one element, so len(words) >= 1.
    return vector / len(words)

In [72]:
def sort_sentence(question,sentences,answer_sentence_idx):
    """Select the ``top_n`` sentences most similar to the question.

    Similarity is ``cosine_distance`` between the ``vectors`` representation
    of the question and of each sentence. When fewer than ``top_n`` sentences
    are supplied, the list is padded with a fixed dummy sentence.

    Parameters
    ----------
    question : str
        Raw question text.
    sentences : list of str
        Sentences of the paragraph. The caller's list is not modified.
    answer_sentence_idx : int
        Index in ``sentences`` of the sentence believed to hold the answer.

    Returns
    -------
    list
        ``top_n`` sentences (most similar first, normalized by ``clean_data``
        with stop words kept) followed by ``top_n`` integer flags. A flag is 1
        when the sentence at that rank is the one at ``answer_sentence_idx``,
        otherwise 0.
    """
    # Pad short paragraphs so that exactly top_n candidates can be returned.
    padding = ["this is a dummy sentence"] * max(0, top_n - len(sentences))
    candidates = list(sentences) + padding

    # The question vector is identical for every comparison: compute it once.
    question_vector = vectors(question)
    cosine_values = [cosine_distance(question_vector, vectors(sentence))
                     for sentence in candidates]

    # argsort is ascending, so the last top_n indices, reversed, are the most
    # similar sentences in descending order of similarity.
    ranked = np.argsort(cosine_values)[-top_n:][::-1]

    top_n_sentences = [clean_data(candidates[idx], remove_stopwords= False)
                       for idx in ranked]
    answer_idx = [1 if answer_sentence_idx == idx else 0 for idx in ranked]

    return top_n_sentences + answer_idx

In [73]:
def get_answer_sentence(answer,answer_start,sentences):
    """Find the sentence that covers the character offset ``answer_start``.

    Sentence boundaries are estimated by accumulating ``len(sentence) + 1``
    over the sentences, i.e. on the assumption that consecutive sentences were
    separated by exactly one character in the original text.

    Parameters
    ----------
    answer : str
        Answer text. Not used by the lookup; kept for interface compatibility.
    answer_start : int
        Character offset of the answer in the original paragraph.
    sentences : list of str
        Sentences of the paragraph, in order.

    Returns
    -------
    tuple of (int, str)
        Index and text of the first sentence whose estimated end lies beyond
        ``answer_start``. If the offset is past the estimated end of the last
        sentence, the last sentence is returned.

    Raises
    ------
    ValueError
        If ``sentences`` is empty (there is no sentence to return).
    """
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")

    consumed = 0
    for i, sent in enumerate(sentences):
        # +1 accounts for the assumed single separator after each sentence.
        consumed += len(sent) + 1
        if consumed > answer_start:
            return i, sent

    # Offset beyond the estimated paragraph length: fall back to the last sentence.
    return i, sent

In [74]:
import random

def suffle_answer(temp_qa_pair):
    """Shuffle the candidate sentences while keeping each one paired with its flag.

    ``temp_qa_pair`` is laid out as
    ``[question, sentence_1..sentence_k, flag_1..flag_k]``. The number of
    candidates ``k`` is derived from the length of the input, so the function
    works for any candidate count rather than only three.

    Parameters
    ----------
    temp_qa_pair : list
        Question followed by ``k`` sentences and their ``k`` flags.

    Returns
    -------
    list
        New list with the same layout: the question first, then the sentences
        in a random order, then the flags in that same order. The input list
        is not modified.

    Raises
    ------
    ValueError
        If the input length is not ``1 + 2 * k`` for some ``k >= 0``.
    """
    n_candidates, remainder = divmod(len(temp_qa_pair) - 1, 2)
    if remainder:
        raise ValueError(
            "expected [question] + k sentences + k flags, got %d items"
            % len(temp_qa_pair))

    # Positions of the sentences; the flag of position p sits at p + n_candidates.
    idx = list(range(1, n_candidates + 1))
    random.shuffle(idx)

    shuffled_sentences = [temp_qa_pair[p] for p in idx]
    shuffled_flags = [temp_qa_pair[p + n_candidates] for p in idx]

    return [temp_qa_pair[0]] + shuffled_sentences + shuffled_flags


In [75]:
def prepare_data():
    """Build the training tuples for every question and pickle them.

    For each row of ``train_qas`` the paragraph is split into sentences, the
    ``top_n`` sentences most similar to the question are selected with
    ``sort_sentence``, and a flag is set for the first selected sentence that
    contains the answer text. The candidates are then shuffled with
    ``suffle_answer``.

    Reads the module-level ``train_qas``, ``train_articles``, ``top_n`` and
    ``out_loc_qa_tuple``. Prints progress messages and writes the result to
    ``out_loc_qa_tuple`` with pickle.

    Returns
    -------
    list of tuple
        One tuple per question: the cleaned question, the candidate sentences,
        then their 0/1 flags.

    Raises
    ------
    KeyError
        If a question refers to a ``paragraph_id`` absent from ``train_articles``.
    """
    output_qa = []

    print("Total_question to Process... %s" %train_qas.shape[0])

    # Make sure the destination folder exists before the long loop starts,
    # rather than failing on the final write.
    out_dir = os.path.dirname(out_loc_qa_tuple)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # One pass over the articles replaces a full DataFrame scan per question.
    # setdefault keeps the first context listed for a paragraph_id.
    context_by_paragraph = {}
    for paragraph_id, context in zip(train_articles["paragraph_id"],
                                     train_articles["context"]):
        context_by_paragraph.setdefault(paragraph_id, context)

    for i, row in train_qas.iterrows():
        question = row["question"]
        para_id = row["paragraph_id"]
        answer_start = row["answer_start"]
        answer_text = row["answer_text"]

        para = context_by_paragraph[para_id]

        sentences = sent_tokenize(para)

        answer_sentence_idx, _ = get_answer_sentence(answer_text,answer_start,sentences)

        top_n_sentences = sort_sentence(question,sentences,answer_sentence_idx)

        temp_qa_pair = [clean_data(question,remove_stopwords= False)] + top_n_sentences

        # The candidate sentences were normalized by clean_data (lower-cased,
        # symbols spaced out), so the answer must be normalized the same way
        # for the substring test to be able to match.
        cleaned_answer = clean_data(answer_text,remove_stopwords= False)
        if cleaned_answer:
            # An empty string is contained in every sentence; skip that case.
            for n in range(top_n):
                if cleaned_answer in temp_qa_pair[n+1]:
                    # Flag only the first candidate that contains the answer.
                    temp_qa_pair[top_n+n+1] = 1
                    break

        temp_qa_pair = suffle_answer(temp_qa_pair)
        output_qa.append(tuple(temp_qa_pair))

        if i % 1000 == 0:
            print("procesed %i questions"% i)

        if i == 1:
            # Show one sample record for a quick visual check.
            print(temp_qa_pair)

    with open(out_loc_qa_tuple,'wb') as file_:
        pickle.dump(output_qa,file_)

    return output_qa
       

In [76]:
check = prepare_data()

Total_question to Process... 87599
procesed 0 questions
['what is in front of the notre dame main building ?', 'architecturally , the school has a catholic character .', 'next to the main building is the basilica of the sacred heart .', 'at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary .', 0, 0, 0]
procesed 1000 questions
procesed 2000 questions
procesed 3000 questions
procesed 4000 questions
procesed 5000 questions
procesed 6000 questions
procesed 7000 questions
procesed 8000 questions
procesed 9000 questions
procesed 10000 questions
procesed 11000 questions
procesed 12000 questions
procesed 13000 questions
procesed 14000 questions
procesed 15000 questions
procesed 16000 questions
procesed 17000 questions
procesed 18000 questions
procesed 19000 questions
procesed 20000 questions
procesed 21000 questions
procesed 22000 questions
procesed 23000 questions
procesed 24000 questions
procesed 2