In [1]:
import json
import os
import pickle
import re
import subprocess
import sys

import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize,sent_tokenize

In [2]:
def preprocessor(raw_text, size, window, min_count, workers):
    """Train a Word2Vec model on ``raw_text`` and persist the derived artifacts.

    The text is cleaned with a regex, split into sentences and then words
    with NLTK, and fed to gensim's ``Word2Vec``. Four files are written under
    ``./../data/``:

    * ``indexed_vocabulary.json`` - mapping of word -> vocabulary index
    * ``indexed_sentences.json``  - every sentence as a list of those indices
    * ``wordembeddings.npz``      - the embedding matrix, written with
      ``np.save`` (so the content is in ``.npy`` format despite the extension)
    * ``word2vec_model.pickle``   - the pickled model object

    Args:
        raw_text: the text to train on.
        size: forwarded to ``Word2Vec`` as ``size``.
        window: forwarded to ``Word2Vec`` as ``window``.
        min_count: forwarded to ``Word2Vec`` as ``min_count``.
        workers: forwarded to ``Word2Vec`` as ``workers``.

    Returns:
        None. All results are written to disk.

    NOTE(review): ``size=``, ``model.wv.vocab`` and ``model.wv.syn0`` look like
    the gensim < 4.0 API - confirm the pinned gensim version before upgrading.
    """
    # Replace with a space every character that is not a word character
    # (letter, digit, underscore), an apostrophe, a full stop or one of + - = * ^
    clean_text = re.sub(r'[^\w\'\.\+\-\=\*\^]',' ' ,raw_text)
    # Split into sentences while the full stops are still present ...
    sentences = sent_tokenize(clean_text)
    # ... then drop every full stop so it does not end up inside a token
    sentences = [sent.replace('.','') for sent in sentences]
    # Word-tokenize each sentence
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]
    # Train the Word2Vec model
    model = Word2Vec(sentences = tokenized_sentences, size = size, window = window, min_count = min_count, workers = workers)
    # Map every vocabulary word to its index in the model
    vocab = {word: entry.index for word, entry in model.wv.vocab.items()}
    # Store the word -> index mapping in its own file
    with open('./../data/indexed_vocabulary.json','w') as outfile:
        json.dump(vocab,outfile)
    # Replace each word by its vocabulary index. Words that Word2Vec left out
    # of the vocabulary (frequency below min_count) are skipped; indexing them
    # unconditionally raised KeyError for any min_count > 1.
    indexed_sentences = [[vocab[word] for word in sent if word in vocab]
                         for sent in tokenized_sentences]
    # Store the index-encoded sentences
    with open('./../data/indexed_sentences.json','w') as outfile:
        json.dump(indexed_sentences,outfile)
    # Embedding matrix learned by the model
    weights = model.wv.syn0
    # Write through a context manager so the handle is closed (and the data
    # flushed) deterministically instead of leaking an open file object
    with open('./../data/wordembeddings.npz', 'wb') as outfile:
        np.save(outfile, weights)
    # Pickle the whole model for later reuse
    with open("./../data/word2vec_model.pickle", 'wb') as output:
        pickle.dump(model, output)

In [3]:
def final_processing(size, window, min_count, workers):
    """Build one training text from the dataset file and run ``preprocessor``.

    Reads ``./../data/final_json_file.json``, gathers the ``'Paragraph'`` value
    and the items of the ``'Question'`` value of every record, joins all
    paragraphs followed by all questions with single spaces, and passes that
    text together with the four Word2Vec settings to ``preprocessor``.
    """
    with open('./../data/final_json_file.json') as source:
        records = json.load(source)

    paragraphs, questions = [], []
    for record in records:
        paragraphs.append(record['Paragraph'])
        # 'Question' is consumed item by item - presumably a list of question
        # strings per record; TODO confirm against the dataset schema
        questions += record['Question']

    corpus = " ".join(paragraphs) + " " + " ".join(questions)
    preprocessor(corpus, size, window, min_count, workers)

In [4]:
def get_weights(path='./../data/wordembeddings.npz'):
    """Load and return the embedding matrix stored at ``path``.

    The default path is the file that ``preprocessor`` writes with ``np.save``.

    Args:
        path: location of the saved array; defaults to the project data file.

    Returns:
        The array read by ``np.load``.
    """
    # Use a context manager so the file handle is closed once the array has
    # been read, instead of leaking the object returned by open()
    with open(path, 'rb') as infile:
        return np.load(infile)

In [5]:
def get_vocabulary(path='./../data/indexed_vocabulary.json'):
    """Load the vocabulary mapping and return it together with its inverse.

    Args:
        path: JSON file holding a word -> index object; defaults to the
            project data file.

    Returns:
        A ``(word2idx, idx2word)`` tuple: the mapping as loaded from the file
        and a dict with its keys and values swapped.
    """
    with open(path, 'r') as f:
        word2idx = json.load(f)
    idx2word = {index: word for word, index in word2idx.items()}
    return word2idx, idx2word

In [6]:
def get_model(path='./../data/word2vec_model.pickle'):
    """Unpickle and return the object stored at ``path``.

    The default path is the file to which ``preprocessor`` pickles its
    Word2Vec model.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading. Only
    point this at files produced by this project, never at untrusted input.

    Args:
        path: location of the pickle file; defaults to the project data file.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as infile:
        model = pickle.load(infile)
    return model

In [7]:
def get_tokenized_indexed_sentences(path='./../data/indexed_sentences.json'):
    """Load and return the index-encoded sentences stored at ``path``.

    The default path is the file to which ``preprocessor`` dumps each
    tokenized sentence as a list of vocabulary indices.

    Args:
        path: JSON file to read; defaults to the project data file.

    Returns:
        The decoded JSON content.
    """
    with open(path, 'r') as f:
        return json.load(f)

In [None]:
# Train Word2Vec on the dataset text and write all artifacts to ./../data/
final_processing(size=5, window=4, min_count=1, workers=4)

In [None]:
# Reload the saved embedding matrix from disk
weights = get_weights()

In [None]:
# Reload the word -> index mapping and its index -> word inverse
word2index,index2word = get_vocabulary()

In [None]:
# Reload the pickled Word2Vec model
word2vec_model = get_model()

In [None]:
# Reload the sentences encoded as lists of vocabulary indices
tokenized_sentences = get_tokenized_indexed_sentences()

In [None]:
# Build the dataset files with the helper scripts in ./../src, then run the
# preprocessing script. The original cell could not be parsed (`else` without
# a colon, a dangling `load_from_wiki.`) and called functions on a module that
# was never imported.
SRC_DIR = './../src'
WIKI_TEXT_PATH = './../data/wiki_text.json'
SQUAD_DATA_PATH = './../data/squaddata.json'


def _is_readable_file(path):
    """Return True when ``path`` is an existing regular file this process can read."""
    return os.path.isfile(path) and os.access(path, os.R_OK)


def _run_script(script_name):
    """Run a script from SRC_DIR with the current interpreter; raise if it exits non-zero."""
    # Going through sys.executable avoids relying on an exec bit and shebang,
    # and check=True surfaces failures that os.system() silently ignored.
    subprocess.run([sys.executable, os.path.join(SRC_DIR, script_name)], check=True)


# Running a script in a subprocess does not make its functions available in
# this kernel, so the module has to be imported before merge_file() or
# squaddata_processing() can be called on it.
# NOTE(review): if load_from_squad_and_wiki.py does work at module level, the
# import below and the _run_script() call will both trigger it - confirm
# which of the two is actually wanted.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)
import load_from_squad_and_wiki

if _is_readable_file(WIKI_TEXT_PATH) and _is_readable_file(SQUAD_DATA_PATH):
    _run_script('load_from_squad_and_wiki.py')
    load_from_squad_and_wiki.merge_file()
else:
    _run_script('load_from_wiki.py')
    _run_script('load_from_squad_and_wiki.py')
    # TODO(review): the original cell had an unfinished `load_from_wiki.` call
    # here; the intended function is not visible in this notebook.
    load_from_squad_and_wiki.squaddata_processing()
_run_script('data_preprocessing.py')


In [None]:
# NOTE: execfile() exists only in Python 2; kept here for reference only.
#execfile('./../src/load_from_wiki.py')