In [1]:
import re
import os
import numpy as np
import json
import pickle
import datetime
import spacy
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize,sent_tokenize
from gensim.models import Word2Vec
from load_squad_wiki_data import get_squad_wiki_data
nlp = spacy.load('en', parser=False, entity=False, matcher=False, add_vectors=False)

Using TensorFlow backend.


In [9]:
class Embeddings:
    def __init__(self, size, window, min_count, workers):
        
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_indexed_sentences = '../data/indexed_sentences_{0}.json'.format(base_file_name)
        self.path_word_embeddings = '../data/word_embeddings_{0}.npz'.format(base_file_name)
        self.path_indexed_vocabulary = '../data/indexed_vocabulary_{0}.json'.format(base_file_name)
        self.path_pos_categorical_indexed_sentences = '../data/pos_categorical_indexed_sentences_{0}.json'.format(base_file_name)
        self.path_pos_indexed_vocabulary = '../data/pos_indexed_vocabulary_{0}.json'.format(base_file_name)
        self.load_embeddings()
    
    def tokenize_sentence(self, raw_text):
        sentences = sent_tokenize(raw_text)
        sentences = [re.sub(r'[^\w\'\+\-\=\*\s\^]', '', sent) for sent in sentences]
        tokenized_sentences = [word_tokenize(sent) for sent in sentences]
        return tokenized_sentences
    
    def tokenize_index_sentence(self, sentence):
        word2index, index2word = self.get_vocabulary()
        tokenized_sentences = self.tokenize_sentence(sentence.lower())
        indexed_sentences = [[word2index[word] for word in sent] for sent in tokenized_sentences]
        return indexed_sentences
        
        
    def tag_sentence(self, text):
        tokenized_sentences = self.tokenize_sentence(text.lower())
        tokenized_pos_sentences = self.find_POS(tokenized_sentences)
        return tokenized_pos_sentences
    
    def find_POS(self, tokenized_sentences):
        final_pos_sents = []
        for sent in tokenized_sentences:
            doc = nlp(' '.join(sent))
            pos = []
            for word in doc:
                pos.append(word.pos_)
            final_pos_sents.append(pos)
        return final_pos_sents 

        
    def preprocessor(self, raw_text, size, window, min_count, workers):  
        print("Creating tokenized sentences")
        tokenized_sentences = self.tokenize_sentence(raw_text)
        print("Creating pos tokenized sentences")
        tokenized_pos_sentences = self.find_POS(tokenized_sentences)
        print("POS Tokenization Complete")
        vocab = ['PUNCT','SYM','X','ADJ','VERB','CONJ','NUM','DET','ADV','PROPN','NOUN','PART','INTJ','CCONJ','SPACE','ADP','SCONJ','AUX', 'PRON']
        vocab = dict((word, index) for index, word in enumerate(vocab))
        with open(self.path_pos_indexed_vocabulary,'w') as outfile:
            json.dump(vocab, outfile)
        print("Categorizing Sentence")
        categorical_pos_sentences = [to_categorical([vocab[word] for word in sent], num_classes = len(vocab)).tolist() for sent in tokenized_pos_sentences] 
        print("Saving the Categorical POS Sentences into file")
        with open(self.path_pos_categorical_indexed_sentences,'w') as outfile:
            json.dump(categorical_pos_sentences, outfile) 
        print("Categorical File Created With Data")    
        # initialize word2vector model
        model = Word2Vec(sentences = tokenized_sentences, size = size, window = window, min_count = min_count, workers = workers)
        # finding out the vocabulary of raw_text with index     
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        # Storeing the vocab2index in a seperate file
        print("Saving Vocabulary Sentences into file")
        with open(self.path_indexed_vocabulary,'w') as outfile:
            json.dump(vocab,outfile)
        # replacing each word of tokenized_sentences by coressponding vocabulary index
        indexed_sentences = [[vocab[word] for word in sent] for sent in tokenized_sentences]
        # storeing indexed_sentences(tokenized sentences) in indexed.json file
        print("Saving Indexed Sentences into file")
        with open(self.path_indexed_sentences,'w') as outfile:
            json.dump(indexed_sentences,outfile)
        # finding gensim weights
        weights = model.wv.syn0
        # storeing weights in wordembeddings.npz file
        print("Saving Word Embeddings into the file")
        np.save(open(self.path_word_embeddings, 'wb'), weights)
        # dump the word2vec model in dump file word2vec_model
        print("Saving Model into the file")
        with open(self.path_word2vec_model, 'wb') as output:
            pickle.dump(model, output)

    def load_embeddings(self):
        if not (os.path.isfile(self.path_word2vec_model) or 
                os.path.isfile(self.path_word_embeddings) or 
                os.path.isfile(self.path_indexed_sentences) or
                os.path.isfile(self.path_indexed_vocabulary) or
                os.path.isfile(self.path_pos_indexed_vocabulary) or
                os.path.isfile(self.path_pos_categorical_indexed_sentences)):
            dataset = get_squad_wiki_data()
            raw_text = ""
            passage_text = "" 
            question_text = ""
            passage_text_list = []
            question_text_list = []
            for data in dataset:
                passage_text_list.append(data['Paragraph'])
                question_text_list.extend(data['Question'])                
            passage_text = "".join(passage_text_list)
            question_text = ".".join(question_text_list)
            raw_text = passage_text + " " + question_text
            raw_text = raw_text.lower()
            self.preprocessor(raw_text, self.size, self.window, self.min_count, self.workers)        

    # Will load and return weights from the existing embedding.npz file
    def get_weights(self):
        weights = np.load(open(self.path_word_embeddings,'rb'))
        return weights

    # Returns word2Index and index2word
    def get_vocabulary(self):
        with open(self.path_indexed_vocabulary, 'r') as f:
            data = json.load(f)
        word2idx = data
        idx2word = dict([(v, k) for k, v in data.items()])
        return word2idx, idx2word

    # Returns the pickled model
    def get_model(self):
        with open(self.path_word2vec_model,'rb') as output:
            model = pickle.load(output)
        return model

    # Returns the tokenized sentences
    def get_tokenized_indexed_sentences(self):
        with open(self.path_indexed_sentences, 'r') as f:
            tokenized_sentences = json.load(f)
        return tokenized_sentences
    
    # Returns pos2Index and index2pos
    def get_pos_vocabulary(self):
        with open(self.path_pos_indexed_vocabulary, 'r') as f:
            data = json.load(f)
        pos2idx = data
        idx2pos = dict([(v, k) for k, v in data.items()])
        return pos2idx, idx2pos
    
    # Returns the tokenized pos sentences
    def get_pos_categorical_indexed_sentences(self):
        with open(self.path_pos_categorical_indexed_sentences, 'r') as f:
            tokenized_pos_sentences = json.load(f)
        return tokenized_pos_sentences

In [10]:
start_date = datetime.datetime.now()
e = Embeddings(100, 4, 1, 4)
end_date = datetime.datetime.now()
#print("TOTAL TIME ELAPSED IN EMBEDDINGS")
#print(((end_date - start_date).hour)," HOURS ",((end_date - start_date).minute)," MINUTES ",((end_date - start_date).second)," SECONDS ")

In [11]:
# e.get_model()

<gensim.models.word2vec.Word2Vec at 0x7f33a96126a0>

In [12]:
# e.get_weights()

array([[  2.42574263e+00,  -7.58274868e-02,  -6.17127478e-01, ...,
         -2.86359102e-01,  -1.96607578e+00,  -2.54467607e+00],
       [  3.47208172e-01,  -9.91447628e-01,  -6.97705746e-01, ...,
          1.10680330e+00,   3.21801722e-01,  -2.49307728e+00],
       [ -5.74008584e-01,   5.17570734e-01,  -1.22353244e+00, ...,
          2.02407658e-01,  -9.38207388e-01,  -3.50373316e+00],
       ..., 
       [  8.78951419e-03,  -1.43301915e-02,  -2.35296413e-02, ...,
         -1.44301169e-02,   1.30269416e-02,   4.81675612e-03],
       [  1.81394983e-02,  -2.84149162e-02,   5.21293841e-03, ...,
          1.09690642e-02,  -5.21929935e-03,   4.59477305e-05],
       [  1.39771681e-03,   8.43398180e-03,   3.12235020e-03, ...,
          4.82686237e-03,  -1.13214366e-02,  -7.40344124e-03]], dtype=float32)

In [13]:
# e.get_vocabulary()

({'taierzhuang': 157828,
  'geetduring': 84325,
  'moselywhat': 136094,
  'artesis': 148305,
  'istanbulpersecution': 85784,
  'baden-powell': 68501,
  'pizzorno': 78238,
  'reactionswhile': 144081,
  'billionbetween': 117060,
  'objectionable': 34105,
  'yüeh-nü': 130718,
  'duct': 38249,
  'emmett': 38280,
  'bolter': 128653,
  'vrightmathrm': 125907,
  'xsys': 170739,
  'dyrrachiumwhere': 113923,
  'centure': 83199,
  'progressivist': 58259,
  '2000sindoor': 138580,
  'sycamore': 35499,
  '2012-2015': 126626,
  'playin': 29636,
  'densest': 20415,
  'raheen': 84082,
  'ānanda': 52726,
  'os2': 74641,
  'cabinetin': 86615,
  'melts': 23067,
  'kcrg': 119991,
  'owned': 1403,
  'neededmelbourne': 114422,
  'israellife': 84013,
  'cantonalist': 71115,
  'cornes': 95962,
  'ladybirds': 161309,
  'saviors': 104427,
  '5150722n': 118482,
  'frustrated': 11500,
  'complicating': 40863,
  'multiethnic': 59223,
  'talkswhat': 45109,
  'buffered': 47326,
  'solarare': 88058,
  'alsace-champag

In [14]:
# e.get_tokenized_indexed_sentences()

[[502,
  644,
  895,
  7,
  23,
  105,
  384,
  341,
  4,
  1711,
  0,
  4516,
  1,
  0,
  104,
  384,
  260,
  2756,
  9,
  0,
  762,
  257],
 [0,
  105,
  384,
  1167,
  8987,
  4516,
  4024,
  3407,
  1184,
  0,
  104,
  384,
  1167,
  7780,
  4516,
  880,
  3211,
  29663,
  4,
  3988,
  32,
  309,
  502,
  644,
  604],
 [0,
  341,
  7,
  444,
  14,
  681,
  913,
  2261,
  22,
  8849,
  10,
  1157,
  2,
  0,
  454,
  3009,
  1521,
  112,
  22,
  1850,
  13717,
  890],
 [8,
  28,
  7,
  0,
  6233,
  502,
  644,
  0,
  260,
  5041,
  0,
  2181,
  2844,
  13,
  279,
  39055,
  5602,
  8,
  122,
  8,
  4639,
  16075,
  0,
  987,
  1,
  5646,
  134,
  502,
  644,
  341,
  13,
  214,
  8894,
  89,
  20,
  0,
  341,
  67,
  27,
  46,
  76,
  8,
  502,
  644,
  3641,
  180,
  11,
  0,
  3105,
  123,
  8944,
  873,
  0,
  2046,
  8894,
  52072,
  3211,
  3066,
  0,
  1317,
  257,
  13,
  5,
  11552,
  457,
  3,
  6169,
  10334,
  2197,
  7,
  382,
  0,
  2756,
  40,
  4689,
  1388,
  10358],

In [15]:
# print(e.tokenize_index_sentence("this is Nikola Tesla"))
# e.tag_sentence("this is nikola tesla")

[[28, 6, 6261, 373]]


[['DET', 'VERB', 'NOUN', 'NOUN']]

In [None]:
#vocab = ['PUNCT','SYM','X','ADJ','VERB','CONJ','NUM','DET','ADV','PROPN','NOUN','PART','INTJ','CCONJ','','']