In [1]:
import re
import os
import numpy as np
import json
import pickle
from nltk.tokenize import word_tokenize,sent_tokenize
from gensim.models import Word2Vec
from load_squad_wiki_data import get_squad_wiki_data

In [2]:
class Embeddings:
    def __init__(self, size, window, min_count, workers):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_indexed_sentences = '../data/indexed_sentences_{0}.json'.format(base_file_name)
        self.path_word_embeddings = '../data/word_embeddings_{0}.npz'.format(base_file_name)
        self.path_indexed_vocabulary = '../data/indexed_vocabulary_{0}.json'.format(base_file_name)
        self.load_embeddings()
        
    def preprocessor(self, raw_text, size, window, min_count, workers):
        # removes all the punctuations and retains only alphabets,digits,fullstop,apostrophe
        clean_text = re.sub(r'[^\w\'\.\+\-\=\*\^]',' ' ,raw_text)
        # sentence tokenize clean text
        sentences = sent_tokenize(clean_text)
        # replace full stops from each sentence in sentences list by null
        sentences = [sent.replace('.','') for sent in sentences]
        # word tokenize each sentence in sentences list
        tokenized_sentences = [word_tokenize(sent) for sent in sentences]
        # initialize word2vector model
        model = Word2Vec(sentences = tokenized_sentences, size = size, window = window, min_count = min_count, workers = workers)
        # finding out the vocabulary of raw_text with index     
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        # Storeing the vocab2index in a seperate file
        with open(self.path_indexed_vocabulary,'w') as outfile:
            json.dump(vocab,outfile)
        # replacing each word of tokenized_sentences by coressponding vocabulary index
        indexed_sentences = [[vocab[word] for word in sent] for sent in tokenized_sentences]
        # storeing indexed_sentences(tokenized sentences) in indexed.json file
        with open(self.path_indexed_sentences,'w') as outfile:
            json.dump(indexed_sentences,outfile)
        # finding gensim weights
        weights = model.wv.syn0
        # storeing weights in wordembeddings.npz file
        np.save(open(self.path_word_embeddings, 'wb'), weights)
        # dump the word2vec model in dump file word2vec_model
        with open(self.path_word2vec_model, 'wb') as output:
            pickle.dump(model, output)

    def load_embeddings(self):
        if not (os.path.isfile(self.path_word2vec_model) or 
                os.path.isfile(self.path_word_embeddings) or 
                os.path.isfile(self.path_indexed_sentences) or
                os.path.isfile(self.path_indexed_vocabulary)):
            dataset = get_squad_wiki_data()
            raw_text = ""
            passage_text = "" 
            question_text = ""
            passage_text_list = []
            question_text_list = []
            for data in dataset:
                passage_text_list.append(data['Paragraph'])
                question_text_list.extend(data['Question'])                
            passage_text = "".join(passage_text_list)
            question_text = ".".join(question_text_list)
            raw_text = passage_text + " " + question_text
            self.preprocessor(raw_text, self.size, self.window, self.min_count, self.workers)        

    # Will load and return weights from the existing embedding.npz file
    def get_weights(self):
        weights = np.load(open(self.path_word_embeddings,'rb'))
        return weights

    # Returns word2Index and index2word
    def get_vocabulary(self):
        with open(self.path_indexed_vocabulary, 'r') as f:
            data = json.load(f)
        word2idx = data
        idx2word = dict([(v, k) for k, v in data.items()])
        return word2idx, idx2word

    # Returns the pickled model
    def get_model(self):
        with open(self.path_word2vec_model,'rb') as output:
            model = pickle.load(output)
        return model

    # Returns the tokenized sentences
    def get_tokenized_indexed_sentences(self):
        with open(self.path_indexed_sentences, 'r') as f:
            tokenized_sentences = json.load(f)
        return tokenized_sentences    

In [3]:
e = Embeddings(5,4,1,4)

In [4]:
e.get_model()

<gensim.models.word2vec.Word2Vec at 0x7fbf9d66ec88>

In [5]:
e.get_weights()

array([[ -2.53313613e+00,   2.45333743e+00,   1.27236032e+00,
         -4.47916555e+00,   1.72945094e+00],
       [ -3.42312145e+00,   8.94889355e-01,   6.01290166e-01,
         -4.68936586e+00,   9.33891773e-01],
       [ -3.87542677e+00,   1.94554496e+00,  -1.43224370e+00,
         -2.46077180e+00,   3.95114541e-01],
       ..., 
       [ -7.21099228e-02,  -5.20163681e-03,  -3.47242765e-02,
         -4.18427214e-03,  -1.85724162e-02],
       [ -8.43251646e-02,   7.45845363e-02,  -1.98174249e-02,
         -9.98699889e-02,  -2.32521780e-02],
       [  1.79864168e-02,  -6.41094521e-02,  -1.24483421e-01,
          3.94892655e-02,   3.45118083e-02]], dtype=float32)

In [76]:
e.get_vocabulary()

({'timber': 7320,
  'roam': 19211,
  'Azerbaijan': 19212,
  'BCE': 18588,
  'influence': 694,
  'protestMany': 29291,
  'Pennines': 15818,
  'Phrack': 29292,
  'applicability': 29294,
  'Archduke': 19214,
  'localities': 13166,
  'beef': 13167,
  '+2': 15819,
  'Berger': 29295,
  'colorless': 14523,
  'nodes': 4940,
  'bringing': 2794,
  'Laguerre': 19215,
  'Nationalism': 27431,
  'conservation': 2669,
  'schedule': 2868,
  'Cost': 8505,
  'Seine': 7322,
  'Dirty': 25934,
  'Whole-Earth': 29296,
  'Patents': 29297,
  'typhus': 19259,
  'jailer': 19218,
  'madrassas': 15820,
  'snake': 19219,
  'micrometeoroid': 19220,
  '24-10': 31483,
  'prospect': 9317,
  'London-based': 37052,
  'circa': 4711,
  'academy': 6776,
  'Apotheke': 19221,
  'identification': 6411,
  'catwalk': 29301,
  'tentpole': 19222,
  'Osweiler': 9318,
  'Eucharistic': 29302,
  'West-Central': 18014,
  '1234': 10289,
  'Renmin': 24160,
  'Pilgrim': 12793,
  'joining': 4004,
  'adjuvants': 29304,
  'Ministero': 19224

In [77]:
e.get_tokenized_indexed_sentences()

[[101,
  95,
  231,
  7,
  21,
  172,
  1495,
  347,
  4,
  1480,
  0,
  3631,
  1,
  0,
  236,
  2211,
  1241,
  860,
  11,
  0,
  579,
  301],
 [10,
  172,
  2211,
  689,
  3157,
  3631,
  1065,
  712,
  1551,
  0,
  236,
  2211,
  689,
  2500,
  3631,
  873,
  702,
  1165,
  281,
  4,
  2596,
  33,
  421,
  101,
  95,
  951],
 [10,
  347,
  7,
  659,
  15,
  802,
  412,
  938,
  23,
  3444,
  13,
  1788,
  3,
  0,
  326,
  1862,
  1744,
  1937,
  23,
  2221,
  5926,
  239],
 [223,
  40,
  7,
  0,
  2348,
  101,
  95,
  0,
  1988,
  6246,
  0,
  7403,
  2011,
  16,
  280,
  15076,
  4532,
  8,
  121,
  8,
  5326,
  18145,
  0,
  1469,
  1,
  7251,
  145,
  101,
  95,
  347,
  16,
  525,
  4113,
  123,
  19,
  0,
  347,
  57,
  28,
  43,
  80,
  8,
  101,
  95,
  1831,
  188,
  9,
  0,
  1043,
  131,
  10277,
  1142,
  0,
  4908,
  4113,
  231],
 [10,
  702,
  2402,
  0,
  1216,
  301,
  16,
  5,
  425,
  70,
  995,
  2,
  1981,
  4388,
  580,
  7,
  438,
  0,
  860,
  604,
  7556,
  