In [57]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import logging
# Configure the root logger once. basicConfig() already attaches a stream
# handler that uses the format below, so no further handler is added here:
# a second, unformatted StreamHandler makes every record show up twice in the
# cell output (once with the timestamp prefix and once as the bare message).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Kept so that other cells can still reach the root logger through this name.
root = logging.getLogger()

In [106]:
# English stop-word list from NLTK; is_stopword() tests membership against it.
STOP_WORDS = nltk.corpus.stopwords.words('english')
# Placeholder tokens that stemtokenizer() substitutes for numbers, currency
# amounts and chemical names respectively.
# NOTE(review): "inidicator" is misspelled, but the spelling is part of the
# emitted token; presumably the vocabulary of the previously trained model
# contains these exact strings, so confirm before correcting it.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
# Passed to the Doc2Vec constructor as min_count.
MIN_WORD_COUNT = 5
# Minimum token length kept by stemtokenizer(); 0 makes the length test always pass.
MIN_SIZE = 0
# Passed to the Doc2Vec constructor as workers.
NUM_CORES = 7

In [38]:
def stemtokenizer(text):
    """Split ``text`` on whitespace and return the list of cleaned tokens.

    Each token is lower-cased and has leading/trailing punctuation removed.
    Tokens that become empty are dropped. The remaining ones are mapped, in
    this order of precedence: numbers -> NUMBER_INDICATOR, currency ->
    CURRENCY_INDICATOR, chemical names -> CHEMICAL_INDICATOR, stop words ->
    dropped, anything else -> kept as is. A token is only emitted when it is
    at least MIN_SIZE characters long. Despite the function name, no stemming
    is applied.
    """
    cleaned = []
    for raw in RegexpTokenizer(r'\s+', gaps=True).tokenize(text):
        candidate = raw.lower().strip(string.punctuation)
        if not candidate:
            continue
        if is_number(candidate):
            candidate = NUMBER_INDICATOR
        elif is_currency(candidate):
            # NOTE(review): "$" is in string.punctuation and has already been
            # stripped from both ends, so a leading "$" can never reach this
            # test; amounts such as "$5" are classified as numbers above.
            candidate = CURRENCY_INDICATOR
        elif is_chemical(candidate):
            candidate = CHEMICAL_INDICATOR
        elif is_stopword(candidate):
            continue
        if candidate and len(candidate) >= MIN_SIZE:
            cleaned.append(candidate)
    return cleaned

def is_stopword(word):
    """Return True when ``word`` is one of the entries of ``STOP_WORDS``."""
    listed = word in STOP_WORDS
    return listed

def is_number(str):
    """Return True if ``str`` is written as a number (float or int).

    Commas are removed before parsing, so "1,234.5" counts as a number.
    The text must parse with ``float()`` and contain at least one digit.

    :param str: the token to test (the name shadows the builtin inside this
        function only; it is kept for compatibility with keyword callers).
    :return: True for numeric text, False otherwise.
    """
    candidate = str.replace(",", "")
    try:
        float(candidate)
    except ValueError:
        return False
    # float() also accepts the words "nan", "inf" and "infinity"; those are
    # ordinary words in running text, so require an actual digit.
    return any(ch.isdigit() for ch in candidate)

def is_currency(str):
    """Return True if ``str`` begins with a dollar sign.

    An empty string yields False; indexing ``str[0]`` would raise IndexError
    there, so a one-character slice is compared instead.
    """
    return str[:1] == "$"

def is_chemical(str):
    """Return True when ``str`` contains more than three hyphens.

    stemtokenizer() replaces such tokens with CHEMICAL_INDICATOR.
    """
    hyphen_total = str.count("-")
    return hyphen_total >= 4

In [23]:
# Passed to the Doc2Vec constructor as seed.
DOC2VEC_SEED = 1234

In [9]:
# Directory holding every input file of this notebook. Change it here, in one
# place, when running on another machine.
LOCAL_DATA_DIR = "/home/local/shalaby/"

# Full corpus. For a quick run on the 100-document sample, use
# LOCAL_DATA_DIR + "docs_output_sample_100.json" instead.
training_file = LOCAL_DATA_DIR + "docs_output.json"
# Pickled inputs loaded by the next cell.
doc_classifications_map_file = LOCAL_DATA_DIR + "doc_classification_map.pkl"
sections_file = LOCAL_DATA_DIR + "sections.pkl"
training_docs_list_file = LOCAL_DATA_DIR + "training_docs_list.pkl"

In [10]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point these
    # paths at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

doc_classifications_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
training_docs_list = _load_pickle(training_docs_list_file)

In [135]:
# Number of document ids loaded into training_docs_list.
len(training_docs_list)

1286325

In [127]:
class LabeledLineSentence(object):
    """Re-iterable stream of LabeledSentence objects read from a corpus file.

    Every line of the file must evaluate to a ``(doc_id, text)`` pair. Only
    documents whose id is in the module-level ``training_docs_list`` are
    yielded; their text is tokenized with ``stemtokenizer`` and tagged with
    the document id.
    """

    def __init__(self, filename):
        """Remember ``filename``; the file is opened anew on each iteration."""
        self.filename = filename

    def __iter__(self):
        # Snapshot the ids into a set once per pass so that each membership
        # test is O(1) rather than a scan over the whole list for every line.
        wanted_ids = frozenset(training_docs_list)
        # The with-block closes the file even if the consumer stops early.
        with open(self.filename) as corpus:
            for line in corpus:
                # NOTE(review): eval() executes whatever the line contains;
                # only use this on corpus files produced by this project.
                (doc_id, text) = eval(line)
                if doc_id in wanted_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])

In [133]:
# Doc2Vec hyper-parameters. Each one is either forwarded to the Doc2Vec
# constructor or used when building saved-model file names in other cells.
DOC2VEC_SIZE = 400  # passed as size
DOC2VEC_WINDOW = 8  # passed as window
DOC2VEC_MAX_VOCAB_SIZE = None  # passed as max_vocab_size
DOC2VEC_SAMPLE = 1e-5  # passed as sample
DOC2VEC_TYPE = 1  # passed as dm; labelled 'dm' in file names when 1, else 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # passed as hs
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # passed as negative
DOC2VEC_CONCAT = 1  # passed as dm_concat
DOC2VEC_MEAN = 0  # only used in file names; not passed to the constructor in the visible cells
DOC2VEC_EPOCHS = 1  # passed as iter
REPORT_DELAY = 30 # report the progress every x seconds

#### Load the previously trained model that supplies the vocabulary

In [114]:
# Rebuild the file name of the previously trained model from the current
# hyper-parameters and load it. The final field (10) is a literal here, not
# one of the DOC2VEC_* constants.
_algorithm_tag = 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow'
file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(
    DOC2VEC_SIZE, DOC2VEC_WINDOW, _algorithm_tag, DOC2VEC_HIERARCHICAL_SAMPLE, 10)
loaded_model = Doc2Vec.load('/home/local/shalaby/models/{}'.format(file_name))

2016-08-16 15:45:13,435 : INFO : loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10
loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10
2016-08-16 15:45:59,375 : INFO : loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.* with mmap=None
loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.* with mmap=None
2016-08-16 15:45:59,378 : INFO : loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=None
loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=None


#### Initialize the model using the vocab from the previously trained model

In [107]:
# Build an untrained Doc2Vec model from the constants defined in earlier cells.
# No corpus is passed here; a later cell calls model.reset_from(loaded_model).
model = Doc2Vec(size=DOC2VEC_SIZE , window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT, 
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm: dm=1 => PV-DM. Per the gensim documentation any
                # other value (conventionally dm=0, not specifically 2) selects
                # PV-DBOW - TODO confirm against the installed gensim version.
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # dbow_words=1 would additionally train word vectors skip-gram style;
                # not wanted here, so it is switched off explicitly.
                dbow_words=0,
                iter=DOC2VEC_EPOCHS)

In [136]:
# Show the model's one-line summary string.
print(model)

Doc2Vec(dm/c,d400,n10,w8,mc5,s1e-05,t7)


In [117]:
# Reuse the vocabulary of loaded_model for the new model; the recorded log
# shows the layer weights being reset as part of this call.
model.reset_from(loaded_model)

2016-08-16 15:49:28,921 : INFO : using concatenative 6800-dimensional layer1
using concatenative 6800-dimensional layer1
2016-08-16 15:49:28,925 : INFO : resetting layer weights
resetting layer weights


#### Now for the actual training

In [None]:
# Train on the corpus streamed by LabeledLineSentence, logging progress every
# REPORT_DELAY seconds.
# NOTE(review): the recorded log says 2009750 sentences are expected, while
# training_docs_list holds 1286325 ids and the iterator skips documents that
# are not in that list - presumably fewer examples are seen than gensim
# expects; TODO confirm the effect on progress and learning-rate scheduling.
model.train(sentences=LabeledLineSentence(training_file), report_delay=REPORT_DELAY)

2016-08-16 16:14:12,345 : INFO : training model with 7 workers on 6147817 vocabulary and 6800 features, using sg=0 hs=0 sample=1e-05 negative=10
training model with 7 workers on 6147817 vocabulary and 6800 features, using sg=0 hs=0 sample=1e-05 negative=10
2016-08-16 16:14:12,348 : INFO : expecting 2009750 sentences, matching count from corpus used for vocabulary survey
expecting 2009750 sentences, matching count from corpus used for vocabulary survey
2016-08-16 16:14:15,268 : INFO : PROGRESS: at 0.00% examples, 1123 words/s, in_qsize 0, out_qsize 0
PROGRESS: at 0.00% examples, 1123 words/s, in_qsize 0, out_qsize 0
2016-08-16 16:14:45,317 : INFO : PROGRESS: at 0.01% examples, 6643 words/s, in_qsize 0, out_qsize 0
PROGRESS: at 0.01% examples, 6643 words/s, in_qsize 0, out_qsize 0
2016-08-16 16:15:15,544 : INFO : PROGRESS: at 0.02% examples, 7361 words/s, in_qsize 0, out_qsize 0
PROGRESS: at 0.02% examples, 7361 words/s, in_qsize 0, out_qsize 0


In [134]:
# Save the trained model under a name derived from the hyper-parameters.
# Older naming scheme, kept for reference:
# file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
#                                                                 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
#                                                                 DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
# NOTE(review): the last placeholder is labelled "iter_{}" but is filled with
# DOC2VEC_NEGATIVE_SAMPLE_SIZE, so the saved name reads "iter_10" although
# DOC2VEC_EPOCHS is 1. A later cell reloads the model with this same template,
# so the two must be changed together if the label is corrected.
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
model.save('/home/local/shalaby/models/{}'.format(file_name))

2016-08-20 15:22:45,778 : INFO : saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10, separately None
saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10, separately None
2016-08-20 15:22:45,783 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy
storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy
2016-08-20 15:22:50,011 : INFO : storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.syn1neg.npy
storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.syn1neg.npy
2016-08-20 15:40:39,540 : INFO : not storing attribute syn0norm
not storing attribute syn

In [138]:
# Save a second copy of the model under a name ending in "_reduced".
# NOTE(review): per the gensim documentation init_sims(replace=True) keeps only
# the L2-normalised vectors and the model cannot be trained further afterwards
# - TODO confirm. The in-memory `model` is modified by this call.
model.init_sims(replace=True)
# Unlike the name used for the unreduced save, this template has separate
# "n_{}" (negative sample size) and "iter_{}" (epochs) placeholders.
file_name2 = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_n_{}_iter_{}_reduced'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                DOC2VEC_EPOCHS)
model.save('/home/local/shalaby/models/{}'.format(file_name2))

2016-08-23 03:03:26,845 : INFO : precomputing L2-norms of word weight vectors
precomputing L2-norms of word weight vectors
2016-08-23 03:04:54,496 : INFO : saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_n_10_iter_1_reduced, separately None
saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_n_10_iter_1_reduced, separately None
2016-08-23 03:04:54,500 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_n_10_iter_1_reduced.docvecs.doctag_syn0.npy
storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_n_10_iter_1_reduced.docvecs.doctag_syn0.npy
2016-08-23 03:04:58,310 : INFO : storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_n_10_iter_1_reduced.syn1neg.npy
storing numpy array 'syn1neg' to /ho

In [143]:
# Inspect the syn1neg weight array of the in-memory model (exploratory).
model.syn1neg

array([[-0.43955073, -1.32353783,  1.2432549 , ..., -0.68703425,
         0.49506971,  0.42362678],
       [-0.23260127, -0.21256062,  0.2494466 , ...,  0.02838594,
         0.22268367, -0.27669066],
       [-0.43055332, -0.31037641, -0.11401954, ...,  1.22877729,
        -1.10776615,  0.32154849],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [144]:
# Infer a vector for a two-token document with the in-memory model.
# NOTE(review): by execution count this ran after init_sims(replace=True), so
# `model` is presumably the reduced model here - confirm.
model.infer_vector(['the','moment'])

array([  8.96196347e-03,  -9.76396073e-03,  -6.72388775e-03,
         3.20586981e-03,   4.69769671e-04,   7.34611088e-03,
        -8.80711421e-04,   3.23204976e-03,   3.56474938e-03,
        -3.46648763e-03,   5.18059637e-03,  -8.29966087e-03,
         1.24318302e-02,   5.47847198e-03,  -9.34473053e-03,
        -5.72191086e-03,  -5.87124191e-03,   5.04978746e-03,
         8.31436832e-04,   1.58426035e-02,  -1.14090461e-02,
         5.27314749e-03,   6.38577016e-03,   4.08732332e-03,
        -1.21463761e-02,  -3.54843866e-03,   3.03943735e-03,
        -4.67541954e-03,  -5.42545691e-04,  -3.97595169e-04,
        -6.84142811e-03,  -1.64750451e-03,   3.12010432e-03,
        -1.49104802e-03,  -4.23465436e-03,   1.17774019e-02,
        -1.50803113e-02,   3.81882675e-03,  -6.31576928e-04,
        -8.79900006e-04,  -3.98690812e-03,   3.63986171e-03,
        -4.62985598e-03,   1.51479954e-03,  -3.88556044e-03,
         6.13796571e-03,  -1.66480225e-02,   1.35068782e-03,
        -6.81436248e-03,

In [145]:
# Rebuild the name under which the unreduced model was saved and reopen it
# with its arrays memory-mapped read-only (mmap='r').
# NOTE(review): as in the save cell, the "iter_{}" placeholder receives
# DOC2VEC_NEGATIVE_SAMPLE_SIZE rather than DOC2VEC_EPOCHS.
_name_fields = (DOC2VEC_SIZE, DOC2VEC_WINDOW,
                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                DOC2VEC_HIERARCHICAL_SAMPLE, DOC2VEC_NEGATIVE_SAMPLE_SIZE)
file_name3 = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_iter_{}'.format(*_name_fields)

non_reduced_loaded_model = Doc2Vec.load('/home/local/shalaby/models/{}'.format(file_name3), mmap='r')

2016-08-24 05:32:03,241 : INFO : loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10
loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10
2016-08-24 05:49:10,762 : INFO : loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.* with mmap=r
loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.* with mmap=r
2016-08-24 05:49:10,766 : INFO : loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=r
loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=r
2016-08-24 05:49:10,808 : INFO : loading syn1neg from /home/local/shalaby/models/doc2vec_size_400_w_8_t

In [146]:
# Same two-token probe, run against the reloaded unreduced model so the
# result can be compared with the one from `model`.
non_reduced_loaded_model.infer_vector(['the','moment'])

array([  1.17399811e-03,  -1.20847160e-03,   2.50299257e-04,
        -1.02183491e-03,  -5.92688273e-04,   4.04232065e-04,
         5.30141813e-04,  -1.21462101e-03,   2.35888321e-04,
        -9.63273633e-04,   4.93177795e-04,   1.06767612e-03,
        -7.77803289e-05,   1.23400812e-03,  -1.05915370e-03,
        -7.65450241e-04,  -4.63319448e-04,   7.90806807e-05,
         8.18747678e-04,  -1.16587011e-03,   3.53409414e-04,
         1.04298000e-03,  -2.02660798e-04,   7.68282101e-04,
         1.61600663e-04,   1.02472585e-03,   6.83164690e-04,
        -7.96813285e-04,  -1.04432495e-03,  -8.92850003e-05,
         8.63626890e-04,   3.66744294e-04,  -1.07038720e-03,
         5.31663420e-04,  -1.05088239e-03,  -2.45142524e-04,
         1.81044001e-04,  -5.31950151e-04,   1.54348527e-04,
         9.70647437e-04,   1.22578791e-03,   2.76911247e-04,
         2.35898275e-04,  -3.16417922e-04,  -8.34109203e-04,
         4.32647823e-04,  -9.00360639e-04,   2.29439989e-04,
        -1.17896858e-03,

In [76]:
# NOTE(review): the recorded output is an AttributeError (no 'syn0'),
# presumably because this ran before the model's weights existed. If
# '08887671' is a document tag rather than a vocabulary word, the
# document-vector API (model.docvecs) is likely what was intended - confirm.
model.most_similar('08887671')

2016-08-16 14:34:05,764 : INFO : precomputing L2-norms of word weight vectors
precomputing L2-norms of word weight vectors


AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [77]:
# NOTE(review): fails with the same AttributeError (no 'syn0') in the recorded
# output; if '08887671' is a document tag, model.docvecs is presumably the
# right lookup - confirm.
model['08887671']

AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [116]:
# Vocabulary size of the previously trained model.
len(loaded_model.vocab)

6147817

In [99]:
# NOTE(review): this import sits mid-notebook; it presumably belongs with the
# other imports in the first cell.
from collections import defaultdict
# Replace the model's raw_vocab attribute with an empty defaultdict(int).
model.raw_vocab = defaultdict(int)

In [100]:
# Display raw_vocab to check the assignment made in the previous cell.
model.raw_vocab

defaultdict(int, {})

In [83]:
# List the attributes available on the model object (exploratory).
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__module__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_do_train_job',
 '_load_specials',
 '_raw_word_count',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'cbow_mean',
 'clear_sims',
 'comment',
 'corpus_count',
 'create_binary_tree',
 'dbow',
 'dbow_words',
 'dm',
 'dm_concat',
 'dm_tag_count',
 'docvecs',
 'doesnt_match',
 'estimate_memory',
 'finalize_vocab',
 'hashfxn',
 'hs',
 'index2word',
 'infer_vector',
 'init_sims',
 'intersect_word2vec_format',
 'iter',
 'layer1_size',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'make_cum_table',
 'max_vocab_size',
 'min_alpha',
 'min_alph

In [95]:
# Vocabulary size of the in-memory model; the recorded value equals the one
# shown for loaded_model.
len(model.vocab)

6147817

In [96]:
# Exploratory: shows that the bound method exists; it is not called here.
model.wmdistance

<bound method Doc2Vec.wmdistance of <gensim.models.doc2vec.Doc2Vec object at 0x7fd85f134210>>