In [19]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import logging

In [20]:
# Start from a clean root logger: detach every handler that is already attached.
# The loop iterates over a copy (root.handlers[:]) because removeHandler mutates the list.
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
# basicConfig does nothing when the root logger already has handlers, which is
# why the handlers are removed first; it then installs one StreamHandler with this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [6]:
# English stop words, consulted once per token by is_stopword(). Stored as a
# frozenset so each membership test is a hash lookup instead of a linear scan
# of the list that NLTK returns.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
# Placeholder tokens that stemtokenizer() emits in place of numbers, currency
# amounts and chemical names. The spelling "inidicator" is intentional to keep:
# these strings are emitted at runtime, so changing them changes the tokens.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_WORD_COUNT = 5  # passed to Doc2Vec as min_count
MIN_SIZE = 0  # minimum token length kept by stemtokenizer(); 0 keeps every non-empty token
NUM_CORES = 7  # passed to Doc2Vec as workers

In [7]:
def stemtokenizer(text):
    """Split ``text`` on whitespace and return a list of cleaned tokens.

    Every token is lower-cased and stripped of leading/trailing punctuation.
    Tokens that are empty after stripping are dropped. The remaining tokens
    are mapped as follows (first match wins):

    * starts with "$" (ignoring other leading punctuation) -> CURRENCY_INDICATOR
    * is_number(token)   -> NUMBER_INDICATOR
    * is_chemical(token) -> CHEMICAL_INDICATOR
    * is_stopword(token) -> dropped
    * anything else      -> kept as the stripped, lower-cased token

    Despite the name, no stemming is applied.

    :param text: the raw document text
    :return: list of token strings, each at least MIN_SIZE characters long
    """
    # Every punctuation character except "$", used to expose a leading
    # currency sign that may sit behind other punctuation, e.g. "($100)".
    non_currency_punctuation = string.punctuation.replace("$", "")
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    stems = []  # result
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        stem = lowered.strip(string.punctuation)
        if not stem:
            continue
        # "$" is part of string.punctuation, so the fully stripped token can
        # never start with it. The currency test therefore has to look at the
        # token with "$" preserved, and has to run before the number test
        # (otherwise "$100" is reported as a plain number).
        if is_currency(lowered.strip(non_currency_punctuation)):
            stem = CURRENCY_INDICATOR
        elif is_number(stem):
            stem = NUMBER_INDICATOR
        elif is_chemical(stem):
            stem = CHEMICAL_INDICATOR
        elif is_stopword(stem):
            continue
        if len(stem) >= MIN_SIZE:
            # extract uni-grams
            stems.append(stem)
    return stems

def is_stopword(word):
    """Return True if ``word`` is one of the entries of STOP_WORDS."""
    found = word in STOP_WORDS
    return found

def is_number(str):
    """Return True if the given string is a number (float or int) written with digits.

    Commas are removed before parsing, so "1,000.5" counts as a number.

    float() also parses the words "nan", "inf" and "infinity" (with optional
    sign). Those are ordinary words in running text, not numerals, so a string
    must additionally contain at least one digit to be accepted.

    The parameter name shadows the built-in ``str``; it is kept unchanged so
    existing callers are unaffected.
    """
    try:
        float(str.replace(",", ""))
    except ValueError:
        return False
    return any(char.isdigit() for char in str)

def is_currency(str):
    """Return True if the given string starts with a dollar sign.

    An empty string returns False; indexing ``str[0]`` would raise IndexError
    on it, so ``startswith`` is used instead.
    """
    return str.startswith("$")

def is_chemical(str):
    """Return True if the given string contains more than three hyphens."""
    hyphen_count = str.count("-")
    return hyphen_count > 3

In [8]:
# Seed handed to the Doc2Vec constructor (seed=DOC2VEC_SEED).
# NOTE(review): the model is trained with several workers; a fixed seed alone
# may not make multi-threaded training fully reproducible - confirm against the gensim docs.
DOC2VEC_SEED = 1234

In [26]:
# Input locations (absolute paths on the machine the notebook was run on).
# The commented-out line points at a sample file, presumably for quick experiments.
#training_file = "/home/local/shalaby/docs_output_sample_100.json"
# Corpus file: read line by line by LabeledLineSentence, one (doc_id, text) pair per line.
training_file = "/home/local/shalaby/docs_output.json"
# Pickled inputs, loaded in the next cell.
doc_classifications_map_file = "/home/local/shalaby/doc_classification_map.pkl"
sections_file = "/home/local/shalaby/sections.pkl"
training_docs_list_file = "/home/local/shalaby/training_docs_list.pkl"

In [10]:
def _load_pickle(path):
    """Load and return the object pickled in the file at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes.
    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)

doc_classifications_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
# Ids of the documents used for training; LabeledLineSentence filters the corpus by it.
training_docs_list = _load_pickle(training_docs_list_file)

In [11]:
len(training_docs_list)

1286325

In [12]:
class LabeledLineSentence(object):
    """Re-iterable corpus that yields one LabeledSentence per training document.

    Each line of the file is expected to evaluate to a ``(doc_id, text)`` pair.
    Only documents whose id appears in the module-level ``training_docs_list``
    are yielded; their text is tokenized with ``stemtokenizer`` and the
    document id is used as the single tag.

    The file is re-opened on every call to ``__iter__``, so the same instance
    can be iterated more than once.
    """

    def __init__(self, filename):
        """Remember the path of the corpus file; nothing is read yet."""
        self.filename = filename

    def __iter__(self):
        # The membership test runs once per corpus line. Testing against the
        # raw list is a linear scan each time, so build a set once per pass.
        # Assumes the document ids are hashable (they are used as tags below).
        training_doc_ids = frozenset(training_docs_list)
        # The with-block closes the file when iteration ends or is abandoned.
        with open(self.filename) as corpus_file:
            for line in corpus_file:
                # NOTE(review): eval() executes whatever the line contains. This is
                # only safe for trusted corpus files; consider ast.literal_eval
                # if every line is a plain literal.
                (doc_id, text) = eval(line)
                if doc_id in training_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])

In [32]:
# Doc2Vec hyper-parameters. Each value is passed to the Doc2Vec constructor
# under the keyword noted next to it, and most are embedded in saved file names.
DOC2VEC_SIZE = 400  # size
DOC2VEC_WINDOW = 8  # window
DOC2VEC_MAX_VOCAB_SIZE = 300000  # max_vocab_size
DOC2VEC_SAMPLE = 1e-5  # sample
DOC2VEC_TYPE = 1  # dm; file names use 'dm' when this is 1 and 'pv-dbow' otherwise
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # hs
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # negative
DOC2VEC_CONCAT = 1  # dm_concat
DOC2VEC_MEAN = 0  # appears as the mean_{} component of saved file names
DOC2VEC_EPOCHS = 1  # iter
REPORT_DELAY = 30 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 10000 # report the progress every x terms

#### Load the model with the Vocab

In [114]:
# Rebuild the file name of a previously saved model and load it.
# The trailing 10 fills the iter_{} slot and is hard-coded here rather than
# taken from DOC2VEC_EPOCHS.
file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,10)
loaded_model = Doc2Vec.load('/home/local/shalaby/models/{}'.format(file_name))

2016-08-16 15:45:13,435 : INFO : loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10
loading Doc2Vec object from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10
2016-08-16 15:45:59,375 : INFO : loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.* with mmap=None
loading docvecs recursively from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.* with mmap=None
2016-08-16 15:45:59,378 : INFO : loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=None
loading doctag_syn0 from /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=None


### Create the model

In [33]:
# Build an untrained Doc2Vec model from the hyper-parameter constants.
# The vocabulary is built and the model trained in later cells.
model = Doc2Vec(size=DOC2VEC_SIZE, window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT,
                max_vocab_size=DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm: dm=1 => PV-DM (which dictates CBOW for words),
                # any other value (conventionally dm=0) => PV-DBOW
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # passed explicitly so the model agrees with the mean_{} component
                # that the save cells embed in the file name
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that
                dbow_words=0,
                iter=DOC2VEC_EPOCHS)

In [34]:
print model

Doc2Vec(dm/c,d400,n10,w8,mc5,s1e-05,t7)


#### Initialize the model using the vocab from the previously trained model

In [117]:
model.reset_from(loaded_model)

2016-08-16 15:49:28,921 : INFO : using concatenative 6800-dimensional layer1
using concatenative 6800-dimensional layer1
2016-08-16 15:49:28,925 : INFO : resetting layer weights
resetting layer weights


#### Now for the actual training

In [None]:
%%time
# Vocabulary scan: makes one full pass over the corpus through LabeledLineSentence.
# The recorded run below reports roughly three days of wall time for this cell.
model.build_vocab(sentences=LabeledLineSentence(training_file), progress_per=REPORT_VOCAB_PROGRESS)

2016-08-21 05:36:43,143 : INFO : collecting all words and their counts
2016-08-21 05:36:43,576 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2016-08-21 05:48:48,018 : INFO : pruned out 0 tokens with count <=1 (before 302522, after 302522)
2016-08-21 05:48:48,645 : INFO : pruned out 159875 tokens with count <=2 (before 302570, after 142695)
2016-08-21 05:57:51,459 : INFO : pruned out 161713 tokens with count <=3 (before 300024, after 138311)
2016-08-21 06:05:46,060 : INFO : pruned out 164238 tokens with count <=4 (before 300203, after 135965)
2016-08-21 06:10:39,585 : INFO : PROGRESS: at example #10000, processed 47284481 words (23224/s), 213774 word types, 10000 tags
2016-08-21 06:15:42,454 : INFO : pruned out 166772 tokens with count <=5 (before 301439, after 134667)
2016-08-21 06:25:33,615 : INFO : pruned out 166994 tokens with count <=6 (before 300411, after 133417)
2016-08-21 06:34:05,011 : INFO : pruned out 169278 tokens with count <=7 (before 300

CPU times: user 2d 23h 43min 39s, sys: 31min 49s, total: 3d 15min 28s
Wall time: 3d 11min 52s


In [37]:
# Save a snapshot of the model under a name ending in "_vocab_only"; in the
# recorded run this happens just before training starts.
# The commented-out block is the older, shorter naming scheme.
# file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
#                                                                 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
#                                                                 DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_neg_{}_vocabsize_{}_iter_{}_vocab_only'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE), DOC2VEC_EPOCHS)
model.save('/home/local/shalaby/models/{}'.format(file_name))

2016-08-24 23:42:55,721 : INFO : saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_neg_10_vocabsize_300000_iter_1_vocab_only, separately None
2016-08-24 23:42:55,723 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_neg_10_vocabsize_300000_iter_1_vocab_only.docvecs.doctag_syn0.npy
2016-08-24 23:42:58,412 : INFO : storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_neg_10_vocabsize_300000_iter_1_vocab_only.syn1neg.npy
2016-08-24 23:43:01,510 : INFO : not storing attribute syn0norm
2016-08-24 23:43:01,511 : INFO : storing numpy array 'syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_neg_10_vocabsize_300000_iter_1_vocab_only.syn0.npy
2016-08-24 23:43:01,730 : INFO : not storing attribute cum_table


In [None]:
%%time
# Training pass: streams the corpus again through a fresh LabeledLineSentence;
# REPORT_DELAY is the number of seconds between progress reports.
model.train(sentences=LabeledLineSentence(training_file), report_delay=REPORT_DELAY)

2016-08-24 23:45:02,265 : INFO : training model with 7 workers on 102129 vocabulary and 6800 features, using sg=0 hs=0 sample=1e-05 negative=10
2016-08-24 23:45:02,266 : INFO : expecting 1286325 sentences, matching count from corpus used for vocabulary survey


In [None]:
# Save the model under the same naming scheme as the vocab-only snapshot, but
# without the "_vocab_only" suffix.
# The commented-out block is the older, shorter naming scheme.
# file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
#                                                                 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
#                                                                 DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_neg_{}_vocabsize_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE), DOC2VEC_EPOCHS)
model.save('/home/local/shalaby/models/{}'.format(file_name))

In [76]:
model.most_similar('08887671')

2016-08-16 14:34:05,764 : INFO : precomputing L2-norms of word weight vectors
precomputing L2-norms of word weight vectors


AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [77]:
model['08887671']

AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [116]:
len(loaded_model.vocab)

6147817

In [99]:
from collections import defaultdict
model.raw_vocab = defaultdict(int)

In [100]:
model.raw_vocab

defaultdict(int, {})

In [83]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__module__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_do_train_job',
 '_load_specials',
 '_raw_word_count',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'cbow_mean',
 'clear_sims',
 'comment',
 'corpus_count',
 'create_binary_tree',
 'dbow',
 'dbow_words',
 'dm',
 'dm_concat',
 'dm_tag_count',
 'docvecs',
 'doesnt_match',
 'estimate_memory',
 'finalize_vocab',
 'hashfxn',
 'hs',
 'index2word',
 'infer_vector',
 'init_sims',
 'intersect_word2vec_format',
 'iter',
 'layer1_size',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'make_cum_table',
 'max_vocab_size',
 'min_alpha',
 'min_alph

In [95]:
len(model.vocab)

6147817

In [96]:
model.wmdistance

<bound method Doc2Vec.wmdistance of <gensim.models.doc2vec.Doc2Vec object at 0x7fd85f134210>>