In [70]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import logging

Setup logging for word2vec

In [71]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [2]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_WORD_COUNT = 5
MIN_SIZE = 0
NUM_CORES = 7

In [3]:
def stemtokenizer(text):
    """ MAIN FUNCTION to get clean stems out of a text. A list of clean stems are returned """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    tokens = tokenizer.tokenize(text)
    stems = []  # result
    for token in tokens:
        stem = token.lower()
        stem = stem.strip(string.punctuation)
        if stem:
            if is_number(stem):
                stem = NUMBER_INDICATOR
            elif is_currency(stem):
                stem = CURRENCY_INDICATOR
            elif is_chemical(stem):
                stem = CHEMICAL_INDICATOR
            elif is_stopword(stem):
                stem = None
            else:
                stem = stem.strip(string.punctuation)
            if stem and len(stem) >= MIN_SIZE:
                # extract uni-grams
                stems.append(stem)
    del tokens
    return stems

def is_stopword(word):
  return word in STOP_WORDS

def is_number(str):
    """ Returns true if given string is a number (float or int)"""
    try:
        float(str.replace(",", ""))
        return True
    except ValueError:
        return False

def is_currency(str):
    return str[0] == "$"

def is_chemical(str):
    return str.count("-") > 3

In [5]:
DOC2VEC_SEED = 1234

In [74]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"
training_file = "/home/local/shalaby/docs_output.json"
doc_classifications_map_file = "/home/local/shalaby/doc_classification_map.pkl"
sections_file = "/home/local/shalaby/sections.pkl"
training_docs_list_file = "/home/local/shalaby/training_docs_list.pkl"

In [11]:
doc_classifications_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
training_docs_list = pickle.load(open(training_docs_list_file))
#open(sections_file).read()

In [12]:
len(training_docs_list)

1286325

In [13]:
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for line in open(self.filename):
            (doc_id, text) = eval(line)
            if doc_id in training_docs_list:
                yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])

In [17]:
DOC2VEC_SIZE = 400
DOC2VEC_WINDOW = 8
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-5
DOC2VEC_TYPE = 1
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 0
DOC2VEC_EPOCHS = 1
REPORT_DELAY = 30 # report the progress every x seconds

#### Load the model with the Vocab

In [16]:
file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,10)
loaded_model = Doc2Vec.load('/home/local/shalaby/models/{}'.format(file_name))

#### Initialize the model using the vocab from the previously trained model

In [75]:
model = Doc2Vec(size=DOC2VEC_SIZE , window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT, 
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm dm=1 => PV-DM, dm=2 => PV-DBOW, PV-DM dictates CBOW for words
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # dm_mean=0 => use sum, dm_mean=1 => use mean
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that
                dbow_words=0,
                iter=DOC2VEC_EPOCHS)

In [76]:
print model

Doc2Vec(dm/s,d400,n10,w8,mc5,s1e-05,t7)


In [77]:
model.reset_from(loaded_model)

2016-08-17 04:23:25,409 : INFO : resetting layer weights


In [78]:
len(model.vocab)

6147817

#### Now for the actual training

In [None]:
%%time
model.train(sentences=LabeledLineSentence(training_file), report_delay=REPORT_DELAY)

2016-08-17 04:32:33,683 : INFO : training model with 7 workers on 6147817 vocabulary and 400 features, using sg=0 hs=0 sample=1e-05 negative=10
2016-08-17 04:32:33,685 : INFO : expecting 2009750 sentences, matching count from corpus used for vocabulary survey
2016-08-17 04:33:06,606 : INFO : PROGRESS: at 0.00% examples, 54 words/s, in_qsize 14, out_qsize 0
2016-08-17 04:33:36,630 : INFO : PROGRESS: at 0.01% examples, 4056 words/s, in_qsize 0, out_qsize 0
2016-08-17 04:34:06,686 : INFO : PROGRESS: at 0.02% examples, 5149 words/s, in_qsize 0, out_qsize 0
2016-08-17 04:34:36,911 : INFO : PROGRESS: at 0.02% examples, 5741 words/s, in_qsize 0, out_qsize 1
2016-08-17 04:35:06,954 : INFO : PROGRESS: at 0.03% examples, 6358 words/s, in_qsize 0, out_qsize 0
2016-08-17 04:35:37,019 : INFO : PROGRESS: at 0.04% examples, 6829 words/s, in_qsize 0, out_qsize 0
2016-08-17 04:36:07,051 : INFO : PROGRESS: at 0.05% examples, 7046 words/s, in_qsize 0, out_qsize 0
2016-08-17 04:36:37,883 : INFO : PROGRESS

In [81]:
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
model.save('/home/local/shalaby/models/{}'.format(file_name))

2016-08-20 14:30:44,678 : INFO : saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10, separately None
2016-08-20 14:30:44,681 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy
2016-08-20 14:30:48,758 : INFO : storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn1neg.npy
2016-08-20 14:31:02,436 : INFO : not storing attribute syn0norm
2016-08-20 14:31:02,439 : INFO : storing numpy array 'syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn0.npy
2016-08-20 14:31:14,281 : INFO : not storing attribute cum_table


In [82]:
model.most_similar('08887671')

2016-08-20 15:01:40,241 : INFO : precomputing L2-norms of word weight vectors


KeyError: "word '08887671' not in vocabulary"

In [93]:
model.docvecs['08369259']

array([-0.11149903,  0.05847064,  0.0326901 , -0.05546045, -0.01718644,
        0.16179059, -0.10471547, -0.17647769,  0.1070466 , -0.11702061,
        0.0485912 ,  0.10199197, -0.03351878, -0.1790764 , -0.03575905,
        0.01357079, -0.11842816,  0.02276516, -0.09318489,  0.02130031,
       -0.00453636, -0.10888872,  0.01349508,  0.0416873 , -0.00811389,
       -0.12875612,  0.04593304, -0.01775896,  0.01702514, -0.09357604,
        0.00936624,  0.01058741,  0.06517494, -0.08417409, -0.13268793,
       -0.0264844 , -0.00254485,  0.02812302,  0.0264738 , -0.02987128,
       -0.07826719, -0.05308362,  0.01408767, -0.04041473,  0.03358708,
        0.03331527, -0.0818525 , -0.09876713,  0.04341355,  0.0136116 ,
       -0.04311142, -0.05834384, -0.01139861,  0.00325523,  0.01444023,
       -0.02933281,  0.0692323 , -0.0108496 , -0.07619328, -0.03259553,
        0.02600448, -0.07991111,  0.04720963,  0.00866228,  0.08713121,
       -0.06486029, -0.02230316, -0.04130165,  0.023275  , -0.05

In [84]:
training_docs_list[:10]

[u'08369259',
 u'07333409',
 u'07333404',
 u'07333405',
 u'07333406',
 u'08369256',
 u'07333400',
 u'08369250',
 u'07333402',
 u'07333403']

In [116]:
len(loaded_model.vocab)

6147817

In [99]:
from collections import defaultdict
model.raw_vocab = defaultdict(int)

In [100]:
model.raw_vocab

defaultdict(int, {})

In [83]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__module__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_do_train_job',
 '_load_specials',
 '_raw_word_count',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'cbow_mean',
 'clear_sims',
 'comment',
 'corpus_count',
 'create_binary_tree',
 'dbow',
 'dbow_words',
 'dm',
 'dm_concat',
 'dm_tag_count',
 'docvecs',
 'doesnt_match',
 'estimate_memory',
 'finalize_vocab',
 'hashfxn',
 'hs',
 'index2word',
 'infer_vector',
 'init_sims',
 'intersect_word2vec_format',
 'iter',
 'layer1_size',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'make_cum_table',
 'max_vocab_size',
 'min_alpha',
 'min_alph

In [95]:
len(model.vocab)

6147817

In [96]:
model.wmdistance

<bound method Doc2Vec.wmdistance of <gensim.models.doc2vec.Doc2Vec object at 0x7fd85f134210>>