## Creates the doc2vec vector embeddings for a specific configuration

In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import gzip

from multiprocessing import Process, Queue

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *
from thesis.utils.file import *

## Global variables used throughout the script

In [2]:
# Reset the root logger before configuring it: logging.basicConfig() is a no-op
# when the root logger already has handlers, so any existing ones are removed
# first (iterating over a slice copy because the list is mutated in the loop).
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
# Fixed random seeds.
SVM_SEED = 1234  # not referenced in the visible part of this notebook
DOC2VEC_SEED = 1234  # passed as `seed` to the Doc2Vec constructor

In [20]:
MIN_WORD_COUNT = 100  # passed to Doc2Vec as `min_count`
NUM_CORES = 8  # Doc2Vec `workers`; also the size of the inference thread pool

In [5]:
# Used as a plain mutable namespace: attributes such as MODEL_NAME,
# DOC2VEC_MODEL_NAME and DOC2VEC_MODEL are assigned directly on this class
# object later in the notebook; the namedtuple is never instantiated in the
# visible code.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 
                                         'SVM_MODEL_NAME', 'NN_MODEL_NAME'])

In [6]:
# File and directory names used below the doc2vec model save location.
VOCAB_MODEL = "vocab_model"  # sub-directory holding the vocabulary-only model
MODEL_PREFIX = "model"  # file name of a saved Doc2Vec model inside its directory
VALIDATION_DICT = "validation_dict.pkl"  # cache file for inferred validation vectors
# The names below are not referenced in the visible part of this notebook.
TEST_MATRIX = "test_matrix.pkl"
TEST_DICT = "test_dict.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"

In [7]:
# Base directory for preprocessed data and for the saved doc2vec models.
root_location = "/mnt/data/shalaby/"
# exports_location = root_location + "exported_data/"
# The exported pickles are read from a different mount than root_location.
exports_location = "/mnt/data2/shalaby/" + "exported_data/"

# Pickled exports loaded in the "Load general data" cell.
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

preprocessed_location = root_location + "preprocessed_data/extended_pv_abs_desc_claims_full_chunks/"

# File-name prefixes; the batch generator appends "<index>.gz" to a prefix to
# obtain the path of each preprocessed chunk file.
training_preprocessed_files_prefix = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
validation_preprocessed_files_prefix = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
test_preprocessed_files_prefix = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

## Load general data required for classification

In [8]:
%%time
# Load the pickled exports. Each file is opened in binary mode and closed
# deterministically instead of leaving the handle to the garbage collector.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(doc_classifications_map_file, 'rb') as pickled_file:
    doc_classification_map = pickle.load(pickled_file)
with open(training_docs_list_file, 'rb') as pickled_file:
    training_docs_list = pickle.load(pickled_file)
with open(validation_docs_list_file, 'rb') as pickled_file:
    validation_docs_list = pickle.load(pickled_file)
with open(test_docs_list_file, 'rb') as pickled_file:
    test_docs_list = pickle.load(pickled_file)

CPU times: user 19.7 s, sys: 1.35 s, total: 21 s
Wall time: 21 s


In [9]:
len(training_docs_list)

1286325

In [10]:
len(validation_docs_list)

321473

In [11]:
len(test_docs_list)

401877

# Utility functions for data loading

In [12]:
VALIDATION_MINI_BATCH_SIZE = 100000
def get_extended_docs_with_inference_data_only(doc2vec_model, file_to_write, preprocessed_files_prefix):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation or test documents.

    The result is cached: if ``file_to_write`` already exists under
    ``doc2vec_model_save_location/GLOBAL_VARS.MODEL_NAME`` it is unpickled and
    returned, otherwise the vectors are inferred and pickled to that path.

    :param doc2vec_model: trained model exposing ``infer_vector(words)``
    :param file_to_write: file name of the pickle cache inside the model directory
    :param preprocessed_files_prefix: path prefix of the preprocessed chunk files,
        handed to ``BatchWrapper``
    :return: dict mapping document id to its inferred vector

    Reads the module-level ``doc2vec_model_save_location``, ``GLOBAL_VARS.MODEL_NAME``,
    ``NUM_CORES`` and ``VALIDATION_MINI_BATCH_SIZE`` at call time.
    NOTE(review): unpickling can execute arbitrary code; the cache file must be trusted.
    """

    def infer_one_doc(doc_tuple):
        # doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
        # BatchWrapper yields LabeledSentence(words=..., tags=[doc_id]); read the
        # fields by name, positional unpacking would swap the words and the id.
        doc_id = doc_tuple.tags[0]
        rep = doc2vec_model.infer_vector(doc_tuple.words)
        return (doc_id, rep)

    reps_file = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, file_to_write)

    if os.path.exists(reps_file):
        info("===== Loading inference vectors")
        with open(reps_file, 'rb') as reps_in:
            inference_documents_reps = pickle.load(reps_in)
        info("Loaded inference vectors matrix")
        return inference_documents_reps

    inference_documents_reps = {}
    info("===== Getting vectors with inference")

    # Multi-threaded inference
    inference_docs_iterator = BatchWrapper(preprocessed_files_prefix, batch_size=None)
    doc_stream = iter(inference_docs_iterator)
    pool = ThreadPool(NUM_CORES)
    docs_done = 0
    try:
        # map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
        while True:
            threaded_reps_partial = pool.map(infer_one_doc,
                                             itertools.islice(doc_stream, VALIDATION_MINI_BATCH_SIZE))
            if not threaded_reps_partial:
                # an empty mini-batch means the document stream is exhausted
                break
            inference_documents_reps.update(threaded_reps_partial)
            docs_done += len(threaded_reps_partial)
            info("Finished: {} tags".format(docs_done))
    finally:
        # release the worker threads even when inference fails half-way
        pool.close()
        pool.join()

    with open(reps_file, 'wb') as reps_out:
        pickle.dump(inference_documents_reps, reps_out)

    return inference_documents_reps

In [13]:
class ExtendedPVDocumentBatchGenerator(Process):
    """
    Producer process that streams the lines of consecutive gzip chunk files into a queue.

    Files are named ``<filename_prefix><index>.gz`` where ``index`` starts at
    ``start_file`` and grows by ``offset`` for every file. Every line read is put
    on ``queue``; when a file cannot be opened or read (IOError) the sentinel
    ``False`` is put on the queue and the process finishes.

    ``batch_size`` is accepted for signature compatibility but is not used here.
    """

    def __init__(self, filename_prefix, queue, batch_size=10000, start_file=0, offset=10000):
        super(ExtendedPVDocumentBatchGenerator, self).__init__()
        self.queue = queue
        self.offset = offset
        self.filename_prefix = filename_prefix
        # index of the last file opened so far; starts one step before start_file
        # so that the first file to be opened is start_file itself
        self.files_loaded = start_file - offset

    def run(self):
        """Put every line of every chunk file on the queue, then the ``False`` sentinel."""
        while True:
            next_index = self.files_loaded + self.offset
            cur_file = None
            try:
                info("Loading new file for index: {}".format(str(next_index)))
                cur_file = gzip.open(self.filename_prefix + str(next_index) + '.gz')
#                 cur_file = open(self.filename_prefix + str(next_index))
                self.files_loaded = next_index
                for line in cur_file:
                    self.queue.put(line)
            except IOError:
                # a missing next file is the normal end-of-data signal
                self.queue.put(False, block=True, timeout=None)
                info("All files are loaded - last file: {}".format(str(self.files_loaded + self.offset)))
                return
            finally:
                # close the handle on success and on a read error alike
                if cur_file is not None:
                    cur_file.close()


class BatchWrapper(object):
    """
    Single-pass iterable of LabeledSentence objects read from preprocessed chunk files.

    A background ExtendedPVDocumentBatchGenerator process is started on
    construction and feeds raw lines through a bounded queue. Each line is
    expected to be ``"<doc_id> <word> <word> ..."``; lines whose doc id does not
    match ``level`` / ``level_type`` are skipped.

    :param training_preprocessed_files_prefix: path prefix of the chunk files
    :param buffer_size: maximum number of lines buffered in the queue
    :param batch_size: maximum number of words per LabeledSentence (at most 10000);
        ``None`` keeps every document in one piece
    :param level: required number of "_"-separated parts in the doc id
    :param level_type: name whose first character must equal the first character
        of the second doc id part; ``None`` accepts any type
    """

    def __init__(self, training_preprocessed_files_prefix, buffer_size=10000, batch_size=10000, level=1, level_type=None):
        # test for None first: comparing None with an int is an error on Python 3
        assert batch_size is None or batch_size <= 10000
        self.level = level
        # only the first character of the type name is compared against doc ids
        self.level_type = level_type[0] if level_type else None
        self.batch_size = batch_size
        self.q = Queue(maxsize=buffer_size)
        self.p = ExtendedPVDocumentBatchGenerator(training_preprocessed_files_prefix, queue=self.q,
                                                  batch_size=batch_size, start_file=0, offset=10000)
        self.p.start()
        self.cur_data = []

    def is_correct_type(self, doc_id):
        """Return True when ``doc_id`` has ``self.level`` parts and the requested type."""
        parts = doc_id.split("_")
        if len(parts) != self.level:
            return False
        if len(parts) == 1 or self.level_type is None:
            return True
        # slicing instead of indexing so that an empty second part cannot raise
        return parts[1][:1] == self.level_type

    @staticmethod
    def _chunk_words(words, batch_size):
        """Split ``words`` into consecutive pieces of at most ``batch_size`` items."""
        if not words:
            return []
        if batch_size is None:
            return [words]
        return [words[start:start + batch_size] for start in range(0, len(words), batch_size)]

    def return_sentences(self, line):
        """
        Convert one raw line into a tuple of LabeledSentence objects tagged with the doc id.

        Returns False when the doc id is filtered out by ``is_correct_type``.
        """
        line_array = tuple(line.split(" "))
        doc_id = line_array[0]
        if not self.is_correct_type(doc_id):
            return False
        # divide the document to batches according to the batch size
        return tuple(LabeledSentence(words=chunk, tags=[doc_id])
                     for chunk in self._chunk_words(line_array[1:], self.batch_size))

    def __iter__(self):
        """Yield sentences until the producer's ``False`` sentinel arrives."""
        while True:
            item = self.q.get(block=True)
            if item is False:
                # plain return ends the generator; raising StopIteration inside a
                # generator is turned into RuntimeError by PEP 479
                return
            sentences = self.return_sentences(item)
            if sentences:
                for sentence in sentences:
                    yield sentence

# Doc2vec and SVM Parameters

In [14]:
# Hyper-parameters for the Doc2Vec model; most are also encoded in the model
# directory name.
DOC2VEC_SIZE = 200  # Doc2Vec `size`
DOC2VEC_WINDOW = 2  # Doc2Vec `window`
DOC2VEC_MAX_VOCAB_SIZE = None  # Doc2Vec `max_vocab_size`
DOC2VEC_SAMPLE = 1e-3  # Doc2Vec `sample`
DOC2VEC_TYPE = 1  # Doc2Vec `dm`; 1 is labelled 'dm' in the model name, anything else 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # Doc2Vec `hs`
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # Doc2Vec `negative`
DOC2VEC_CONCAT = 0  # Doc2Vec `dm_concat`
DOC2VEC_MEAN = 1  # recorded in the model name as mean_<value>
DOC2VEC_TRAIN_WORDS = 0  # Doc2Vec `dbow_words`
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8  # number of passes made by the manual training loop
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000 # report vocab progress every x documents

## Create the Doc2vec model and create/load the vocab

In [15]:
# (level, model_name) configurations; only the first entry is used.
# Both values are handed to BatchWrapper as `level` / `level_type`, which uses
# them to filter documents by the structure of their id.
models = [
    (3, 'claims')
]
level, model_name = models[0]

In [21]:
# Directory that holds the vocabulary model and one sub-directory per trained epoch.
doc2vec_model_save_location = os.path.join(root_location,
                                           "parameter_search_doc2vec_models_" + str(level) + '_' + model_name,
                                           "full")
info("creating/loading vocabulary for " + str(level) + ' ' + model_name + ' in ' + doc2vec_model_save_location)

vocab_model_dir = os.path.join(doc2vec_model_save_location, VOCAB_MODEL)
vocab_model_file = os.path.join(vocab_model_dir, MODEL_PREFIX)
# makedirs also creates doc2vec_model_save_location when it is missing
if not os.path.exists(vocab_model_dir):
    os.makedirs(vocab_model_dir)

placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(DOC2VEC_SIZE,
                                                                DOC2VEC_WINDOW,
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE),
                                                                str(level) + '_' + model_name
                                                                )
GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
# the "{}" placeholder is filled with the epoch number by the training loop
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
info("FILE " + vocab_model_file)
doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE, window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT,
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm: dm=1 => PV-DM, any other value (conventionally 0) => PV-DBOW;
                # PV-DM dictates CBOW for words
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # pass the mean/sum switch explicitly so the model really uses the
                # value that is encoded in its name
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that for now
                dbow_words=DOC2VEC_TRAIN_WORDS,
                iter=DOC2VEC_EPOCHS)

GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

if not os.path.exists(vocab_model_file):
    # first run for this configuration: scan the training corpus once and
    # persist the resulting vocabulary-only model
    info("Creating vocab model")
    training_docs_iterator = BatchWrapper(training_preprocessed_files_prefix, batch_size=10000, level=level,
                                          level_type=model_name)
    doc2vec_model.build_vocab(sentences=training_docs_iterator, progress_per=REPORT_VOCAB_PROGRESS)
    doc2vec_model.save(vocab_model_file)
else:
    # reuse the saved vocabulary, keeping the hyper-parameters configured above
    info("Loading vocab model")
    doc2vec_model_vocab_model = Doc2Vec.load(vocab_model_file)
    doc2vec_model.reset_from(doc2vec_model_vocab_model)

2017-04-09 18:10:38,746 : INFO : creating/loading vocabulary for 3 claims in 
2017-04-09 18:10:38,749 : INFO : FILE /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/vocab_model/model
2017-04-09 18:10:38,752 : INFO : Loading vocab model
2017-04-09 18:10:38,753 : INFO : loading Doc2Vec object from /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/vocab_model/model
2017-04-09 18:11:05,017 : INFO : loading docvecs recursively from /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/vocab_model/model.docvecs.* with mmap=None
2017-04-09 18:11:05,019 : INFO : loading doctag_syn0 from /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/vocab_model/model.docvecs.doctag_syn0.npy with mmap=None
2017-04-09 18:11:12,531 : INFO : loading wv recursively from /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/vocab_model/model.wv.* with mmap=None
2017-04-09 18:11:12,532 : INFO : loading syn0 from /mnt/data/shalaby/parameter_search_doc2v

## Actual Training, validation and Metrics Loop

In [22]:
# Pin min_alpha so the learning rate does not decay inside an epoch
# (assumes the model's starting alpha is also 0.025 — TODO confirm against the
# gensim default); the training loop lowers alpha by DOC2VEC_ALPHA_DECREASE
# after every epoch instead.
doc2vec_model.min_alpha = 0.025
DOC2VEC_ALPHA_DECREASE = 0.001

In [23]:
# (Re)apply the current NUM_CORES value to the already constructed model.
doc2vec_model.workers = NUM_CORES

In [None]:
%%time
# when resuming, resume from an epoch with a previously created doc2vec model to get the learning rate right
# Manual epoch loop: each pass either loads the model already saved for that
# epoch or trains the in-memory model for one more epoch and saves it.
start_from = 1
for epoch in range(start_from, DOC2VEC_MAX_EPOCHS+1):
    # per-epoch model directory name, also read by the inference helper
    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    info("****************** Epoch {} --- Working on {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))
    
    # if we have the model, just load it, otherwise train the previous model
    if os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)):
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        doc2vec_model.workers = NUM_CORES
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
    else:
        # train the doc2vec model
        # a fresh BatchWrapper is needed per epoch: it is a single-pass iterator
        # backed by its own reader process
        training_docs_iterator = BatchWrapper(training_preprocessed_files_prefix, batch_size=10000, level=level,
                                          level_type=model_name)
        %time doc2vec_model.train(sentences=training_docs_iterator, report_delay=REPORT_DELAY)
        doc2vec_model.alpha -= DOC2VEC_ALPHA_DECREASE  # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        # the model is saved after the alpha update, so a reloaded epoch carries
        # the learning rate for the following epoch
        ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
        doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
        
    # only do the inference for higher epochs, as inference usually takes as much time as the actual training
    # (with DOC2VEC_MAX_EPOCHS = 8 this condition holds for the last epoch only)
    if epoch > 7:
        # Validation Embeddings
        info('Getting Validation Embeddings')
        Xv = get_extended_docs_with_inference_data_only(doc2vec_model, VALIDATION_DICT, 
                                         validation_preprocessed_files_prefix)

2017-04-09 18:12:43,128 : INFO : ****************** Epoch 1 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_1 *******************
2017-04-09 18:12:43,374 : INFO : training model with 8 workers on 67011 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=10 window=2
2017-04-09 18:12:43,378 : INFO : expecting 5145291 sentences, matching count from corpus used for vocabulary survey
2017-04-09 18:12:43,376 : INFO : Loading new file for index: 0
2017-04-09 18:12:44,387 : INFO : PROGRESS: at 0.01% examples, 88962 words/s, in_qsize 0, out_qsize 0
2017-04-09 18:13:04,477 : INFO : PROGRESS: at 0.36% examples, 155074 words/s, in_qsize 8, out_qsize 0
2017-04-09 18:13:24,553 : INFO : PROGRESS: at 0.73% examples, 160214 words/s, in_qsize 15, out_qsize 0
2017-04-09 18:13:24,882 : INFO : Loading new file for index: 10000
2017-04-09 18:13:44,596 : INFO : PROGRESS: at 1.13% examples, 164978 words/s, in_qsize 13, out

CPU times: user 2h 40min 26s, sys: 16min 2s, total: 2h 56min 29s
Wall time: 1h 26min 7s


2017-04-09 19:38:54,994 : INFO : not storing attribute syn0norm
2017-04-09 19:38:54,995 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_1/model.wv.syn0.npy
2017-04-09 19:38:55,028 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_1/model.syn1neg.npy
2017-04-09 19:38:55,059 : INFO : not storing attribute cum_table
2017-04-09 19:39:37,369 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_1/model
2017-04-09 19:39:37,372 : INFO : ****************** Epoch 2 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 43min 40s, sys: 17min 17s, total: 3h 57s
Wall time: 1h 27min 46s


2017-04-09 21:07:27,003 : INFO : not storing attribute syn0norm
2017-04-09 21:07:27,004 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_2/model.wv.syn0.npy
2017-04-09 21:07:27,039 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_2/model.syn1neg.npy
2017-04-09 21:07:27,070 : INFO : not storing attribute cum_table
2017-04-09 21:08:16,933 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_2/model
2017-04-09 21:08:16,935 : INFO : ****************** Epoch 3 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 42min 5s, sys: 17min 53s, total: 2h 59min 59s
Wall time: 1h 28min 50s


2017-04-09 22:37:11,326 : INFO : not storing attribute syn0norm
2017-04-09 22:37:11,326 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_3/model.wv.syn0.npy
2017-04-09 22:37:11,361 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_3/model.syn1neg.npy
2017-04-09 22:37:11,392 : INFO : not storing attribute cum_table
2017-04-09 22:37:55,133 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_3/model
2017-04-09 22:37:55,138 : INFO : ****************** Epoch 4 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 40min 10s, sys: 17min 31s, total: 2h 57min 42s
Wall time: 1h 28min 8s


2017-04-10 00:06:06,837 : INFO : not storing attribute syn0norm
2017-04-10 00:06:06,839 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_4/model.wv.syn0.npy
2017-04-10 00:06:06,874 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_4/model.syn1neg.npy
2017-04-10 00:06:06,905 : INFO : not storing attribute cum_table
2017-04-10 00:06:57,716 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_4/model
2017-04-10 00:06:57,718 : INFO : ****************** Epoch 5 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 40min 17s, sys: 17min 14s, total: 2h 57min 31s
Wall time: 1h 27min 28s


2017-04-10 01:34:30,016 : INFO : not storing attribute syn0norm
2017-04-10 01:34:30,017 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_5/model.wv.syn0.npy
2017-04-10 01:34:30,049 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_5/model.syn1neg.npy
2017-04-10 01:34:30,079 : INFO : not storing attribute cum_table
2017-04-10 01:35:12,050 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_5/model
2017-04-10 01:35:12,052 : INFO : ****************** Epoch 6 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 29min 19s, sys: 15min 36s, total: 2h 44min 55s
Wall time: 1h 22min 32s


2017-04-10 02:57:48,015 : INFO : not storing attribute syn0norm
2017-04-10 02:57:48,017 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_6/model.wv.syn0.npy
2017-04-10 02:57:48,078 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_6/model.syn1neg.npy
2017-04-10 02:57:48,123 : INFO : not storing attribute cum_table
2017-04-10 02:58:29,456 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_6/model
2017-04-10 02:58:29,459 : INFO : ****************** Epoch 7 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 37min 44s, sys: 16min 46s, total: 2h 54min 30s
Wall time: 1h 28min 37s


2017-04-10 04:27:09,910 : INFO : not storing attribute syn0norm
2017-04-10 04:27:09,911 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_7/model.wv.syn0.npy
2017-04-10 04:27:09,945 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_7/model.syn1neg.npy
2017-04-10 04:27:09,975 : INFO : not storing attribute cum_table
2017-04-10 04:27:58,603 : INFO : saved /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_7/model
2017-04-10 04:27:58,605 : INFO : ****************** Epoch 8 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabs

CPU times: user 2h 50min 27s, sys: 15min 52s, total: 3h 6min 19s

2017-04-10 06:48:14,241 : INFO : saving Doc2Vec object under /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_8/model, separately None
2017-04-10 06:48:14,776 : INFO : storing np array 'doctag_syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_8/model.docvecs.doctag_syn0.npy
2017-04-10 07:04:27,791 : INFO : not storing attribute syn0norm
2017-04-10 07:04:27,811 : INFO : storing np array 'syn0' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_3_claims/epoch_8/model.wv.syn0.npy
2017-04-10 07:04:28,112 : INFO : storing np array 'syn1neg' to /mnt/data/shalaby/parameter_search_doc2vec_models_3_claims/full/doc2vec_size_200_w_2_type_dm_conca


Wall time: 2h 20min 13s


NameError: global name 'BatchClass' is not defined

In [46]:
# Override for the inference run below: the inference helper reads NUM_CORES
# at call time to size its thread pool.
NUM_CORES = 32

In [47]:
# Infer (or load from the pickle cache) the validation embeddings for the
# current model; the cache location depends on GLOBAL_VARS.MODEL_NAME, which
# is set by the training loop.
Xv = get_extended_docs_with_inference_data_only(doc2vec_model, VALIDATION_DICT, 
                                         validation_preprocessed_files_prefix)

2017-04-09 00:47:30,381 : INFO : ===== Getting vectors with inference
2017-04-09 00:47:32,316 : INFO : Loading new file for index: 0
2017-04-09 00:48:09,470 : INFO : Finished: 100000 tags
2017-04-09 00:48:46,768 : INFO : Finished: 200000 tags
2017-04-09 00:49:24,072 : INFO : Finished: 300000 tags
2017-04-09 00:49:30,546 : INFO : Loading new file for index: 10000
2017-04-09 00:50:01,360 : INFO : Finished: 400000 tags
2017-04-09 00:50:39,518 : INFO : Finished: 500000 tags
2017-04-09 00:51:17,982 : INFO : Finished: 600000 tags
2017-04-09 00:51:31,647 : INFO : Loading new file for index: 20000
2017-04-09 00:51:56,195 : INFO : Finished: 700000 tags
2017-04-09 00:52:32,945 : INFO : Finished: 800000 tags
2017-04-09 00:53:10,328 : INFO : Finished: 900000 tags
2017-04-09 00:53:47,587 : INFO : Finished: 1000000 tags
2017-04-09 00:53:51,325 : INFO : Loading new file for index: 30000
2017-04-09 00:54:25,132 : INFO : Finished: 1100000 tags
2017-04-09 00:55:03,058 : INFO : Finished: 1200000 tags
201