## Creates the doc2vec vector embeddings for a specific configuration

In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import io
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import gzip

from multiprocessing import Process, Queue

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *
from thesis.utils.file import *

## Global variables used throughout the script

In [2]:
# Reset the root logger: remove every handler that is already attached (iterating
# over a copy of the list, since removeHandler mutates it) so that the
# basicConfig call below takes effect even when the cell is re-run.
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
# Fixed random seeds. DOC2VEC_SEED is handed to the Doc2Vec constructor as `seed`;
# SVM_SEED is not referenced in the visible part of this notebook.
SVM_SEED = 1234
DOC2VEC_SEED = 1234

In [4]:
# MIN_WORD_COUNT is passed to Doc2Vec as `min_count`.
MIN_WORD_COUNT = 100
# NUM_CORES is used both as the Doc2Vec `workers` count and as the size of the
# thread pool that runs inference.
NUM_CORES = 8

In [5]:
# Shared, notebook-wide state. The namedtuple class is never instantiated here:
# the code below assigns directly to attributes of the class itself
# (e.g. GLOBAL_VARS.MODEL_NAME = ...), using it as a mutable namespace.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 
                                         'SVM_MODEL_NAME', 'NN_MODEL_NAME'])

In [6]:
# Names of the files/directories stored under the model save location.
# VOCAB_MODEL is the sub-directory holding the vocabulary-only model and
# MODEL_PREFIX is the file name used when saving/loading a Doc2Vec model.
VOCAB_MODEL = "vocab_model"
MODEL_PREFIX = "model"
# VALIDATION_DICT is the file the inferred validation vectors are pickled to.
VALIDATION_DICT = "validation_dict.pkl"
# The remaining names are not referenced in the visible part of this notebook.
TEST_MATRIX = "test_matrix.pkl"
TEST_DICT = "test_dict.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"

In [7]:
# Base directory for all data, and the directory holding the exported pickles.
root_location = "/mnt/virtual-machines/data/"
exports_location = root_location + "exported_data/"

# Pickled document/classification metadata loaded at the start of the notebook.
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

# Directory with the preprocessed document chunks.
preprocessed_location = root_location + "preprocessed_data/extended_pv_abs_desc_claims_full_chunks/"

# File-name prefixes; the reader appends a numeric index and '.gz' to a prefix
# to obtain each chunk file name.
training_preprocessed_files_prefix = preprocessed_location + "extended_pv_training_docs_data_preprocessed-"
validation_preprocessed_files_prefix = preprocessed_location + "extended_pv_validation_docs_data_preprocessed-"
test_preprocessed_files_prefix = preprocessed_location + "extended_pv_test_docs_data_preprocessed-"

## Load general data required for classification

In [8]:
%%time
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in binary mode and closed deterministically instead of
    relying on garbage collection of an anonymous file handle.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Document -> classifications mapping plus the training/validation/test id lists.
doc_classification_map = _load_pickle(doc_classifications_map_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
test_docs_list = _load_pickle(test_docs_list_file)

CPU times: user 18.1 s, sys: 1.11 s, total: 19.3 s
Wall time: 19.3 s


In [9]:
len(training_docs_list)

1286325

In [10]:
len(validation_docs_list)

321473

In [11]:
len(test_docs_list)

401877

# Utility functions for data loading

In [12]:
VALIDATION_MINI_BATCH_SIZE = 10000
def get_extended_docs_with_inference_data_only(doc2vec_model, file_to_write, preprocessed_files_prefix, level, model_name):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation or test documents.

    The result is cached: if `file_to_write` already exists under
    doc2vec_model_save_location/GLOBAL_VARS.MODEL_NAME it is unpickled and
    returned, otherwise the vectors are inferred and pickled to that file.

    :param doc2vec_model: model whose `infer_vector` is called for every document
    :param file_to_write: file name of the pickle cache inside the current model directory
    :param preprocessed_files_prefix: prefix of the preprocessed chunk files, forwarded to BatchWrapper
    :param level: forwarded to BatchWrapper as `level`
    :param model_name: forwarded to BatchWrapper as `level_type`
    :return: dict mapping document id -> inferred vector
    """

    def infer_one_doc(doc_tuple):
        # doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
        doc_id, doc_tokens = doc_tuple
        return (doc_id, doc2vec_model.infer_vector(doc_tokens))

    cache_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, file_to_write)

    if os.path.exists(cache_path):
        info("===== Loading inference vectors")
        # binary mode + context manager: the handle is closed deterministically
        with open(cache_path, 'rb') as cache_file:
            inference_documents_reps = pickle.load(cache_file)
        info("Loaded inference vectors matrix")
        return inference_documents_reps

    inference_documents_reps = {}
    info("===== Getting vectors with inference")

    # Multi-threaded inference
    inference_docs_iterator = BatchWrapper(preprocessed_files_prefix, batch_size=None, level=level, level_type=model_name)
    docs = iter(inference_docs_iterator)
    docs_done = 0
    pool = ThreadPool(NUM_CORES)
    try:
        while True:
            # map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
            threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(docs, VALIDATION_MINI_BATCH_SIZE))
            if not threaded_reps_partial:
                break
            inference_documents_reps.update(threaded_reps_partial)
            # report the number of documents actually processed; the last batch
            # is usually smaller than the mini-batch size
            docs_done += len(threaded_reps_partial)
            info("Finished: {} tags".format(docs_done))
    finally:
        # Release the worker threads and the reader subprocess even when
        # inference fails half-way; terminating an already finished reader is harmless.
        pool.terminate()
        pool.join()
        inference_docs_iterator.p.terminate()

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(inference_documents_reps, cache_file)

    return inference_documents_reps

In [13]:
class ExtendedPVDocumentBatchGenerator(Process):
    """
    Producer process that streams the lines of consecutive gzip chunk files into a queue.

    Chunk files are named `<filename_prefix><index>.gz`; the index starts at
    `start_file` and grows by `offset` after every file. Every line read is put
    on `queue` unchanged, and a final `False` is put on the queue as an
    end-of-stream sentinel.

    :param filename_prefix: common prefix of the chunk files
    :param queue: queue the lines (and the final `False` sentinel) are put on
    :param batch_size: accepted for interface compatibility; not used by this class
    :param start_file: index of the first chunk file to read
    :param offset: step between the indices of two consecutive chunk files
    """
    def __init__(self, filename_prefix, queue, batch_size=10000, start_file=0, offset=10000):
        super(ExtendedPVDocumentBatchGenerator, self).__init__()
        self.queue = queue
        self.offset = offset
        self.filename_prefix = filename_prefix
        # index of the last file opened so far; starts one step before start_file
        self.files_loaded = start_file - offset

    def run(self):
        """Read chunk files in index order until one cannot be opened, then emit the sentinel."""
        while True:
            next_index = self.files_loaded + self.offset
            info("Loading new file for index: {}".format(str(next_index)))
            try:
                cur_file = io.BufferedReader(gzip.open(self.filename_prefix + str(next_index) + '.gz'))
            except IOError:
                # A chunk that cannot be opened marks the end of the data set.
                # The index reported here is that of the first missing file.
                info("All files are loaded - last file: {}".format(str(next_index)))
                break
            self.files_loaded = next_index
            try:
                for line in cur_file:
                    self.queue.put(line)
            except IOError:
                # A failure in the middle of a file is not a normal end of data:
                # make it visible in the log, but still fall through to the
                # sentinel so the consumer does not block forever on the queue.
                logging.exception("Failed while reading file for index: {} - stopping early".format(str(next_index)))
                break
            finally:
                # always release the handle, also when reading failed
                cur_file.close()
        self.queue.put(False, block=True, timeout=None)


class BatchWrapper(object):
    """
    Iterable over the documents produced by an ExtendedPVDocumentBatchGenerator subprocess.

    The reader subprocess is started in the constructor and feeds raw lines
    through a bounded queue. Each line is expected to be a space separated
    sequence whose first element is the document id and whose remaining
    elements are the tokens. Because the subprocess is consumed while
    iterating, an instance can be iterated only once.

    :param training_preprocessed_files_prefix: prefix of the chunk files to read
    :param buffer_size: maximum number of lines buffered in the queue
    :param batch_size: maximum number of tokens per emitted LabeledSentence
                       (at most 10000); None emits one plain
                       `(doc_id, tokens)` tuple per document instead
    :param level: number of '_'-separated parts a document id must have to be emitted
    :param level_type: optional filter; only its first character is kept and
                       compared with the first character of the second id part
    """
    def __init__(self, training_preprocessed_files_prefix, buffer_size=10000, batch_size=10000, level=1, level_type=None):
        # test for None first: comparing None with an int raises TypeError on Python 3
        assert batch_size is None or batch_size <= 10000
        self.level = level
        self.level_type = level_type[0] if level_type is not None else None
        self.batch_size = batch_size
        self.q = Queue(maxsize=buffer_size)
        self.p = ExtendedPVDocumentBatchGenerator(training_preprocessed_files_prefix, queue=self.q,
                                                  batch_size=batch_size, start_file=0, offset=10000)
        self.p.start()
        self.cur_data = []

    def is_correct_type(self, doc_id):
        """Return True when `doc_id` has exactly `self.level` parts and passes the type filter."""
        parts = doc_id.split("_")
        if len(parts) != self.level:
            return False
        if len(parts) == 1:
            # ids without a type part are never filtered by type
            return True
        # check the "no filter" case first so that an empty second part cannot
        # raise IndexError; startswith is the safe form of parts[1][0] == ...
        return self.level_type is None or parts[1].startswith(self.level_type)

    def return_sentences(self, line):
        """
        Convert one raw line into the items to emit for it.

        :return: False when the document id is filtered out; otherwise a tuple
                 holding either a single `(doc_id, tokens)` pair (batch_size
                 None) or LabeledSentence objects of at most batch_size tokens each
        """
        # NOTE(review): the line is split as-is, so a trailing newline stays
        # attached to the last token - confirm this is intended.
        line_array = tuple(line.split(" "))
        doc_id = line_array[0]
        if not self.is_correct_type(doc_id):
            return False
        line_array = line_array[1:]

        if self.batch_size is None:
            # dont use LabeledSentence for validation iterator
            return ((doc_id, line_array),)

        # divide the document to batches according to the batch size
        return tuple(
            LabeledSentence(words=line_array[start: start + self.batch_size], tags=[doc_id])
            for start in range(0, len(line_array), self.batch_size)
        )

    def __iter__(self):
        """Yield the items of every accepted document until the end-of-stream sentinel arrives."""
        while True:
            item = self.q.get(block=True)
            if item is False:
                # sentinel from the producer: stop the subprocess and finish.
                # A plain return ends the generator; raising StopIteration here
                # would turn into RuntimeError under PEP 479 (Python 3.7+).
                self.p.terminate()
                return
            sentences = self.return_sentences(item)
            if not sentences:
                # filtered out (False) or a document without tokens (empty tuple)
                continue
            for sentence in sentences:
                yield sentence


# Doc2vec and SVM Parameters

In [14]:
# Hyper-parameters of the Doc2Vec model. Most of them are handed to the Doc2Vec
# constructor and are also encoded in the name of the model directory.
DOC2VEC_SIZE = 200  # `size`
DOC2VEC_WINDOW = 2  # `window`
DOC2VEC_MAX_VOCAB_SIZE = None  # `max_vocab_size`
DOC2VEC_SAMPLE = 1e-3  # `sample`
DOC2VEC_TYPE = 1  # `dm`; the model name uses 'dm' for 1 and 'pv-dbow' otherwise
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # `hs`
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # `negative`
DOC2VEC_CONCAT = 0  # `dm_concat`
DOC2VEC_MEAN = 1  # recorded in the model name
DOC2VEC_TRAIN_WORDS = 0  # `dbow_words`
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8  # number of manual epochs run by the training loop
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000 # report vocab progress every x documents

## Create the Doc2vec model and create/load the vocab

In [15]:
# (level, model_name) configurations; only the first entry is used. Both values
# are forwarded to BatchWrapper (as `level` and `level_type`) and are part of
# the model directory name.
models = [
    (2, 'claims')
]
level, model_name = models[0]

In [16]:
info("creating/loading vocabulary for " + str(level) + ' ' + model_name + ' in ')
# All models of this configuration live under
# <root_location>/parameter_search_doc2vec_models_recalc_<level>_<model_name>/full
doc2vec_model_save_location = os.path.join(root_location,
                                           "parameter_search_doc2vec_models_recalc_" + str(level) + '_' + model_name,
                                           "full")
if not os.path.exists(doc2vec_model_save_location):
    os.makedirs(doc2vec_model_save_location)
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL)):
    os.makedirs(os.path.join(doc2vec_model_save_location, VOCAB_MODEL))

# The directory name encodes every hyper-parameter of the configuration.
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(DOC2VEC_SIZE,
                                                                DOC2VEC_WINDOW,
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE),
                                                                str(level) + '_' + model_name
                                                                )
GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
# From here on the name is a template with one slot for the epoch number.
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
info("FILE " + os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE, window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT,
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm dm=1 => PV-DM, any other value (e.g. 0) => PV-DBOW
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # DOC2VEC_MEAN is part of the model name, so it has to be applied
                # to the model as well; otherwise the name could claim a setting
                # the model was not trained with
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that for now
                dbow_words=DOC2VEC_TRAIN_WORDS,
                iter=DOC2VEC_EPOCHS)

GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model


2017-04-19 02:01:21,478 : INFO : creating/loading vocabulary for 2 claims in 
2017-04-19 02:01:21,479 : INFO : FILE /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/vocab_model/model


In [17]:
# The vocabulary is expensive to build, so it is computed once, saved as a
# vocabulary-only model and reused on later runs.
vocab_model_file = os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX)
if os.path.exists(vocab_model_file):
    info("Loading vocab model")
    doc2vec_model_vocab_model = Doc2Vec.load(vocab_model_file)
    # copy the vocabulary of the saved model into the freshly configured one
    doc2vec_model.reset_from(doc2vec_model_vocab_model)
else:
    info("Creating vocab model")
    training_docs_iterator = BatchWrapper(training_preprocessed_files_prefix,
                                          batch_size=10000,
                                          level=level,
                                          level_type=model_name)
    doc2vec_model.build_vocab(sentences=training_docs_iterator,
                              progress_per=REPORT_VOCAB_PROGRESS)
    doc2vec_model.save(vocab_model_file)

2017-04-19 02:01:25,416 : INFO : Creating vocab model
2017-04-19 02:01:25,455 : INFO : collecting all words and their counts
2017-04-19 02:01:25,457 : INFO : Loading new file for index: 0
2017-04-19 02:01:25,466 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-04-19 02:01:47,435 : INFO : Loading new file for index: 10000
2017-04-19 02:02:08,779 : INFO : Loading new file for index: 20000
2017-04-19 02:02:32,503 : INFO : Loading new file for index: 30000
2017-04-19 02:02:56,733 : INFO : Loading new file for index: 40000
2017-04-19 02:03:20,681 : INFO : Loading new file for index: 50000
2017-04-19 02:03:44,844 : INFO : Loading new file for index: 60000
2017-04-19 02:04:08,454 : INFO : Loading new file for index: 70000
2017-04-19 02:04:32,564 : INFO : Loading new file for index: 80000
2017-04-19 02:04:56,213 : INFO : Loading new file for index: 90000
2017-04-19 02:05:19,723 : INFO : Loading new file for index: 100000
2017-04-19 02:05:20,245 : INFO : PROG

2017-04-19 02:37:32,967 : INFO : PROGRESS: at example #1000000, processed 1082895120 words (546059/s), 1535141 word types, 999495 tags
2017-04-19 02:37:33,362 : INFO : Loading new file for index: 1000000
2017-04-19 02:37:53,204 : INFO : Loading new file for index: 1010000
2017-04-19 02:38:13,986 : INFO : Loading new file for index: 1020000
2017-04-19 02:38:34,521 : INFO : Loading new file for index: 1030000
2017-04-19 02:38:55,123 : INFO : Loading new file for index: 1040000
2017-04-19 02:39:16,304 : INFO : Loading new file for index: 1050000
2017-04-19 02:39:36,095 : INFO : Loading new file for index: 1060000
2017-04-19 02:39:57,258 : INFO : Loading new file for index: 1070000
2017-04-19 02:40:16,749 : INFO : Loading new file for index: 1080000
2017-04-19 02:40:36,479 : INFO : Loading new file for index: 1090000
2017-04-19 02:40:56,435 : INFO : PROGRESS: at example #1100000, processed 1190930945 words (530974/s), 1638926 word types, 1099439 tags
2017-04-19 02:40:56,972 : INFO : Loadin

## Actual Training, validation and Metrics Loop

In [18]:
# NOTE(review): presumably equal to the model's initial alpha, so that the
# learning rate stays constant within an epoch - confirm against the gensim default.
doc2vec_model.min_alpha = 0.025
# amount subtracted from the learning rate after every manual epoch in the training loop
DOC2VEC_ALPHA_DECREASE = 0.001

In [19]:
# (re)apply the worker count to the model that is about to be trained
doc2vec_model.workers = NUM_CORES

In [20]:
%%time
# when resuming, resume from an epoch with a previously created doc2vec model to get the learning rate right
start_from = 1
for epoch in range(start_from, DOC2VEC_MAX_EPOCHS+1):
    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    info("****************** Epoch {} --- Working on {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))
    
    # if we have the model, just load it, otherwise train the previous model
    if os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)):
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        doc2vec_model.workers = NUM_CORES
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
    else:
        # train the doc2vec model
        training_docs_iterator = BatchWrapper(training_preprocessed_files_prefix, batch_size=10000, level=level,
                                          level_type=model_name)
        %time doc2vec_model.train(sentences=training_docs_iterator, report_delay=REPORT_DELAY)
        doc2vec_model.alpha -= DOC2VEC_ALPHA_DECREASE  # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
        doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
        
    # only do the inference for higher epochs, as inference usually takes as much time as the actual training
    if epoch == 3 or epoch == 5:
        # Validation Embeddings
        info('Getting Validation Embeddings')
        Xv = get_extended_docs_with_inference_data_only(doc2vec_model, VALIDATION_DICT, 
                                         validation_preprocessed_files_prefix, level, model_name)

2017-04-19 02:47:36,132 : INFO : ****************** Epoch 1 --- Working on doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1 *******************
2017-04-19 02:47:36,205 : INFO : training model with 8 workers on 64496 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=10 window=2
2017-04-19 02:47:36,207 : INFO : expecting 1286959 sentences, matching count from corpus used for vocabulary survey
2017-04-19 02:47:36,206 : INFO : Loading new file for index: 0
2017-04-19 02:47:37,245 : INFO : PROGRESS: at 0.02% examples, 198512 words/s, in_qsize 0, out_qsize 0
2017-04-19 02:47:57,267 : INFO : PROGRESS: at 0.53% examples, 226068 words/s, in_qsize 0, out_qsize 0
2017-04-19 02:48:05,374 : INFO : Loading new file for index: 10000
2017-04-19 02:48:17,289 : INFO : PROGRESS: at 1.07% examples, 232515 words/s, in_qsize 0, out_qsize 0
2017-04-19 02:48:34,829 : INFO : Loading new file for index: 20000
2017-04-19 02:48:37,311 : I

2017-04-19 03:05:06,626 : INFO : Loading new file for index: 350000
2017-04-19 03:05:18,153 : INFO : PROGRESS: at 27.46% examples, 231337 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:05:38,180 : INFO : PROGRESS: at 27.94% examples, 230994 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:05:39,316 : INFO : Loading new file for index: 360000
2017-04-19 03:05:58,213 : INFO : PROGRESS: at 28.43% examples, 230749 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:06:11,847 : INFO : Loading new file for index: 370000
2017-04-19 03:06:18,235 : INFO : PROGRESS: at 28.90% examples, 230369 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:06:38,241 : INFO : PROGRESS: at 29.41% examples, 230431 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:06:42,959 : INFO : Loading new file for index: 380000
2017-04-19 03:06:58,242 : INFO : PROGRESS: at 29.89% examples, 230195 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:07:14,509 : INFO : Loading new file for index: 390000
2017-04-19 03:07:18,252 : INFO : PROGRESS:

2017-04-19 03:25:01,180 : INFO : Loading new file for index: 650000
2017-04-19 03:25:19,399 : INFO : PROGRESS: at 50.80% examples, 200944 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:25:39,412 : INFO : PROGRESS: at 51.13% examples, 200457 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:25:48,899 : INFO : Loading new file for index: 660000
2017-04-19 03:25:59,425 : INFO : PROGRESS: at 51.46% examples, 199997 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:26:19,498 : INFO : PROGRESS: at 51.80% examples, 199573 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:26:36,021 : INFO : Loading new file for index: 670000
2017-04-19 03:26:39,518 : INFO : PROGRESS: at 52.12% examples, 199133 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:26:59,518 : INFO : PROGRESS: at 52.45% examples, 198692 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:27:19,552 : INFO : PROGRESS: at 52.78% examples, 198245 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:27:23,639 : INFO : Loading new file for index: 680000
2017-04

2017-04-19 03:46:01,217 : INFO : PROGRESS: at 71.27% examples, 181893 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:46:14,450 : INFO : Loading new file for index: 920000
2017-04-19 03:46:21,248 : INFO : PROGRESS: at 71.61% examples, 181698 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:46:41,257 : INFO : PROGRESS: at 71.95% examples, 181508 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:47:00,780 : INFO : Loading new file for index: 930000
2017-04-19 03:47:01,293 : INFO : PROGRESS: at 72.28% examples, 181334 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:47:21,323 : INFO : PROGRESS: at 72.62% examples, 181147 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:47:41,336 : INFO : PROGRESS: at 72.96% examples, 180975 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:47:46,765 : INFO : Loading new file for index: 940000
2017-04-19 03:48:01,347 : INFO : PROGRESS: at 73.30% examples, 180818 words/s, in_qsize 0, out_qsize 0
2017-04-19 03:48:21,354 : INFO : PROGRESS: at 73.63% examples, 180663 words

2017-04-19 04:06:42,506 : INFO : PROGRESS: at 92.03% examples, 173344 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:07:02,564 : INFO : PROGRESS: at 92.37% examples, 173237 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:07:10,120 : INFO : Loading new file for index: 1190000
2017-04-19 04:07:22,595 : INFO : PROGRESS: at 92.70% examples, 173138 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:07:42,617 : INFO : PROGRESS: at 93.04% examples, 173053 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:07:55,789 : INFO : Loading new file for index: 1200000
2017-04-19 04:08:02,636 : INFO : PROGRESS: at 93.39% examples, 172979 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:08:22,655 : INFO : PROGRESS: at 93.71% examples, 172861 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:08:42,328 : INFO : Loading new file for index: 1210000
2017-04-19 04:08:42,689 : INFO : PROGRESS: at 94.05% examples, 172758 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:09:02,703 : INFO : PROGRESS: at 94.38% examples, 172661 wo

CPU times: user 2h 52min 41s, sys: 6min 17s, total: 2h 58min 59s
Wall time: 1h 26min 56s


2017-04-19 04:14:33,813 : INFO : not storing attribute syn0norm
2017-04-19 04:14:33,814 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1/model.wv.syn0.npy
2017-04-19 04:14:33,852 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1/model.syn1neg.npy
2017-04-19 04:14:33,898 : INFO : not storing attribute cum_table
2017-04-19 04:14:47,829 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1/model
2017-04-19 04:14:47,830 : INFO : ****************** Epoch 2 --- Working on doc2vec_size_200_w_2_type_dm

2017-04-19 04:32:10,517 : INFO : PROGRESS: at 17.44% examples, 149782 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:32:30,552 : INFO : PROGRESS: at 17.79% examples, 149882 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:32:34,685 : INFO : Loading new file for index: 230000
2017-04-19 04:32:50,570 : INFO : PROGRESS: at 18.13% examples, 149910 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:33:10,599 : INFO : PROGRESS: at 18.46% examples, 149931 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:33:20,838 : INFO : Loading new file for index: 240000
2017-04-19 04:33:30,652 : INFO : PROGRESS: at 18.79% examples, 149864 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:33:50,684 : INFO : PROGRESS: at 19.13% examples, 149925 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:34:07,209 : INFO : Loading new file for index: 250000
2017-04-19 04:34:10,686 : INFO : PROGRESS: at 19.47% examples, 149926 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:34:30,741 : INFO : PROGRESS: at 19.80% examples, 149892 words

2017-04-19 04:52:51,997 : INFO : PROGRESS: at 38.46% examples, 150661 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:53:12,006 : INFO : PROGRESS: at 38.80% examples, 150671 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:53:14,642 : INFO : Loading new file for index: 500000
2017-04-19 04:53:32,035 : INFO : PROGRESS: at 39.15% examples, 150703 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:53:52,061 : INFO : PROGRESS: at 39.49% examples, 150710 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:54:00,433 : INFO : Loading new file for index: 510000
2017-04-19 04:54:12,072 : INFO : PROGRESS: at 39.82% examples, 150684 words/s, in_qsize 1, out_qsize 0
2017-04-19 04:54:32,102 : INFO : PROGRESS: at 40.16% examples, 150690 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:54:45,760 : INFO : Loading new file for index: 520000
2017-04-19 04:54:52,111 : INFO : PROGRESS: at 40.51% examples, 150723 words/s, in_qsize 0, out_qsize 0
2017-04-19 04:55:12,117 : INFO : PROGRESS: at 40.86% examples, 150747 words

2017-04-19 05:13:33,442 : INFO : PROGRESS: at 59.55% examples, 151148 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:13:50,827 : INFO : Loading new file for index: 770000
2017-04-19 05:13:53,508 : INFO : PROGRESS: at 59.88% examples, 151150 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:14:13,560 : INFO : PROGRESS: at 60.23% examples, 151143 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:14:33,619 : INFO : PROGRESS: at 60.57% examples, 151153 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:14:36,473 : INFO : Loading new file for index: 780000
2017-04-19 05:14:53,653 : INFO : PROGRESS: at 60.90% examples, 151142 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:15:13,713 : INFO : PROGRESS: at 61.23% examples, 151132 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:15:23,697 : INFO : Loading new file for index: 790000
2017-04-19 05:15:33,777 : INFO : PROGRESS: at 61.57% examples, 151139 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:15:53,785 : INFO : PROGRESS: at 61.92% examples, 151158 words

2017-04-19 05:34:15,004 : INFO : PROGRESS: at 80.65% examples, 151356 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:34:25,101 : INFO : Loading new file for index: 1040000
2017-04-19 05:34:35,037 : INFO : PROGRESS: at 81.00% examples, 151363 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:34:55,041 : INFO : PROGRESS: at 81.34% examples, 151356 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:35:12,269 : INFO : Loading new file for index: 1050000
2017-04-19 05:35:15,065 : INFO : PROGRESS: at 81.65% examples, 151293 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:35:35,074 : INFO : PROGRESS: at 81.99% examples, 151280 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:35:55,121 : INFO : PROGRESS: at 82.33% examples, 151279 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:35:58,469 : INFO : Loading new file for index: 1060000
2017-04-19 05:36:15,126 : INFO : PROGRESS: at 82.67% examples, 151282 words/s, in_qsize 0, out_qsize 0
2017-04-19 05:36:35,209 : INFO : PROGRESS: at 83.02% examples, 151298 wo

2017-04-19 05:53:04,897 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-19 05:53:04,903 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-19 05:53:04,904 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-19 05:53:04,905 : INFO : training on 1393295375 raw words (894188102 effective words) took 5897.0s, 151635 effective words/s
2017-04-19 05:53:04,906 : INFO : saving Doc2Vec object under /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_2/model, separately None
2017-04-19 05:53:04,907 : INFO : storing np array 'doctag_syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_2/model.docvecs.doctag_syn0.npy


CPU times: user 3h 6min 39s, sys: 7min 2s, total: 3h 13min 41s
Wall time: 1h 38min 16s


2017-04-19 05:53:05,782 : INFO : not storing attribute syn0norm
2017-04-19 05:53:05,783 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_2/model.wv.syn0.npy
2017-04-19 05:53:05,822 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_2/model.syn1neg.npy
2017-04-19 05:53:05,866 : INFO : not storing attribute cum_table
2017-04-19 05:53:17,469 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_2/model
2017-04-19 05:53:17,471 : INFO : ****************** Epoch 3 --- Working on doc2vec_size_200_w_2_type_dm

2017-04-19 06:10:39,967 : INFO : PROGRESS: at 17.72% examples, 152253 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:10:47,835 : INFO : Loading new file for index: 230000
2017-04-19 06:10:59,969 : INFO : PROGRESS: at 18.06% examples, 152215 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:11:19,976 : INFO : PROGRESS: at 18.39% examples, 152119 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:11:34,515 : INFO : Loading new file for index: 240000
2017-04-19 06:11:40,006 : INFO : PROGRESS: at 18.72% examples, 152060 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:12:00,023 : INFO : PROGRESS: at 19.07% examples, 152088 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:12:20,078 : INFO : PROGRESS: at 19.41% examples, 152097 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:12:20,456 : INFO : Loading new file for index: 250000
2017-04-19 06:12:40,099 : INFO : PROGRESS: at 19.74% examples, 152014 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:13:00,106 : INFO : PROGRESS: at 20.08% examples, 152063 words

2017-04-19 06:31:21,533 : INFO : PROGRESS: at 38.83% examples, 152115 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:31:22,393 : INFO : Loading new file for index: 500000
2017-04-19 06:31:41,586 : INFO : PROGRESS: at 39.18% examples, 152134 words/s, in_qsize 1, out_qsize 0
2017-04-19 06:32:01,602 : INFO : PROGRESS: at 39.52% examples, 152127 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:32:08,180 : INFO : Loading new file for index: 510000
2017-04-19 06:32:21,653 : INFO : PROGRESS: at 39.85% examples, 152086 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:32:41,660 : INFO : PROGRESS: at 40.20% examples, 152093 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:32:53,249 : INFO : Loading new file for index: 520000
2017-04-19 06:33:01,700 : INFO : PROGRESS: at 40.55% examples, 152117 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:33:21,711 : INFO : PROGRESS: at 40.89% examples, 152127 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:33:38,225 : INFO : Loading new file for index: 530000
2017-04

2017-04-19 06:51:51,920 : INFO : Loading new file for index: 770000
2017-04-19 06:52:03,166 : INFO : PROGRESS: at 60.04% examples, 152387 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:52:23,185 : INFO : PROGRESS: at 60.38% examples, 152383 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:52:36,607 : INFO : Loading new file for index: 780000
2017-04-19 06:52:43,244 : INFO : PROGRESS: at 60.72% examples, 152389 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:53:03,258 : INFO : PROGRESS: at 61.06% examples, 152378 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:53:23,325 : INFO : PROGRESS: at 61.38% examples, 152349 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:53:23,894 : INFO : Loading new file for index: 790000
2017-04-19 06:53:43,341 : INFO : PROGRESS: at 61.73% examples, 152358 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:54:03,378 : INFO : PROGRESS: at 62.07% examples, 152340 words/s, in_qsize 0, out_qsize 0
2017-04-19 06:54:09,487 : INFO : Loading new file for index: 800000
2017-04

2017-04-19 07:12:44,438 : INFO : PROGRESS: at 81.20% examples, 152361 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:13:04,450 : INFO : PROGRESS: at 81.50% examples, 152281 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:13:10,466 : INFO : Loading new file for index: 1050000
2017-04-19 07:13:24,508 : INFO : PROGRESS: at 81.85% examples, 152285 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:13:44,517 : INFO : PROGRESS: at 82.19% examples, 152288 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:13:56,064 : INFO : Loading new file for index: 1060000
2017-04-19 07:14:04,552 : INFO : PROGRESS: at 82.53% examples, 152277 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:14:24,619 : INFO : PROGRESS: at 82.88% examples, 152280 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:14:40,734 : INFO : Loading new file for index: 1070000
2017-04-19 07:14:44,664 : INFO : PROGRESS: at 83.23% examples, 152297 words/s, in_qsize 0, out_qsize 0
2017-04-19 07:15:04,721 : INFO : PROGRESS: at 83.58% examples, 152322 wo

2017-04-19 07:31:02,960 : INFO : training on 1393295375 raw words (894189356 effective words) took 5865.4s, 152452 effective words/s
2017-04-19 07:31:02,961 : INFO : saving Doc2Vec object under /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_3/model, separately None
2017-04-19 07:31:02,962 : INFO : storing np array 'doctag_syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_3/model.docvecs.doctag_syn0.npy


CPU times: user 3h 4min 30s, sys: 7min, total: 3h 11min 30s
Wall time: 1h 37min 45s


2017-04-19 07:31:03,826 : INFO : not storing attribute syn0norm
2017-04-19 07:31:03,828 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_3/model.wv.syn0.npy
2017-04-19 07:31:03,877 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_3/model.syn1neg.npy
2017-04-19 07:31:03,912 : INFO : not storing attribute cum_table
2017-04-19 07:31:17,568 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_3/model
2017-04-19 07:31:17,570 : INFO : Getting Validation Embeddings
2017-04-19 07:31:17,571 : INFO : ===== G

2017-04-19 08:10:33,861 : INFO : PROGRESS: at 7.38% examples, 182944 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:10:51,636 : INFO : Loading new file for index: 100000
2017-04-19 08:10:53,895 : INFO : PROGRESS: at 7.80% examples, 183146 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:11:13,940 : INFO : PROGRESS: at 8.20% examples, 183017 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:11:29,978 : INFO : Loading new file for index: 110000
2017-04-19 08:11:33,967 : INFO : PROGRESS: at 8.61% examples, 182854 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:11:54,008 : INFO : PROGRESS: at 9.02% examples, 182918 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:12:08,360 : INFO : Loading new file for index: 120000
2017-04-19 08:12:14,012 : INFO : PROGRESS: at 9.42% examples, 182867 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:12:34,046 : INFO : PROGRESS: at 9.83% examples, 182896 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:12:46,064 : INFO : Loading new file for index: 130000
2017-04-19 08:

2017-04-19 08:30:22,371 : INFO : Loading new file for index: 410000
2017-04-19 08:30:35,095 : INFO : PROGRESS: at 32.12% examples, 183963 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:30:55,110 : INFO : PROGRESS: at 32.53% examples, 183978 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:30:59,766 : INFO : Loading new file for index: 420000
2017-04-19 08:31:15,112 : INFO : PROGRESS: at 32.94% examples, 183938 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:31:35,142 : INFO : PROGRESS: at 33.35% examples, 183900 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:31:37,780 : INFO : Loading new file for index: 430000
2017-04-19 08:31:55,162 : INFO : PROGRESS: at 33.76% examples, 183880 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:32:15,184 : INFO : PROGRESS: at 34.18% examples, 183920 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:32:15,424 : INFO : Loading new file for index: 440000
2017-04-19 08:32:35,195 : INFO : PROGRESS: at 34.60% examples, 183951 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 08:50:16,262 : INFO : PROGRESS: at 56.53% examples, 184407 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:50:25,647 : INFO : Loading new file for index: 730000
2017-04-19 08:50:36,270 : INFO : PROGRESS: at 56.95% examples, 184402 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:50:56,310 : INFO : PROGRESS: at 57.36% examples, 184398 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:51:03,196 : INFO : Loading new file for index: 740000
2017-04-19 08:51:16,335 : INFO : PROGRESS: at 57.77% examples, 184380 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:51:36,368 : INFO : PROGRESS: at 58.18% examples, 184365 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:51:40,899 : INFO : Loading new file for index: 750000
2017-04-19 08:51:56,374 : INFO : PROGRESS: at 58.61% examples, 184404 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:52:16,421 : INFO : PROGRESS: at 59.01% examples, 184366 words/s, in_qsize 0, out_qsize 0
2017-04-19 08:52:18,782 : INFO : Loading new file for index: 760000
2017-04

2017-04-19 09:09:57,614 : INFO : PROGRESS: at 81.05% examples, 184721 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:10:17,638 : INFO : PROGRESS: at 81.43% examples, 184612 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:10:26,056 : INFO : Loading new file for index: 1050000
2017-04-19 09:10:37,649 : INFO : PROGRESS: at 81.85% examples, 184612 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:10:57,659 : INFO : PROGRESS: at 82.25% examples, 184588 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:11:04,315 : INFO : Loading new file for index: 1060000
2017-04-19 09:11:17,680 : INFO : PROGRESS: at 82.65% examples, 184565 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:11:37,717 : INFO : PROGRESS: at 83.08% examples, 184581 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:11:41,588 : INFO : Loading new file for index: 1070000
2017-04-19 09:11:57,727 : INFO : PROGRESS: at 83.51% examples, 184612 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:12:17,730 : INFO : PROGRESS: at 83.93% examples, 184621 wo

CPU times: user 3h 2min 6s, sys: 6min 29s, total: 3h 8min 36s
Wall time: 1h 20min 39s


2017-04-19 09:25:12,504 : INFO : not storing attribute syn0norm
2017-04-19 09:25:12,505 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_4/model.wv.syn0.npy
2017-04-19 09:25:12,544 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_4/model.syn1neg.npy
2017-04-19 09:25:12,578 : INFO : not storing attribute cum_table
2017-04-19 09:25:24,479 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_4/model
2017-04-19 09:25:24,481 : INFO : ****************** Epoch 5 --- Working on doc2vec_size_200_w_2_type_dm

2017-04-19 09:41:52,120 : INFO : Loading new file for index: 260000
2017-04-19 09:42:06,632 : INFO : PROGRESS: at 20.49% examples, 183132 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:42:26,662 : INFO : PROGRESS: at 20.91% examples, 183254 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:42:29,674 : INFO : Loading new file for index: 270000
2017-04-19 09:42:46,683 : INFO : PROGRESS: at 21.31% examples, 183247 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:43:06,753 : INFO : PROGRESS: at 21.72% examples, 183191 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:43:08,101 : INFO : Loading new file for index: 280000
2017-04-19 09:43:26,759 : INFO : PROGRESS: at 22.13% examples, 183170 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:43:45,244 : INFO : Loading new file for index: 290000
2017-04-19 09:43:46,792 : INFO : PROGRESS: at 22.55% examples, 183222 words/s, in_qsize 0, out_qsize 0
2017-04-19 09:44:06,818 : INFO : PROGRESS: at 22.96% examples, 183216 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 10:01:47,972 : INFO : PROGRESS: at 44.99% examples, 184364 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:01:51,695 : INFO : Loading new file for index: 580000
2017-04-19 10:02:07,992 : INFO : PROGRESS: at 45.40% examples, 184382 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:02:28,018 : INFO : PROGRESS: at 45.82% examples, 184374 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:02:29,018 : INFO : Loading new file for index: 590000
2017-04-19 10:02:48,068 : INFO : PROGRESS: at 46.24% examples, 184390 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:03:06,775 : INFO : Loading new file for index: 600000
2017-04-19 10:03:08,075 : INFO : PROGRESS: at 46.65% examples, 184419 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:03:28,097 : INFO : PROGRESS: at 47.07% examples, 184484 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:03:43,831 : INFO : Loading new file for index: 610000
2017-04-19 10:03:48,108 : INFO : PROGRESS: at 47.49% examples, 184502 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 10:21:29,104 : INFO : PROGRESS: at 69.42% examples, 184558 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:21:49,145 : INFO : PROGRESS: at 69.83% examples, 184563 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:21:54,431 : INFO : Loading new file for index: 900000
2017-04-19 10:22:09,158 : INFO : PROGRESS: at 70.25% examples, 184566 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:22:29,194 : INFO : PROGRESS: at 70.67% examples, 184583 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:22:31,685 : INFO : Loading new file for index: 910000
2017-04-19 10:22:49,202 : INFO : PROGRESS: at 71.08% examples, 184604 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:23:08,777 : INFO : Loading new file for index: 920000
2017-04-19 10:23:09,209 : INFO : PROGRESS: at 71.51% examples, 184608 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:23:29,240 : INFO : PROGRESS: at 71.92% examples, 184580 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:23:46,244 : INFO : Loading new file for index: 930000
2017-04

2017-04-19 10:41:10,595 : INFO : PROGRESS: at 93.88% examples, 184606 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:41:18,352 : INFO : Loading new file for index: 1210000
2017-04-19 10:41:30,629 : INFO : PROGRESS: at 94.29% examples, 184611 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:41:50,642 : INFO : PROGRESS: at 94.70% examples, 184594 words/s, in_qsize 0, out_qsize 1
2017-04-19 10:41:56,469 : INFO : Loading new file for index: 1220000
2017-04-19 10:42:10,645 : INFO : PROGRESS: at 95.11% examples, 184609 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:42:30,652 : INFO : PROGRESS: at 95.53% examples, 184610 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:42:33,912 : INFO : Loading new file for index: 1230000
2017-04-19 10:42:50,654 : INFO : PROGRESS: at 95.94% examples, 184638 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:43:10,677 : INFO : PROGRESS: at 96.37% examples, 184677 words/s, in_qsize 0, out_qsize 0
2017-04-19 10:43:10,756 : INFO : Loading new file for index: 1240000
201

CPU times: user 3h 29s, sys: 6min 32s, total: 3h 7min 1s
Wall time: 1h 20min 39s


2017-04-19 10:46:04,757 : INFO : not storing attribute syn0norm
2017-04-19 10:46:04,758 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_5/model.wv.syn0.npy
2017-04-19 10:46:04,808 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_5/model.syn1neg.npy
2017-04-19 10:46:04,845 : INFO : not storing attribute cum_table
2017-04-19 10:46:16,679 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_5/model
2017-04-19 10:46:16,681 : INFO : Getting Validation Embeddings
2017-04-19 10:46:16,682 : INFO : ===== G

2017-04-19 11:25:13,163 : INFO : Loading new file for index: 100000
2017-04-19 11:25:13,662 : INFO : PROGRESS: at 7.76% examples, 192315 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:25:33,685 : INFO : PROGRESS: at 8.19% examples, 192136 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:25:49,938 : INFO : Loading new file for index: 110000
2017-04-19 11:25:53,717 : INFO : PROGRESS: at 8.61% examples, 191890 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:26:13,751 : INFO : PROGRESS: at 9.03% examples, 191800 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:26:26,906 : INFO : Loading new file for index: 120000
2017-04-19 11:26:33,761 : INFO : PROGRESS: at 9.45% examples, 191751 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:26:53,782 : INFO : PROGRESS: at 9.88% examples, 191778 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:27:02,967 : INFO : Loading new file for index: 130000
2017-04-19 11:27:13,834 : INFO : PROGRESS: at 10.32% examples, 191860 words/s, in_qsize 0, out_qsize 0
2017-04-19 11

2017-04-19 11:44:40,141 : INFO : Loading new file for index: 420000
2017-04-19 11:44:55,008 : INFO : PROGRESS: at 32.94% examples, 191087 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:45:15,031 : INFO : PROGRESS: at 33.38% examples, 191072 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:45:16,424 : INFO : Loading new file for index: 430000
2017-04-19 11:45:35,095 : INFO : PROGRESS: at 33.81% examples, 191062 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:45:52,573 : INFO : Loading new file for index: 440000
2017-04-19 11:45:55,103 : INFO : PROGRESS: at 34.23% examples, 191081 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:46:15,116 : INFO : PROGRESS: at 34.68% examples, 191141 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:46:28,302 : INFO : Loading new file for index: 450000
2017-04-19 11:46:35,128 : INFO : PROGRESS: at 35.11% examples, 191160 words/s, in_qsize 0, out_qsize 0
2017-04-19 11:46:55,133 : INFO : PROGRESS: at 35.53% examples, 191140 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 12:04:17,613 : INFO : Loading new file for index: 740000
2017-04-19 12:04:36,195 : INFO : PROGRESS: at 57.87% examples, 190125 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:04:50,934 : INFO : Loading new file for index: 750000
2017-04-19 12:04:56,228 : INFO : PROGRESS: at 58.43% examples, 190539 words/s, in_qsize 0, out_qsize 1
2017-04-19 12:05:16,249 : INFO : PROGRESS: at 58.84% examples, 190491 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:05:27,336 : INFO : Loading new file for index: 760000
2017-04-19 12:05:36,256 : INFO : PROGRESS: at 59.24% examples, 190406 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:05:56,280 : INFO : PROGRESS: at 59.63% examples, 190310 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:06:06,901 : INFO : Loading new file for index: 770000
2017-04-19 12:06:16,290 : INFO : PROGRESS: at 60.02% examples, 190191 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:06:36,295 : INFO : PROGRESS: at 60.42% examples, 190114 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 12:24:05,815 : INFO : Loading new file for index: 1060000
2017-04-19 12:24:17,680 : INFO : PROGRESS: at 82.62% examples, 189193 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:24:37,710 : INFO : PROGRESS: at 83.03% examples, 189152 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:24:44,349 : INFO : Loading new file for index: 1070000
2017-04-19 12:24:57,751 : INFO : PROGRESS: at 83.43% examples, 189102 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:25:17,766 : INFO : PROGRESS: at 83.84% examples, 189058 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:25:22,458 : INFO : Loading new file for index: 1080000
2017-04-19 12:25:37,866 : INFO : PROGRESS: at 84.24% examples, 189025 words/s, in_qsize 1, out_qsize 0
2017-04-19 12:25:57,924 : INFO : PROGRESS: at 84.63% examples, 188921 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:26:02,069 : INFO : Loading new file for index: 1090000
2017-04-19 12:26:17,942 : INFO : PROGRESS: at 85.06% examples, 188948 words/s, in_qsize 0, out_qsize 0
201

CPU times: user 2h 58min 9s, sys: 4min 5s, total: 3h 2min 14s
Wall time: 1h 18min 56s


2017-04-19 12:38:09,101 : INFO : not storing attribute syn0norm
2017-04-19 12:38:09,101 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_6/model.wv.syn0.npy
2017-04-19 12:38:09,137 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_6/model.syn1neg.npy
2017-04-19 12:38:09,168 : INFO : not storing attribute cum_table
2017-04-19 12:38:21,824 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_6/model
2017-04-19 12:38:21,825 : INFO : ****************** Epoch 7 --- Working on doc2vec_size_200_w_2_type_dm

2017-04-19 12:54:44,935 : INFO : Loading new file for index: 260000
2017-04-19 12:55:03,890 : INFO : PROGRESS: at 20.55% examples, 183731 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:55:23,897 : INFO : PROGRESS: at 20.94% examples, 183592 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:55:25,304 : INFO : Loading new file for index: 270000
2017-04-19 12:55:43,918 : INFO : PROGRESS: at 21.32% examples, 183356 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:56:03,918 : INFO : PROGRESS: at 21.70% examples, 183079 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:56:06,257 : INFO : Loading new file for index: 280000
2017-04-19 12:56:23,942 : INFO : PROGRESS: at 22.10% examples, 182893 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:56:39,368 : INFO : Loading new file for index: 290000
2017-04-19 12:56:43,944 : INFO : PROGRESS: at 22.65% examples, 184043 words/s, in_qsize 0, out_qsize 0
2017-04-19 12:57:03,953 : INFO : PROGRESS: at 23.10% examples, 184285 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 13:14:45,142 : INFO : PROGRESS: at 44.81% examples, 183646 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:14:58,238 : INFO : Loading new file for index: 580000
2017-04-19 13:15:05,159 : INFO : PROGRESS: at 45.24% examples, 183748 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:15:25,194 : INFO : PROGRESS: at 45.79% examples, 184270 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:15:27,633 : INFO : Loading new file for index: 590000
2017-04-19 13:15:45,215 : INFO : PROGRESS: at 46.19% examples, 184218 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:16:05,230 : INFO : PROGRESS: at 46.57% examples, 184092 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:16:08,269 : INFO : Loading new file for index: 600000
2017-04-19 13:16:25,244 : INFO : PROGRESS: at 46.96% examples, 184050 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:16:44,771 : INFO : Loading new file for index: 610000
2017-04-19 13:16:45,247 : INFO : PROGRESS: at 47.41% examples, 184211 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 13:34:32,592 : INFO : Loading new file for index: 890000
2017-04-19 13:34:46,421 : INFO : PROGRESS: at 69.42% examples, 183480 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:35:06,435 : INFO : PROGRESS: at 69.81% examples, 183425 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:35:13,211 : INFO : Loading new file for index: 900000
2017-04-19 13:35:26,525 : INFO : PROGRESS: at 70.19% examples, 183333 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:35:46,556 : INFO : PROGRESS: at 70.59% examples, 183301 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:35:53,414 : INFO : Loading new file for index: 910000
2017-04-19 13:36:06,570 : INFO : PROGRESS: at 71.00% examples, 183314 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:36:26,606 : INFO : PROGRESS: at 71.45% examples, 183388 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:36:29,168 : INFO : Loading new file for index: 920000
2017-04-19 13:36:46,656 : INFO : PROGRESS: at 71.84% examples, 183341 words/s, in_qsize 0, out_qsize 1
2017-04

2017-04-19 13:54:22,924 : INFO : Loading new file for index: 1210000
2017-04-19 13:54:27,856 : INFO : PROGRESS: at 94.14% examples, 184321 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:54:47,870 : INFO : PROGRESS: at 94.57% examples, 184346 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:54:59,708 : INFO : Loading new file for index: 1220000
2017-04-19 13:55:07,870 : INFO : PROGRESS: at 95.00% examples, 184390 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:55:27,873 : INFO : PROGRESS: at 95.43% examples, 184427 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:55:35,597 : INFO : Loading new file for index: 1230000
2017-04-19 13:55:47,923 : INFO : PROGRESS: at 95.86% examples, 184483 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:56:07,925 : INFO : PROGRESS: at 96.30% examples, 184540 words/s, in_qsize 0, out_qsize 0
2017-04-19 13:56:11,232 : INFO : Loading new file for index: 1240000
2017-04-19 13:56:27,970 : INFO : PROGRESS: at 96.74% examples, 184592 words/s, in_qsize 0, out_qsize 0
201

CPU times: user 2h 50min 4s, sys: 4min 5s, total: 2h 54min 10s
Wall time: 1h 20min 37s


2017-04-19 13:58:59,795 : INFO : not storing attribute syn0norm
2017-04-19 13:58:59,796 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_7/model.wv.syn0.npy
2017-04-19 13:58:59,834 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_7/model.syn1neg.npy
2017-04-19 13:58:59,868 : INFO : not storing attribute cum_table
2017-04-19 13:59:11,962 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_7/model
2017-04-19 13:59:11,963 : INFO : ****************** Epoch 8 --- Working on doc2vec_size_200_w_2_type_dm

2017-04-19 14:15:33,215 : INFO : Loading new file for index: 270000
2017-04-19 14:15:33,928 : INFO : PROGRESS: at 20.98% examples, 191468 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:15:53,952 : INFO : PROGRESS: at 21.41% examples, 191454 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:16:09,956 : INFO : Loading new file for index: 280000
2017-04-19 14:16:13,978 : INFO : PROGRESS: at 21.83% examples, 191379 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:16:34,030 : INFO : PROGRESS: at 22.27% examples, 191349 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:16:45,704 : INFO : Loading new file for index: 290000
2017-04-19 14:16:54,033 : INFO : PROGRESS: at 22.70% examples, 191435 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:17:14,058 : INFO : PROGRESS: at 23.13% examples, 191348 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:17:22,139 : INFO : Loading new file for index: 300000
2017-04-19 14:17:34,063 : INFO : PROGRESS: at 23.55% examples, 191336 words/s, in_qsize 0, out_qsize 0
2017-04

2017-04-19 14:34:55,057 : INFO : PROGRESS: at 45.85% examples, 191428 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:35:15,059 : INFO : PROGRESS: at 46.28% examples, 191423 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:35:30,894 : INFO : Loading new file for index: 600000
2017-04-19 14:35:35,096 : INFO : PROGRESS: at 46.71% examples, 191488 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:35:55,104 : INFO : PROGRESS: at 47.14% examples, 191493 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:36:07,068 : INFO : Loading new file for index: 610000
2017-04-19 14:36:15,114 : INFO : PROGRESS: at 47.56% examples, 191506 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:36:35,116 : INFO : PROGRESS: at 48.00% examples, 191490 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:36:43,234 : INFO : Loading new file for index: 620000
2017-04-19 14:36:55,159 : INFO : PROGRESS: at 48.43% examples, 191528 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:37:15,182 : INFO : PROGRESS: at 48.87% examples, 191593 words

2017-04-19 14:54:36,027 : INFO : PROGRESS: at 71.02% examples, 191133 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:54:56,038 : INFO : PROGRESS: at 71.46% examples, 191142 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:54:57,777 : INFO : Loading new file for index: 920000
2017-04-19 14:55:16,060 : INFO : PROGRESS: at 71.90% examples, 191143 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:55:33,581 : INFO : Loading new file for index: 930000
2017-04-19 14:55:36,066 : INFO : PROGRESS: at 72.33% examples, 191176 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:55:56,067 : INFO : PROGRESS: at 72.77% examples, 191181 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:56:09,538 : INFO : Loading new file for index: 940000
2017-04-19 14:56:16,079 : INFO : PROGRESS: at 73.19% examples, 191173 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:56:36,101 : INFO : PROGRESS: at 73.62% examples, 191191 words/s, in_qsize 0, out_qsize 0
2017-04-19 14:56:45,421 : INFO : Loading new file for index: 950000
2017-04

2017-04-19 15:14:17,202 : INFO : PROGRESS: at 96.31% examples, 191149 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:14:20,249 : INFO : Loading new file for index: 1240000
2017-04-19 15:14:37,213 : INFO : PROGRESS: at 96.75% examples, 191167 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:14:56,152 : INFO : Loading new file for index: 1250000
2017-04-19 15:14:57,233 : INFO : PROGRESS: at 97.17% examples, 191163 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:15:17,264 : INFO : PROGRESS: at 97.61% examples, 191171 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:15:32,082 : INFO : Loading new file for index: 1260000
2017-04-19 15:15:37,278 : INFO : PROGRESS: at 98.04% examples, 191164 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:15:57,288 : INFO : PROGRESS: at 98.49% examples, 191218 words/s, in_qsize 0, out_qsize 0
2017-04-19 15:16:07,674 : INFO : Loading new file for index: 1270000
2017-04-19 15:16:17,297 : INFO : PROGRESS: at 98.92% examples, 191218 words/s, in_qsize 0, out_qsize 0
201

CPU times: user 2h 49min 45s, sys: 4min 5s, total: 2h 53min 50s
Wall time: 1h 17min 55s


2017-04-19 15:17:08,530 : INFO : not storing attribute syn0norm
2017-04-19 15:17:08,531 : INFO : storing np array 'syn0' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_8/model.wv.syn0.npy
2017-04-19 15:17:08,568 : INFO : storing np array 'syn1neg' to /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_8/model.syn1neg.npy
2017-04-19 15:17:08,602 : INFO : not storing attribute cum_table
2017-04-19 15:17:20,674 : INFO : saved /mnt/virtual-machines/data/parameter_search_doc2vec_models_recalc_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_8/model


CPU times: user 1d 4h 31min 30s, sys: 49min 33s, total: 1d 5h 21min 3s
Wall time: 12h 29min 44s


## Inference Only (if needed)

In [37]:
# Worker count assigned to the loaded model's `workers` attribute in the next cell;
# overrides the NUM_CORES = 8 set near the top of the notebook.
NUM_CORES = 32

In [38]:
# Load the Doc2Vec model saved for a single epoch so inference can be run
# without retraining.
epoch = 1
GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)

# Build the model path once and reuse it for both the existence check and the load.
doc2vec_model_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)
if os.path.exists(doc2vec_model_path):
    doc2vec_model = Doc2Vec.load(doc2vec_model_path)
    # Apply the worker count configured for the inference cells.
    doc2vec_model.workers = NUM_CORES
    GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
else:
    # A missing file used to be skipped silently, leaving any previously loaded
    # (or no) model in `doc2vec_model`; make that visible without aborting.
    logging.warning("Doc2Vec model not found at %s; doc2vec_model was not loaded", doc2vec_model_path)

2017-04-18 18:06:29,959 : INFO : loading Doc2Vec object from /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_description/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_description/epoch_1/model
2017-04-18 18:06:38,844 : INFO : loading docvecs recursively from /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_description/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_description/epoch_1/model.docvecs.* with mmap=None
2017-04-18 18:06:38,845 : INFO : loading doctag_syn0 from /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_description/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_description/epoch_1/model.docvecs.doctag_syn0.npy with mmap=None
2017-04-18 18:06:39,463 : INFO : loading wv recursively from /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_description/full/doc2vec_size_200_w_2_

In [39]:
# Echo the currently selected model name so the cell output records which
# configuration/epoch the following inference cells use.
GLOBAL_VARS.MODEL_NAME

'doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_description/epoch_1'

In [None]:
# Build the validation-set representation `Xv` from the loaded model.
# NOTE(review): get_extended_docs_with_inference_data_only is defined elsewhere in
# the notebook; judging by its name and the "Getting vectors with inference" log
# line it presumably infers a vector per validation document — confirm.
Xv = get_extended_docs_with_inference_data_only(doc2vec_model, VALIDATION_DICT, 
                                         validation_preprocessed_files_prefix, level, model_name)

2017-04-18 18:06:48,123 : INFO : ===== Getting vectors with inference
2017-04-18 18:06:48,350 : INFO : Loading new file for index: 0
2017-04-18 18:07:10,806 : INFO : Loading new file for index: 10000
2017-04-18 18:10:51,895 : INFO : Finished: 10000 tags
2017-04-18 18:11:11,459 : INFO : Loading new file for index: 20000
2017-04-18 18:14:42,427 : INFO : Finished: 20000 tags
2017-04-18 18:15:02,688 : INFO : Loading new file for index: 30000
2017-04-18 18:18:25,861 : INFO : Finished: 30000 tags
2017-04-18 18:18:45,916 : INFO : Loading new file for index: 40000
2017-04-18 18:22:10,180 : INFO : Finished: 40000 tags
2017-04-18 18:22:30,523 : INFO : Loading new file for index: 50000
2017-04-18 18:25:47,360 : INFO : Finished: 50000 tags
2017-04-18 18:26:07,038 : INFO : Loading new file for index: 60000
2017-04-18 18:29:19,960 : INFO : Finished: 60000 tags
2017-04-18 18:29:40,442 : INFO : Loading new file for index: 70000
2017-04-18 18:32:57,714 : INFO : Finished: 70000 tags
2017-04-18 18:33:16,

#### Testing inference

In [55]:
# Smoke test: infer a vector for the first validation document only and print it.
inference_docs_iterator = BatchWrapper(validation_preprocessed_files_prefix, batch_size=None, level=level, level_type=model_name)
# Iterate over the iterator created above (the loop previously referenced the
# misspelled name `inference_doczs_iterator`, which is never assigned here).
for doc_id, doc_tokens in inference_docs_iterator:
    rep = doc2vec_model.infer_vector(doc_tokens)
    print (doc_id, rep)
    # Only the first document is needed for this check.
    break

('08521002', array([  1.13558674e+00,  -2.01971769e-01,  -9.30447519e-01,
         9.55632687e-01,   5.11517346e-01,   4.34441900e+00,
        -3.77764761e-01,  -1.11617422e+00,  -2.15896085e-01,
         9.09354746e-01,   5.74674904e-01,  -2.07049704e+00,
        -7.20400810e-01,   4.94136661e-01,  -1.74060893e+00,
        -2.17272949e+00,  -4.39270258e-01,  -1.51936769e+00,
         5.65607429e-01,  -4.58835810e-01,  -1.69598356e-01,
         1.77733886e+00,   3.66123140e-01,   1.38953611e-01,
        -1.04259264e+00,   8.84979665e-01,  -8.56729895e-02,
        -6.04329109e-01,   4.42179322e-01,   1.08561194e+00,
        -2.49654725e-01,   3.02951038e-01,  -3.80307257e-01,
         1.32433748e+00,   7.18038738e-01,   7.99864233e-01,
        -3.60305488e-01,  -3.32749695e-01,   1.86409019e-02,
        -1.16298962e+00,  -2.36521304e-01,   8.52507114e-01,
        -4.25269688e-03,  -2.73190904e+00,  -1.39228487e+00,
         3.14658254e-01,   1.19927609e+00,  -8.86219382e-01,
        -2.

In [53]:
# Display the loaded model's `wv.syn0` array (the numpy array that the save logs
# report writing to model.wv.syn0.npy) as a quick sanity check.
doc2vec_model.wv.syn0

array([[ 0.24962339,  0.03808838, -0.38492572, ...,  0.81019139,
        -0.0872335 ,  0.00503489],
       [-0.24393913, -0.9072656 , -0.08245134, ..., -0.12438237,
        -0.10501056,  0.07241193],
       [ 0.06769085, -0.22004843,  0.05649997, ...,  0.15331532,
        -0.87121236, -0.71148068],
       ..., 
       [ 0.02257917,  0.18380728, -0.19475998, ...,  0.72972393,
        -0.03356596, -0.29145467],
       [-0.20255305, -0.25994578,  0.31640032, ..., -0.02623975,
         0.41660461, -0.45980361],
       [ 1.09419656, -0.97489876, -0.3509953 , ...,  0.82430571,
         0.02756385,  0.7905944 ]], dtype=float32)