In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *

In [2]:
# Reset logging for this kernel: detach every handler currently on the root logger
# (iterating over a copy because the list is mutated while we loop), then install
# a single timestamped INFO-level configuration.
# logging.basicConfig() does nothing when the root logger already has handlers,
# which is why the existing ones are removed first.
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
IS_SAMPLE = False

In [4]:
SVM_SEED = 1234
DOC2VEC_SEED = 1234
WORD2VEC_SEED = 1234

In [5]:
# Placeholder tokens that stemtokenizer() substitutes for numeric, currency and
# chemical-looking tokens.
# NOTE(review): the "inidicator" misspelling is part of the emitted token text;
# presumably it is already present in the preprocessed corpus / saved vocabulary,
# so do not correct the spelling without regenerating those -- confirm.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
# minimum corpus frequency for a word to be kept (passed as min_count to Doc2Vec)
MIN_WORD_COUNT = 100
# minimum token length kept by stemtokenizer(); 0 keeps every non-empty token
MIN_SIZE = 0
# number of Doc2Vec worker threads and size of the inference ThreadPool
NUM_CORES = 22

In [6]:
# GLOBAL_VARS is used as a mutable, notebook-wide namespace: later cells assign
# attributes directly on this class object (e.g. GLOBAL_VARS.MODEL_NAME = ...)
# instead of instantiating the namedtuple.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 
                                         'SVM_MODEL_NAME', 'NN_MODEL_NAME'])

In [7]:
VOCAB_MODEL = "vocab_model"
MODEL_PREFIX = "model"
VALIDATION_MATRIX = "validation_matrix.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"

In [8]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"

root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

doc2vec_model_save_location = os.path.join(root_location, "parameter_search_doc2vec_models_new", "full")
if not os.path.exists(doc2vec_model_save_location):
    os.makedirs(doc2vec_model_save_location)
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL)):
    os.makedirs(os.path.join(doc2vec_model_save_location, VOCAB_MODEL))

training_file = root_location + "docs_output.json"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

preprocessed_location = root_location + "preprocessed_data/"

training_preprocessed_files_prefix = preprocessed_location + "training_docs_merged_data_preprocessed-"
training_preprocessed_docids_files_prefix = preprocessed_location + "training_docs_merged_docids_preprocessed-"
validation_preprocessed_files_prefix = preprocessed_location + "validation_docs_merged_data_preprocessed-"
validation_preprocessed_docids_files_prefix = preprocessed_location + "validation_docs_merged_docids_preprocessed-"

# Analogy question file handed to doc2vec_model.accuracy() in the training loop.
# NOTE(review): the chained assignment also binds the same path to `result`; no use of
# `result` is visible in this notebook, so it looks like a leftover -- confirm before removing.
word2vec_questions_file = result = root_location + 'tensorflow/word2vec/questions-words.txt'

In [9]:
%%time
# Load the exported lookup structures. Each pickle is opened in binary mode inside a
# `with` block so its file handle is closed as soon as the object has been read.
# NOTE: unpickling can execute code embedded in the file; only load trusted exports.
with open(doc_classifications_map_file, 'rb') as pickle_file:
    doc_classification_map = pickle.load(pickle_file)
with open(sections_file, 'rb') as pickle_file:
    sections = pickle.load(pickle_file)
with open(classes_file, 'rb') as pickle_file:
    classes = pickle.load(pickle_file)
with open(subclasses_file, 'rb') as pickle_file:
    subclasses = pickle.load(pickle_file)
with open(training_docs_list_file, 'rb') as pickle_file:
    training_docs_list = pickle.load(pickle_file)
with open(validation_docs_list_file, 'rb') as pickle_file:
    validation_docs_list = pickle.load(pickle_file)
#test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 30.3 s, sys: 1.45 s, total: 31.8 s
Wall time: 32.8 s


In [10]:
len(training_docs_list)

1286325

In [11]:
len(validation_docs_list)

321473

In [12]:
def stemtokenizer(text):
    """
    Split `text` on whitespace and return the list of normalised tokens.

    Every token is lower-cased and has leading/trailing punctuation removed; tokens
    that end up empty are dropped. Tokens recognised by is_number / is_currency /
    is_chemical are replaced by the matching *_INDICATOR placeholder. Despite the
    name, no stemming is applied here.
    """
    # NOTE(review): "$" belongs to string.punctuation, so a leading "$" has already been
    # stripped by the time is_currency() is asked -- confirm whether the currency
    # placeholder is ever expected to be produced.
    cleaned = []
    for raw_token in RegexpTokenizer(r'\s+', gaps=True).tokenize(text):
        word = raw_token.lower().strip(string.punctuation)
        if not word:
            continue
        if is_number(word):
            word = NUMBER_INDICATOR
        elif is_currency(word):
            word = CURRENCY_INDICATOR
        elif is_chemical(word):
            word = CHEMICAL_INDICATOR
        # ordinary words are already punctuation-stripped at this point
        if word and len(word) >= MIN_SIZE:
            # uni-grams only
            cleaned.append(word)
    return cleaned

def is_number(str):
    """
    Return True if the given string parses as a finite number (float or int).

    Commas are ignored, so thousands-separated values such as "1,000" count as numbers.
    float() also accepts the words "nan", "inf" and "infinity"; those (and values that
    overflow to infinity) are rejected so ordinary tokens are not mistaken for numbers.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept so existing
    # callers are unaffected.
    try:
        value = float(str.replace(",", ""))
    except ValueError:
        return False
    return not (math.isnan(value) or math.isinf(value))

def is_currency(str):
    """Return True if the given string starts with a dollar sign; an empty string is not currency."""
    # startswith() is used instead of indexing so "" yields False rather than IndexError
    return str.startswith("$")

def is_chemical(str):
    """Heuristic chemical-name check: True when the string contains more than three hyphens."""
    # splitting on "-" yields one more piece than there are hyphens
    hyphen_total = len(str.split("-")) - 1
    return hyphen_total >= 4

In [13]:
def ensure_hdfs_location_exists(location):
    """
    Create `location` (and any missing parents) on HDFS with `hdfs dfs -mkdir -p`.

    The command is run without a shell and `location` is passed as one argument, so
    paths containing spaces or shell metacharacters are taken literally. The call
    stays best-effort: failures are logged as warnings instead of being raised.
    """
    import subprocess  # local import: only this helper needs it

    try:
        exit_code = subprocess.call(["hdfs", "dfs", "-mkdir", "-p", location])
    except OSError as err:
        # e.g. the `hdfs` executable is not available on PATH
        logging.warning("Could not run hdfs to create %s: %s", location, err)
        return
    if exit_code != 0:
        logging.warning("hdfs dfs -mkdir -p %s exited with status %s", location, exit_code)

def ensure_disk_location_exists(location):
    """
    Make sure the directory `location` exists on local disk, creating parents as needed.

    Raises OSError if the directory cannot be created, or if `location` already exists
    but is not a directory.
    """
    import errno  # local import: only this helper needs it

    try:
        os.makedirs(location)
    except OSError as err:
        # Creating first and inspecting the error avoids the window between an
        # exists() check and makedirs() in which another thread or process could
        # create the directory. Only "already exists as a directory" is acceptable.
        if err.errno != errno.EEXIST or not os.path.isdir(location):
            raise

In [14]:
def get_validation_docs_with_inference(doc2vec_model, doc_classification_map):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    The matrix is cached as VALIDATION_MATRIX inside the directory of the current
    GLOBAL_VARS.MODEL_NAME: if that file exists it is loaded, otherwise vectors are
    inferred for every document yielded by ValidationDocumentGenerator over the
    module-level `training_file` / `validation_docs_list`, stacked in
    `validation_docs_list` order, and written to the cache.

    doc2vec_model: model whose infer_vector() is called for each document
    doc_classification_map: not used by this function; kept so existing calls keep working

    Returns the validation vectors matrix (one row per entry of validation_docs_list).
    """
    matrix_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)

    if os.path.exists(matrix_path):
        info("===== Loading validation vectors")
        with open(matrix_path, 'rb') as matrix_file:
            return pickle.load(matrix_file)

    info("===== Getting validation vectors with inference")

    # do inference and store results in dict
    validation_documents_reps = {}
    docs = ValidationDocumentGenerator(training_file, validation_docs_list)
    for i, (doc_id, doc_contents_array) in enumerate(docs, 1):
        if i % 1000 == 0: info("Finished: {}".format(str(i)))
        validation_documents_reps[doc_id] = doc2vec_model.infer_vector(doc_contents_array)

    # create matrix for the validation vectors, rows ordered like validation_docs_list
    validation_vectors_matrix = np.array(
        [validation_documents_reps[validation_doc_id] for validation_doc_id in validation_docs_list])
    with open(matrix_path, 'wb') as matrix_file:
        pickle.dump(validation_vectors_matrix, matrix_file)

    return validation_vectors_matrix

In [15]:
def get_validation_docs_with_inference_new(doc2vec_model, doc_classification_map, classifications, 
                                           val_docs_list, val_preprocessed_files_prefix, val_preprocessed_docids_files_prefix):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    The vectors matrix is cached as VALIDATION_MATRIX inside the directory of the current
    GLOBAL_VARS.MODEL_NAME. When the cache is missing, vectors are inferred with a pool
    of NUM_CORES threads over the preprocessed batch files and the cache is written.

    doc2vec_model: model whose infer_vector() is called for each document
    doc_classification_map: doc id -> iterable of classification labels
    classifications: labels to keep; also defines the column order of the label matrix
    val_docs_list: doc ids, defining the row order of both returned arrays
    val_preprocessed_files_prefix / val_preprocessed_docids_files_prefix: prefixes handed
        to DocumentBatchGenerator to locate the preprocessed documents and their ids

    Returns (validation_vectors_matrix, validation_labels).
    """

    def infer_one_doc(doc_tuple):
        #doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
        doc_id, doc_tokens = doc_tuple
        return (doc_id, doc2vec_model.infer_vector(doc_tokens))

    matrix_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)

    if os.path.exists(matrix_path):
        info("===== Loading validation vectors")
        # NOTE(review): the cached rows are assumed to follow val_docs_list order -- confirm
        with open(matrix_path, 'rb') as matrix_file:
            validation_vectors_matrix = pickle.load(matrix_file)
    else:
        info("===== Getting validation vectors with inference")

        # Multi-threaded inference over the files named by the *arguments*
        # (previously the module-level prefixes were used and the arguments ignored)
        validation_docs_iterator = DocumentBatchGenerator(val_preprocessed_files_prefix,
                                                          val_preprocessed_docids_files_prefix, batch_size=None)
        generator_func = iter(validation_docs_iterator)
        validation_documents_reps = {}
        mini_batch_size = 1000
        pool = ThreadPool(NUM_CORES)
        try:
            # map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
            while True:
                threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(generator_func, mini_batch_size))
                info("Finished: {}".format(str(validation_docs_iterator.curr_index)))
                if not threaded_reps_partial:
                    break
                validation_documents_reps.update(threaded_reps_partial)
        finally:
            # release the worker threads; this function runs once per epoch
            pool.close()
            pool.join()

        # create matrix for the validation vectors, rows ordered like val_docs_list
        validation_vectors_matrix = np.array(
            [validation_documents_reps[validation_doc_id] for validation_doc_id in val_docs_list])
        with open(matrix_path, 'wb') as matrix_file:
            pickle.dump(validation_vectors_matrix, matrix_file)

    # the label matrix is never cached, so it is rebuilt the same way on both paths
    one_hot_encoder = OneHotEncoder(classifications)
    validation_labels = []
    for validation_doc_id in val_docs_list:
        val_labels = [classf for classf in doc_classification_map[validation_doc_id] if classf in classifications]
        validation_labels.append(one_hot_encoder.get_label_vector(val_labels))
    validation_labels = np.array(validation_labels)

    return validation_vectors_matrix, validation_labels

In [16]:
def get_validation_doc2vec_spark_vectors(validation_vectors_matrix, classification, doc_classification_map):
    """
    Pair each validation vector with its document's classifications and parallelize the result.

    Row i of `validation_vectors_matrix` is matched with the i-th id of the module-level
    `validation_docs_list`; every (classification, row, doc classifications) triple is
    passed through get_training_vector() and the resulting list is handed to sc.parallelize().
    """
    # NOTE(review): neither `get_training_vector` nor `sc` is defined in the visible part
    # of this notebook (presumably a SparkContext and a Spark-era helper) -- confirm
    # before calling.
    labelled_rows = [
        get_training_vector(classification, validation_vectors_matrix[row],
                            doc_classification_map[doc_id])
        for row, doc_id in enumerate(validation_docs_list)
    ]
    distributed_rows = sc.parallelize(labelled_rows)
    info("Finished getting validation vectors")
    return distributed_rows

In [17]:
class OneHotEncoder():
    """
    Turns sets of classification labels into fixed-length 0/1 vectors.

    The position of a label inside `classifications` is its index in every vector.
    """
    
    def __init__(self, classifications):
        """
        classifications: sized, ordered collection of all known labels
        """
        self.classifications = classifications
        # label -> vector position; if a label is repeated its last position wins
        self.one_hot_indices = {clssf: i for i, clssf in enumerate(classifications)}
    
    def get_label_vector(self, labels):
        """
        labels: iterable of labels assigned to the instance

        Returns a list of ints of length len(classifications) holding 1 at the position
        of every given label and 0 elsewhere. Raises KeyError for an unknown label.
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            output_vector[self.one_hot_indices[label]] = 1
            
        return output_vector

def get_training_data(doc2vec_model, classifications):
    """
    Collect the trained document vectors and label vectors of all training documents.

    Iterates the module-level `training_docs_list`; for each id the vector stored in
    `doc2vec_model.docvecs` is copied into a plain list and the document's labels
    (restricted to `classifications`) are encoded with OneHotEncoder.

    Returns (training_data, training_labels): a list of per-document lists and a numpy
    array of 0/1 label rows, both in `training_docs_list` order.
    """
    encoder = OneHotEncoder(classifications)
    training_data = []
    label_rows = []
    for doc_id in training_docs_list:
        # copy the (possibly memmapped) vector into a normal Python list
        training_data.append(list(doc2vec_model.docvecs[doc_id][:]))
        wanted = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
        label_rows.append(encoder.get_label_vector(wanted))
    return training_data, np.array(label_rows)

In [18]:
class TrainingDocumentGenerator(object):
    """
    Streams the training documents of a raw corpus file as LabeledSentence objects.

    Each non-blank line of `filename` must evaluate to a `(doc_id, text)` pair; only
    documents whose id is in `training_docs_list` are yielded, tokenised with
    stemtokenizer() and tagged with their doc id.
    """
    def __init__(self, filename, training_docs_list):
        self.filename = filename
        self.training_docs_list = training_docs_list
    def __iter__(self):
        # hash lookup per line instead of scanning the whole id list for every document
        wanted_doc_ids = frozenset(self.training_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # SECURITY NOTE: eval() executes whatever the line contains; the corpus
                # file must be trusted (ast.literal_eval is the safe option if every
                # line is a plain tuple literal).
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])
                    
class DocumentBatchGenerator(object):
    """
    Streams preprocessed documents from numbered batch files.

    The file `filename_prefix + str(k)` holds one document per line as space-separated
    tokens and `filename_docids_prefix + str(k)` holds the pickled list of the matching
    doc ids, where k is the running index of the first document in that file. Files are
    loaded one at a time until the next expected file cannot be opened.

    With a batch_size, every document is yielded as one or more LabeledSentence objects
    of at most batch_size tokens; with batch_size=None, `(doc_id, tokens)` pairs are yielded.

    An instance is single-pass: `curr_index` is never reset, so create a new generator
    for every pass over the data.
    """
    def __init__(self, filename_prefix, filename_docids_prefix, batch_size=10000 ):
        """
        batch_size cant be > 10,000 due to a limitation in doc2vec training, 
        None means no batching (only use for inference)
        """
        # test for None first: comparing None with an int only works by accident on Python 2
        assert batch_size is None or batch_size <= 10000
        self.filename_prefix = filename_prefix
        self.filename_docids_prefix = filename_docids_prefix
        self.curr_lines = []
        self.curr_docids = []
        self.batch_size = batch_size
        self.curr_index = 0
        self.batch_end = -1
    def load_new_batch_in_memory(self):
        """
        Load the batch file that starts at `curr_index` together with its doc ids.

        Raises StopIteration when the files for `curr_index` cannot be read.
        """
        self.curr_lines, self.curr_docids = [], []
        info("Loading new batch for index: {}".format(self.curr_index) )
        try:
            with open(self.filename_prefix + str(self.curr_index)) as preproc_file:
                for line in preproc_file:
                    # drop the line terminator so it does not stay glued to the last token
                    self.curr_lines.append(line.rstrip("\r\n").split(" "))
            with open(self.filename_docids_prefix + str(self.curr_index), "rb") as docids_file:
                self.curr_docids = pickle.load(docids_file)
            self.batch_end = self.curr_index + len(self.curr_lines) -1 
            info("Finished loading new batch")
        except IOError:
            info("No more batches to load, exiting at index: {}".format(self.curr_index))
            raise StopIteration()
    def __iter__(self):
        while True:
            if self.curr_index > self.batch_end:
                try:
                    self.load_new_batch_in_memory()
                except StopIteration:
                    # end the generator explicitly; a StopIteration escaping a generator
                    # body is an error on newer Python versions (PEP 479)
                    return
                if not self.curr_lines:
                    # an empty batch file would otherwise be reloaded forever
                    return
            for (doc_id, tokens) in zip(self.curr_docids, self.curr_lines):
                if self.batch_size is not None:
                    curr_batch_iter = 0
                    # divide the document to batches according to the batch size
                    while curr_batch_iter < len(tokens):
                        yield LabeledSentence(words=tokens[curr_batch_iter: curr_batch_iter + self.batch_size], tags=[doc_id])
                        curr_batch_iter += self.batch_size
                else:
                    yield doc_id, tokens
                self.curr_index += 1

class Word2VecTrainingDocumentGenerator(object):
    """
    Streams the training documents of a raw corpus file as plain token lists.

    Each non-blank line of `filename` must evaluate to a `(doc_id, text)` pair; only
    documents whose id is in `training_docs_list` are yielded, tokenised with stemtokenizer().
    """
    def __init__(self, filename, training_docs_list):
        self.filename = filename
        self.training_docs_list = training_docs_list
    def __iter__(self):
        # hash lookup per line instead of scanning the whole id list for every document
        wanted_doc_ids = frozenset(self.training_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # SECURITY NOTE: eval() executes whatever the line contains; the corpus
                # file must be trusted.
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield stemtokenizer(text)
                
class ValidationDocumentGenerator(object):
    """
    Streams the validation documents of a raw corpus file as `(doc_id, tokens)` pairs.

    Each non-blank line of `filename` must evaluate to a `(doc_id, text)` pair; only
    documents whose id is in `validation_docs_list` are yielded, tokenised with stemtokenizer().
    """
    def __init__(self, filename, validation_docs_list):
        self.filename = filename
        self.validation_docs_list = validation_docs_list
    def __iter__(self):
        # hash lookup per line instead of scanning the whole id list for every document
        wanted_doc_ids = frozenset(self.validation_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # SECURITY NOTE: eval() executes whatever the line contains; the corpus
                # file must be trusted.
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield doc_id, stemtokenizer(text)
                    
class StochasticDocumentGenerator(object):
    """
    Randomly shuffle rows while reading them

    `line_positions` maps a line key to the byte offset of that line in `filename`.
    Every pass visits all keys once in a fresh random order, seeks to each offset and
    yields a LabeledSentence for every document whose id is in `training_docs_list`.
    """
    def __init__(self, filename, training_docs_list, line_positions):
        self.filename = filename
        self.training_docs_list = training_docs_list
        self.line_positions = line_positions
        self.lines = set(line_positions.keys())
    def __iter__(self):
        # hash lookup per line instead of scanning the whole id list for every document
        wanted_doc_ids = frozenset(self.training_docs_list)
        # Shuffle once per pass instead of sampling from the set for every line (which
        # copies the whole set each time); self.lines is left intact so the generator
        # can be iterated again.
        visit_order = list(self.lines)
        random.shuffle(visit_order)
        with open(self.filename) as file_obj:
            for random_line in visit_order:
                file_obj.seek(self.line_positions[random_line])
                line = file_obj.readline()
                if not line.strip(): continue
                # SECURITY NOTE: eval() executes whatever the line contains; the corpus
                # file must be trusted.
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])

### Doc2vec and SVM Parameters

In [19]:
# Settings for the Doc2Vec model and its training loop.
DOC2VEC_SIZE = 200
DOC2VEC_WINDOW = 8
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-3
# 1 is labelled 'dm' in the model name, any other value 'pv-dbow'
DOC2VEC_TYPE = 1
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 1
DOC2VEC_TRAIN_WORDS = 0
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
# number of passes made by the outer training loop
DOC2VEC_MAX_EPOCHS = 20
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 10000 # report vocab progress every x documents

In [20]:
# Settings for the linear SVM (SGDClassifier with hinge loss) trained on the document vectors.
# passed as n_iter
SVM_ITERATIONS = 10
# NOTE(review): not referenced anywhere in the visible notebook -- confirm it is still needed
SVM_CONVERGENCE = 0.001
# passed as alpha (regularisation strength)
SVM_REG = 0.001
# passed as class_weight
SVM_CLASS_WEIGHTS = 'balanced'
# directory name under which the classifier and its metrics are stored for each epoch
GLOBAL_VARS.SVM_MODEL_NAME = 'svm_iter_{}_reg_{}_classweights_{}'.format(SVM_ITERATIONS, SVM_REG, str(SVM_CLASS_WEIGHTS))

In [21]:
def get_svm_model_path(method, classification, reg=SVM_REG, iterations=SVM_ITERATIONS):
    """
    Build the storage path of the SVM model for one method/classification pair.

    The path has the form
    <save_parent_location>/models/<method>/iter_<iterations>_reg_<reg>/<classification>_model.svm
    and is passed to ensure_hdfs_location_exists() before being returned.
    """
    # NOTE(review): `save_parent_location` is not defined in the visible part of this
    # notebook, so calling this function as-is would raise NameError -- confirm.
    params_dir = "_".join(["iter", str(iterations), "reg", str(reg)])
    model_file = classification + "_model.svm"
    location = os.path.join(save_parent_location, "models", method, params_dir, model_file)
    ensure_hdfs_location_exists(location)
    return location

## Create the Doc2vec model

In [22]:
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(DOC2VEC_SIZE, 
                                                                DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE))
GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
placeholder_model_name

'doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_{}'

In [23]:
# Build the (untrained) Doc2Vec model from the hyper-parameter constants.
doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE , window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT, 
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm: dm=1 => PV-DM, any other value (e.g. dm=0) => PV-DBOW; PV-DM dictates CBOW for words
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # dm_mean=1 => average the context vectors, dm_mean=0 => sum them (non-concat PV-DM only).
                # DOC2VEC_MEAN is part of the model name, so it has to reach the model as well.
                # NOTE: models saved before this argument was passed were trained with the
                # library default even though their directory name says mean_1.
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that for now
                dbow_words=DOC2VEC_TRAIN_WORDS,
                iter=DOC2VEC_EPOCHS)

GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

In [24]:
%%time
# Build the vocabulary once and cache it: if no vocabulary-only model has been saved
# under VOCAB_MODEL yet, scan the preprocessed training batches and save the result;
# otherwise load the saved model and copy its vocabulary into the fresh model
# via reset_from().
training_docs_iterator = DocumentBatchGenerator(training_preprocessed_files_prefix, 
                                                        training_preprocessed_docids_files_prefix, batch_size=10000)
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX)):
    doc2vec_model.build_vocab(sentences=training_docs_iterator, progress_per=REPORT_VOCAB_PROGRESS)
    doc2vec_model.save(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
else:
    doc2vec_model_vocab_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
    doc2vec_model.reset_from(doc2vec_model_vocab_model)

2017-01-14 21:00:10,101 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/vocab_model/model
2017-01-14 21:00:19,425 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/vocab_model/model.docvecs.* with mmap=None
2017-01-14 21:00:19,427 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/vocab_model/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 21:00:26,369 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_models_new/full/vocab_model/model.syn1neg.npy with mmap=None
2017-01-14 21:01:03,709 : INFO : loading syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/vocab_model/model.syn0.npy with mmap=None
2017-01-14 21:01:05,710 : INFO : setting ignored attribute syn0norm to None
2017-01-14 21:01:05,711 : INFO : setting ignored attribute cum_table to None
2017-01-14 21:01:07,092 : INFO : resetting layer weights


CPU times: user 1min 3s, sys: 4.18 s, total: 1min 7s
Wall time: 1min 50s


In [None]:
# vocab_counts = {k:doc2vec_model.vocab[k].count for k in doc2vec_model.vocab.keys()}
# dd = sorted(vocab_counts, key=vocab_counts.get)

## Actual Training, Validation and Metrics Loop

In [25]:
# Pin the lower bound of the learning rate before the manual epoch loop; the loop
# itself lowers alpha by 0.001 after each epoch and sets min_alpha to the same value.
# NOTE(review): 0.025 presumably equals the model's initial alpha so that the rate
# does not decay inside the first epoch -- confirm against the installed gensim default.
doc2vec_model.min_alpha = 0.025
# per-epoch results collected by the training loop
epoch_validation_metrics = []
epoch_training_metrics = []
epoch_word2vec_metrics = []
# label set used for classification in this run, and the tag used in metric file names
classifications = sections
classifications_type = 'sections'

In [26]:
VALIDATION_METRICS_FILENAME= '{}_validation_metrics.pkl'.format(classifications_type)
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'.format(classifications_type)
METRICS_FIG_PNG_FILENAME = '{}_validation_metrics.png'.format(classifications_type)
METRICS_FIG_PDF_FILENAME = '{}_validation_metrics.pdf'.format(classifications_type)
WORD2VEC_METRICS_FILENAME = 'word2vec_metrics.pkl'

# for epoch in range(DOC2VEC_MAX_EPOCHS):
#     GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
#     ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
#                                              GLOBAL_VARS.SVM_MODEL_NAME))
#     pickle.dump(metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, GLOBAL_VARS.SVM_MODEL_NAME, METRICS), 'w'))
# fig_save_location = placeholder_model_name.format('run')
# plt.savefig(os.path.join(fig_save_location))

In [None]:
%%time
%matplotlib notebook
graph = MetricsGraph()
graph.init_graph(len(classifications) +2)
# when resuming, resume from an epoch with a previously created doc2vec model to get the learning rate right
start_from = 1
for epoch in range(start_from, DOC2VEC_MAX_EPOCHS+1):
    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    info("****************** Epoch {} --- Working on {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))
    
    # if we have the model, just load it, otherwise train the previous model
    if os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)):
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
    else:
        # train the doc2vec model
        training_docs_iterator = DocumentBatchGenerator(training_preprocessed_files_prefix, 
                                                        training_preprocessed_docids_files_prefix, batch_size=10000)
        doc2vec_model.train(sentences=training_docs_iterator, report_delay=REPORT_DELAY)
        doc2vec_model.alpha -= 0.001  # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
        doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
        
        # get the word2vec analogy accuracy score
        word2vec_result = doc2vec_model.accuracy(word2vec_questions_file, restrict_vocab=None)
        epoch_word2vec_metrics.append(word2vec_result)
        pickle.dump(word2vec_result, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME,
                                                       WORD2VEC_METRICS_FILENAME), 'w'))

        
    info('Getting training Data')
    X, y = get_training_data(doc2vec_model, classifications)
    
    
    ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                             GLOBAL_VARS.SVM_MODEL_NAME))
    
    # try warm start and evaluate after every iter
    
    if not os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, CLASSIFIER)):
        info('Training Classifier')
        clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                             #alpha is the 1/C parameter
                                                             alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                             #n_jobs=-1 means use all cpus
                                                             shuffle=True, verbose=0, n_jobs=1,
                                                             #eta0 is the learning rate when we use constant configuration
                                                             random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                             class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)


        # Training of a classifier
        clf.fit(X,y)
        pickle.dump(clf, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                                              GLOBAL_VARS.SVM_MODEL_NAME, CLASSIFIER), 'w'))
    else:
        clf = pickle.load(open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, CLASSIFIER), 'r'))
    
    # Training Metrics
    info('Evaluating on Training Data')
    yp = clf.predict(X)
    print yp
    training_metrics = get_metrics(y, yp, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    epoch_training_metrics.append(training_metrics)
    
    
    # Validation Metrics
    info('Getting Validation Embeddings')
    Xv, yv = get_validation_docs_with_inference_new(doc2vec_model, doc_classification_map, classifications, 
                                                    validation_docs_list, validation_preprocessed_files_prefix,
                                                    validation_preprocessed_docids_files_prefix)
    info('Evaluating on Validation Data')
    yvp = clf.predict(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    graph.add_metrics_to_graph(validation_metrics, epoch)
    
    epoch_validation_metrics.append(validation_metrics)
    
    
    # Saving the metrics
    pickle.dump(training_metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, TRAINING_METRICS_FILENAME), 'w'))
    pickle.dump(validation_metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, VALIDATION_METRICS_FILENAME), 'w'))


<IPython.core.display.Javascript object>

2017-01-14 21:02:37,874 : INFO : ****************** Epoch 1 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_1 *******************
2017-01-14 21:02:37,877 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_1/model
2017-01-14 21:02:50,590 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_1/model.docvecs.* with mmap=None
2017-01-14 21:02:50,592 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_1/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 21:03:07,054 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 1 0 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 21:20:20,335 : INFO : Getting Validation Embeddings
2017-01-14 21:20:20,339 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.316, Avg Labels: 1.150, 
		 Top 1: 0.557, Top 3: 0.865, Top 5: 0.946, 
		 F1 Micro: 0.580, F1 Macro: 0.489, Total Pos: 2,684,026


2017-01-14 21:21:08,420 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 0 1]
 [1 0 0 ..., 1 1 0]
 [0 0 1 ..., 0 1 0]
 ..., 
 [1 1 0 ..., 1 0 0]
 [1 0 0 ..., 0 0 0]
 [1 1 1 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.622, Avg Labels: 1.150, 
		 Top 1: 0.486, Top 3: 0.815, Top 5: 0.932, 
		 F1 Micro: 0.510, F1 Macro: 0.429, Total Pos: 839,445


in singular transformations; automatically expanding.
left=1, right=1
  'left=%s, right=%s') % (left, right))
2017-01-14 21:21:42,256 : INFO : ****************** Epoch 2 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_2 *******************
2017-01-14 21:21:42,258 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_2/model
2017-01-14 21:22:00,507 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_2/model.docvecs.* with mmap=None
2017-01-14 21:22:00,508 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_2/model.docvecs.doctag_syn0.npy wit

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 21:39:02,729 : INFO : Getting Validation Embeddings
2017-01-14 21:39:02,732 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.114, Avg Labels: 1.150, 
		 Top 1: 0.584, Top 3: 0.889, Top 5: 0.956, 
		 F1 Micro: 0.608, F1 Macro: 0.512, Total Pos: 2,604,615


2017-01-14 21:39:59,603 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 1 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 1 0 0]]
** Validation Metrics: Cov Err: 3.262, Avg Labels: 1.150, 
		 Top 1: 0.535, Top 3: 0.858, Top 5: 0.951, 
		 F1 Micro: 0.558, F1 Macro: 0.477, Total Pos: 776,220


2017-01-14 21:40:30,448 : INFO : ****************** Epoch 3 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_3 *******************
2017-01-14 21:40:30,450 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_3/model
2017-01-14 21:40:50,599 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_3/model.docvecs.* with mmap=None
2017-01-14 21:40:50,600 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_3/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 21:41:22,651 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 21:58:13,596 : INFO : Getting Validation Embeddings
2017-01-14 21:58:13,600 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.066, Avg Labels: 1.150, 
		 Top 1: 0.591, Top 3: 0.894, Top 5: 0.958, 
		 F1 Micro: 0.615, F1 Macro: 0.518, Total Pos: 2,583,134


2017-01-14 21:58:50,956 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 1 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 1 0 0]]
** Validation Metrics: Cov Err: 3.182, Avg Labels: 1.150, 
		 Top 1: 0.540, Top 3: 0.866, Top 5: 0.952, 
		 F1 Micro: 0.571, F1 Macro: 0.489, Total Pos: 754,750


2017-01-14 21:59:21,209 : INFO : ****************** Epoch 4 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_4 *******************
2017-01-14 21:59:21,212 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_4/model
2017-01-14 21:59:37,861 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_4/model.docvecs.* with mmap=None
2017-01-14 21:59:37,862 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_4/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 21:59:47,532 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

** Training Metrics: Cov Err: 3.049, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.896, Top 5: 0.959, 
		 F1 Micro: 0.617, F1 Macro: 0.520, Total Pos: 2,578,399


2017-01-14 22:17:02,495 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 1 0 0]]
** Validation Metrics: Cov Err: 3.066, Avg Labels: 1.150, 
		 Top 1: 0.544, Top 3: 0.880, Top 5: 0.949, 
		 F1 Micro: 0.592, F1 Macro: 0.504, Total Pos: 716,595


2017-01-14 22:17:33,501 : INFO : ****************** Epoch 5 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_5 *******************
2017-01-14 22:17:33,503 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_5/model
2017-01-14 22:17:40,682 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_5/model.docvecs.* with mmap=None
2017-01-14 22:17:40,683 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_5/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 22:17:56,104 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

** Training Metrics: Cov Err: 3.045, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.896, Top 5: 0.959, 
		 F1 Micro: 0.617, F1 Macro: 0.520, Total Pos: 2,577,150


2017-01-14 22:35:09,888 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 1 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 1 0 0]]
** Validation Metrics: Cov Err: 3.049, Avg Labels: 1.150, 
		 Top 1: 0.553, Top 3: 0.880, Top 5: 0.948, 
		 F1 Micro: 0.597, F1 Macro: 0.507, Total Pos: 706,128


2017-01-14 22:35:39,783 : INFO : ****************** Epoch 6 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6 *******************
2017-01-14 22:35:39,785 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model
2017-01-14 22:35:56,622 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.docvecs.* with mmap=None
2017-01-14 22:35:56,623 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 22:36:15,582 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 22:52:38,082 : INFO : Getting Validation Embeddings
2017-01-14 22:52:38,085 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.034, Avg Labels: 1.150, 
		 Top 1: 0.595, Top 3: 0.898, Top 5: 0.960, 
		 F1 Micro: 0.619, F1 Macro: 0.522, Total Pos: 2,569,917


2017-01-14 22:53:22,008 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.016, Avg Labels: 1.150, 
		 Top 1: 0.575, Top 3: 0.889, Top 5: 0.950, 
		 F1 Micro: 0.607, F1 Macro: 0.513, Total Pos: 685,229


2017-01-14 22:53:51,873 : INFO : ****************** Epoch 7 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7 *******************
2017-01-14 22:53:51,875 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model
2017-01-14 22:54:09,323 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model.docvecs.* with mmap=None
2017-01-14 22:54:09,324 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 22:54:25,883 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 23:11:38,199 : INFO : Getting Validation Embeddings
2017-01-14 23:11:38,202 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.042, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.617, F1 Macro: 0.520, Total Pos: 2,583,032


2017-01-14 23:12:22,982 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.005, Avg Labels: 1.150, 
		 Top 1: 0.562, Top 3: 0.888, Top 5: 0.952, 
		 F1 Micro: 0.605, F1 Macro: 0.512, Total Pos: 696,311


2017-01-14 23:12:55,562 : INFO : ****************** Epoch 8 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8 *******************
2017-01-14 23:12:55,564 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8/model
2017-01-14 23:13:02,987 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8/model.docvecs.* with mmap=None
2017-01-14 23:13:02,989 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 23:13:21,151 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 23:32:22,323 : INFO : Getting Validation Embeddings
2017-01-14 23:32:22,329 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.042, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.618, F1 Macro: 0.520, Total Pos: 2,579,273


2017-01-14 23:33:09,167 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 0]
 [1 0 0 ..., 1 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [1 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [0 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 2.980, Avg Labels: 1.150, 
		 Top 1: 0.568, Top 3: 0.890, Top 5: 0.947, 
		 F1 Micro: 0.612, F1 Macro: 0.517, Total Pos: 678,429


2017-01-14 23:33:39,677 : INFO : ****************** Epoch 9 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_9 *******************
2017-01-14 23:33:39,717 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_9/model
2017-01-14 23:33:57,140 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_9/model.docvecs.* with mmap=None
2017-01-14 23:33:57,142 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_9/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 23:34:14,016 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-14 23:50:20,382 : INFO : Getting Validation Embeddings
2017-01-14 23:50:20,385 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.040, Avg Labels: 1.150, 
		 Top 1: 0.594, Top 3: 0.897, Top 5: 0.959, 
		 F1 Micro: 0.618, F1 Macro: 0.521, Total Pos: 2,573,212


2017-01-14 23:51:04,714 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 1 0 1]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.016, Avg Labels: 1.150, 
		 Top 1: 0.564, Top 3: 0.885, Top 5: 0.948, 
		 F1 Micro: 0.604, F1 Macro: 0.511, Total Pos: 693,973


2017-01-14 23:51:34,151 : INFO : ****************** Epoch 10 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_10 *******************
2017-01-14 23:51:34,153 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_10/model
2017-01-14 23:51:51,462 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_10/model.docvecs.* with mmap=None
2017-01-14 23:51:51,463 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_10/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-14 23:52:06,503 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-15 00:09:33,200 : INFO : Getting Validation Embeddings
2017-01-15 00:09:33,203 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.042, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.618, F1 Macro: 0.520, Total Pos: 2,578,565


2017-01-15 00:10:31,569 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 1 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.000, Avg Labels: 1.150, 
		 Top 1: 0.571, Top 3: 0.889, Top 5: 0.950, 
		 F1 Micro: 0.611, F1 Macro: 0.517, Total Pos: 676,024


2017-01-15 00:11:00,784 : INFO : ****************** Epoch 11 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_11 *******************
2017-01-15 00:11:00,837 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_11/model
2017-01-15 00:11:08,032 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_11/model.docvecs.* with mmap=None
2017-01-15 00:11:08,033 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_11/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-15 00:11:23,070 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-15 00:27:53,665 : INFO : Getting Validation Embeddings
2017-01-15 00:27:53,669 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.042, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.618, F1 Macro: 0.521, Total Pos: 2,575,841


2017-01-15 00:28:35,753 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.004, Avg Labels: 1.150, 
		 Top 1: 0.570, Top 3: 0.888, Top 5: 0.951, 
		 F1 Micro: 0.609, F1 Macro: 0.515, Total Pos: 681,804


2017-01-15 00:29:05,469 : INFO : ****************** Epoch 12 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_12 *******************
2017-01-15 00:29:05,477 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_12/model
2017-01-15 00:29:22,842 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_12/model.docvecs.* with mmap=None
2017-01-15 00:29:22,843 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_12/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-15 00:29:38,264 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-15 00:48:33,428 : INFO : Getting Validation Embeddings
2017-01-15 00:48:33,431 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.043, Avg Labels: 1.150, 
		 Top 1: 0.594, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.618, F1 Macro: 0.521, Total Pos: 2,573,969


2017-01-15 00:49:16,532 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.002, Avg Labels: 1.150, 
		 Top 1: 0.569, Top 3: 0.891, Top 5: 0.952, 
		 F1 Micro: 0.610, F1 Macro: 0.516, Total Pos: 680,421


2017-01-15 00:49:46,555 : INFO : ****************** Epoch 13 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_13 *******************
2017-01-15 00:49:46,565 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_13/model
2017-01-15 00:50:04,040 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_13/model.docvecs.* with mmap=None
2017-01-15 00:50:04,041 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_13/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-15 00:50:22,966 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-15 01:07:38,644 : INFO : Getting Validation Embeddings
2017-01-15 01:07:38,648 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.041, Avg Labels: 1.150, 
		 Top 1: 0.594, Top 3: 0.898, Top 5: 0.960, 
		 F1 Micro: 0.619, F1 Macro: 0.521, Total Pos: 2,569,359


2017-01-15 01:08:24,588 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [1 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 2.983, Avg Labels: 1.150, 
		 Top 1: 0.568, Top 3: 0.892, Top 5: 0.951, 
		 F1 Micro: 0.613, F1 Macro: 0.518, Total Pos: 677,624


2017-01-15 01:08:54,628 : INFO : ****************** Epoch 14 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_14 *******************
2017-01-15 01:08:54,644 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_14/model
2017-01-15 01:09:01,954 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_14/model.docvecs.* with mmap=None
2017-01-15 01:09:01,955 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_14/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-15 01:09:20,298 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec

[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]]


2017-01-15 01:28:48,772 : INFO : Getting Validation Embeddings
2017-01-15 01:28:48,776 : INFO : ===== Loading validation vectors


** Training Metrics: Cov Err: 3.047, Avg Labels: 1.150, 
		 Top 1: 0.593, Top 3: 0.897, Top 5: 0.960, 
		 F1 Micro: 0.618, F1 Macro: 0.520, Total Pos: 2,574,237


2017-01-15 01:29:36,730 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 0]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 2.962, Avg Labels: 1.150, 
		 Top 1: 0.577, Top 3: 0.895, Top 5: 0.951, 
		 F1 Micro: 0.616, F1 Macro: 0.520, Total Pos: 674,243


2017-01-15 01:30:06,885 : INFO : ****************** Epoch 15 --- Working on doc2vec_size_200_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_15 *******************
2017-01-15 01:30:06,887 : INFO : training model with 22 workers on 391521 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=10
2017-01-15 01:30:06,888 : INFO : expecting 1651226 sentences, matching count from corpus used for vocabulary survey
2017-01-15 01:30:06,912 : INFO : Loading new batch for index: 0
2017-01-15 01:30:31,250 : INFO : Finished loading new batch
2017-01-15 01:30:32,369 : INFO : PROGRESS: at 0.00% examples, 156 words/s, in_qsize 0, out_qsize 2
2017-01-15 01:30:52,352 : INFO : PROGRESS: at 0.22% examples, 344593 words/s, in_qsize 43, out_qsize 0
2017-01-15 01:31:12,365 : INFO : PROGRESS: at 0.46% examples, 490311 words/s, in_qsize 43, out_qsize 0


In [36]:
# Epoch whose saved model is loaded by this cell. The same value feeds both the model
# name and the log line so the two cannot disagree (the log used to print the global
# `epoch`, which only matched the hard-coded 6 by coincidence of kernel state).
EPOCH_TO_LOAD = 6
GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(EPOCH_TO_LOAD)
info("****************** Epoch {} --- Working on {} *******************".format(EPOCH_TO_LOAD, GLOBAL_VARS.MODEL_NAME))

# if we have the model, just load it; nothing is trained in this cell, so a missing
# model is reported instead of silently keeping whatever doc2vec_model held before
model_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)
if os.path.exists(model_path):
    doc2vec_model = Doc2Vec.load(model_path)
else:
    logging.warning("No saved model found at {}; doc2vec_model was left unchanged".format(model_path))

2017-01-07 08:32:48,888 : INFO : ****************** Epoch 6 --- Working on doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6 *******************
2017-01-07 08:32:48,891 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model
2017-01-07 08:33:14,046 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.docvecs.* with mmap=None
2017-01-07 08:33:14,048 : INFO : loading doctag_syn0 from /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-07 08:33:21,936 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_mode

In [35]:
%%time
ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

# get the word2vec analogy accuracy score
%time word2vec_result = doc2vec_model.accuracy(word2vec_questions_file, restrict_vocab=None)
epoch_word2vec_metrics.append(word2vec_result)
pickle.dump(word2vec_result, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME,
                                               WORD2VEC_METRICS_FILENAME), 'w'))


2017-01-07 08:05:32,383 : INFO : saving Doc2Vec object under /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model, separately None
2017-01-07 08:05:32,384 : INFO : storing numpy array 'doctag_syn0' to /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.docvecs.doctag_syn0.npy
2017-01-07 08:06:07,546 : INFO : storing numpy array 'syn1neg' to /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.syn1neg.npy
2017-01-07 08:06:22,413 : INFO : not storing attribute syn0norm
2017-01-07 08:06:22,414 : INFO : storing numpy array 'syn0' to /big/s/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_500_w_8_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_6/model.syn

CPU times: user 3h 47min 44s, sys: 4h 31min 2s, total: 8h 18min 46s
Wall time: 21min 14s
CPU times: user 3h 48min 38s, sys: 4h 31min 12s, total: 8h 19min 51s
Wall time: 23min 18s


In [75]:
from multiprocessing.dummy import Pool as ThreadPool 

In [43]:
import thesis.utils.metrics

In [84]:
# Pull a small sample (the first 101 documents) out of the validation set so the
# threaded and sequential inference cells below can be timed on identical input.
validation_docs_iterator = DocumentBatchGenerator(validation_preprocessed_files_prefix, 
                                                  validation_preprocessed_docids_files_prefix, batch_size=None)
doc_contents = []
i = 0
for i, (doc_id, doc_contents_array) in enumerate(validation_docs_iterator, 1):
    doc_contents.append((doc_id, doc_contents_array))
    # stop once the counter passes 100, i.e. after the 101st document was stored
    if i > 100:
        break

2016-12-28 22:22:21,536 : INFO : Loading new batch
2016-12-28 22:22:27,109 : INFO : Finished loading new batch


In [133]:
def infer_one_doc(doc):
    """
    Infer the doc2vec vector of one document with the global doc2vec_model.

    doc: indexable pair whose item 0 is the document id and item 1 is what gets passed
        to infer_vector (presumably the token list -- confirm against the generator).
    Returns a (document id, inferred vector) tuple.
    """
    # Per-document reseeding of doc2vec_model.random stays disabled here; the timing
    # cells seed the model once before they start inferring.
    doc_id, doc_words = doc[0], doc[1]
    return (doc_id, doc2vec_model.infer_vector(doc_words))

Threaded Version

In [114]:
%%time
pool = ThreadPool(16)
doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
threaded_reps = pool.map(infer_one_doc, doc_contents)


CPU times: user 1min 41s, sys: 620 ms, total: 1min 41s
Wall time: 9.83 s


Non-Threaded Version

In [112]:
%%time
reps = []
doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
for doc in doc_contents:
    reps.append((doc[0], doc2vec_model.infer_vector(doc[1])))

CPU times: user 14 s, sys: 44 ms, total: 14.1 s
Wall time: 13.9 s


In [92]:
# First 20 document ids, in the order the thread pool returned them.
[pair[0] for pair in threaded_reps[:20]]

[u'08825480',
 u'08774433',
 u'08791071',
 u'08912011',
 u'08678092',
 u'08859194',
 u'08635554',
 u'08914715',
 u'08740442',
 u'08740792',
 u'08741891',
 u'08889791',
 u'08845058',
 u'08675352',
 u'08910298',
 u'08908470',
 u'07336611',
 u'07370801',
 u'08923495',
 u'08730828']

In [94]:
# First 20 document ids, in the order the sequential loop produced them.
[pair[0] for pair in reps[:20]]

[u'08825480',
 u'08774433',
 u'08791071',
 u'08912011',
 u'08678092',
 u'08859194',
 u'08635554',
 u'08914715',
 u'08740442',
 u'08740792',
 u'08741891',
 u'08889791',
 u'08845058',
 u'08675352',
 u'08910298',
 u'08908470',
 u'07336611',
 u'07370801',
 u'08923495',
 u'08730828']

In [100]:
# pool.map should keep input order: check that both runs list the ids identically.
np.array_equal([pair[0] for pair in reps], [pair[0] for pair in threaded_reps])

True

In [113]:
# Vector inferred for the first sampled document by the sequential loop.
reps[0][1]

array([ 0.30695215,  0.62732637, -0.16665867,  0.21510798, -0.31053102,
        0.24912256,  0.27854046, -0.18195362, -0.2556676 , -0.5964365 ,
        0.21105912, -0.23973639, -0.03185667, -0.07150706,  0.34752986,
       -0.10195051, -0.21096784, -0.16357803, -0.36328176,  0.69572109,
        0.56532162, -0.2350243 ,  0.29052514,  0.08191228,  0.35617095,
       -0.04608935, -0.22245102, -0.2092436 ,  0.03193387,  0.20119652,
        0.41143674, -0.00198068,  0.2738685 ,  0.53701001, -0.1117554 ,
       -0.03540101, -0.34937236, -0.79319656, -0.24756837,  0.25518459,
       -0.13143119, -0.28934225,  0.40138   , -0.98963302,  0.13317154,
       -0.78089136,  0.02822817,  0.09919885,  0.06839398,  1.14812255,
       -0.35712692,  0.03212974,  0.31967002,  0.01885306,  0.32403627,
        0.06881366, -0.36663699, -0.06164655, -0.50977266,  0.13202219,
        0.34584206, -0.23481339, -0.26995379, -0.05701207,  0.09176121,
        0.05095135, -0.33242008,  0.24291369,  0.01117826,  0.10

In [116]:
# Vector inferred for the same first document by the thread-pool run. The printed values
# differ from the sequential result even though both runs seed the model identically.
# NOTE(review): presumably the worker threads share one RandomState -- confirm.
threaded_reps[0][1]

array([ 0.44398868,  0.72295773, -0.1431679 ,  0.12140259, -0.35566956,
        0.25285348,  0.25987589, -0.25635543, -0.27375746, -0.68261814,
        0.2418227 , -0.31993562, -0.04068605, -0.19011304,  0.25827357,
       -0.23827454, -0.14665996, -0.24676035, -0.31205919,  0.7144047 ,
        0.51341397, -0.27819353,  0.2297533 ,  0.21143083,  0.21526954,
       -0.12819654, -0.23137507, -0.18031113, -0.0302615 ,  0.25606248,
        0.46331209,  0.03231328,  0.29240945,  0.55419838, -0.08406049,
       -0.02534914, -0.30697635, -0.89490503, -0.25361407, -0.0620365 ,
       -0.10279118, -0.24122375,  0.2858358 , -0.89883715,  0.2349384 ,
       -0.77614403,  0.07704009,  0.00219567,  0.05001302,  1.00935435,
       -0.43593982,  0.03736721,  0.49705869, -0.04218138,  0.37695912,
        0.0875724 , -0.41761422,  0.01351045, -0.63688326,  0.19897321,
        0.29641187, -0.23571339, -0.14795278, -0.12061672,  0.1306804 ,
        0.2521036 , -0.36068025,  0.26970887, -0.00897445,  0.17

In [121]:
import itertools

Simple threading. The problem is that `pool.map` consumes the whole iterable up front, so every batch gets loaded before any result is available.

In [120]:
%%time
validation_docs_iterator = DocumentBatchGenerator(validation_preprocessed_files_prefix, 
                                                  validation_preprocessed_docids_files_prefix, batch_size=None)
pool = ThreadPool(16)
threaded_reps = pool.map(infer_one_doc, validation_docs_iterator)

2016-12-28 23:52:07,981 : INFO : Loading new batch for index: 0


CPU times: user 24 ms, sys: 12 ms, total: 36 ms
Wall time: 26.4 ms


2016-12-28 23:52:14,764 : INFO : Finished loading new batch
2016-12-28 23:52:16,320 : INFO : Loading new batch for index: 10000
2016-12-28 23:52:36,742 : INFO : Finished loading new batch
2016-12-28 23:52:37,438 : INFO : Loading new batch for index: 12412
2016-12-28 23:52:37,487 : INFO : No more batches to load, exiting at index: 12412


More advanced threading: feed the pool one mini-batch at a time using `itertools.islice`

In [140]:
def nothing_func(doc):
    """
    Do nothing with `doc` and return None.

    Presumably a no-op stand-in for infer_one_doc when measuring pure pool overhead;
    no caller is visible in this part of the notebook.
    """
    return None

In [158]:
%%time
validation_docs_iterator = DocumentBatchGenerator2(validation_preprocessed_files_prefix, 
                                                  validation_preprocessed_docids_files_prefix, batch_size=None)
generator_func = validation_docs_iterator.__iter__()
pool = ThreadPool(16)
# map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
threaded_reps = {}
mini_batch_size = 1000
while True:
    threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(generator_func, mini_batch_size))
    info("Finished: {}".format(str(validation_docs_iterator.curr_index)))
    if threaded_reps_partial:
        #threaded_reps.extend(threaded_reps_partial)
        threaded_reps.update(threaded_reps_partial)
    else:
        break

2016-12-29 02:08:42,347 : INFO : Loading new batch for index: 0
2016-12-29 02:08:49,943 : INFO : Finished loading new batch
2016-12-29 02:09:13,387 : INFO : Finished: 1000
2016-12-29 02:09:36,895 : INFO : Finished: 2000


KeyboardInterrupt: 

In [154]:
class DocumentBatchGenerator2(object):
    """Iterate over preprocessed documents stored in consecutive batch files.

    A batch is stored in two files whose names are the given prefixes followed by the
    value of ``curr_index`` at load time: a text file with one space-separated document
    per line, and a pickle with the matching document ids.

    Iterating yields ``(doc_id, tokens)`` pairs when ``batch_size`` is None; otherwise each
    document is cut into ``LabeledSentence`` chunks of at most ``batch_size`` tokens, all
    tagged with the document id.
    """
    def __init__(self, filename_prefix, filename_docids_prefix, batch_size=10000):
        """
        batch_size cant be > 10,000 due to a limitation in doc2vec training,
        None means no batching (only use for inference)
        """
        # Test for None first: ordering None against an int only works by accident on
        # Python 2 and raises TypeError on Python 3.
        assert batch_size is None or batch_size <= 10000
        self.filename_prefix = filename_prefix
        self.filename_docids_prefix = filename_docids_prefix
        self.curr_lines = []
        self.curr_docids = []
        self.batch_size = batch_size
        self.curr_index = 0
        # -1 guarantees that the first iteration step triggers a batch load
        self.batch_end = -1

    def load_new_batch_in_memory(self):
        """Load the batch files named after ``self.curr_index`` into memory.

        Replaces ``curr_lines`` / ``curr_docids`` and updates ``batch_end``.

        Raises:
            StopIteration: when a batch file cannot be opened (IOError), which is
                treated as "no more batches".
        """
        # reset both buffers (the original reset a misspelled `self.docids` attribute)
        self.curr_lines, self.curr_docids = [], []
        info("Loading new batch for index: {}".format(self.curr_index) )
        try:
            with open(self.filename_prefix + str(self.curr_index)) as preproc_file:
                for line in preproc_file:
                    self.curr_lines.append(line.split(" "))
            # Binary mode plus a context manager: the handle is closed deterministically.
            # NOTE: pickle.load executes arbitrary code; only point this at trusted files.
            with open(self.filename_docids_prefix + str(self.curr_index), "rb") as docids_file:
                self.curr_docids = pickle.load(docids_file)
            self.batch_end = self.curr_index + len(self.curr_lines) - 1
            info("Finished loading new batch")
        except IOError:
            info("No more batches to load, exiting at index: {}".format(self.curr_index))
            raise StopIteration()

    def __iter__(self):
        """Yield documents (or document chunks) batch by batch until the files run out.

        NOTE(review): with a non-None ``batch_size``, ``curr_index`` advances once per
        yielded chunk rather than once per document, so it can run ahead of the document
        count that ``batch_end`` is computed from -- confirm this matches how the batch
        files are named.
        """
        while True:
            if self.curr_index > self.batch_end:
                try:
                    self.load_new_batch_in_memory()
                except StopIteration:
                    # End the generator explicitly; letting StopIteration escape from a
                    # generator body becomes a RuntimeError under PEP 479.
                    return
            for (doc_id, tokens) in zip(self.curr_docids, self.curr_lines):
                if self.batch_size is not None:
                    curr_batch_iter = 0
                    # divide the document to batches according to the batch size
                    while curr_batch_iter < len(tokens):
                        self.curr_index += 1
                        yield LabeledSentence(words=tokens[curr_batch_iter: curr_batch_iter + self.batch_size], tags=[doc_id])
                        curr_batch_iter += self.batch_size
                else:
                    self.curr_index += 1
                    yield doc_id, tokens


In [139]:
len(validation_docs_list)

12412

In [138]:
len(threaded_reps)

13000

In [None]:
%%time
validation_docs_iterator = DocumentBatchGenerator(validation_preprocessed_files_prefix, 
                                                  validation_preprocessed_docids_files_prefix, batch_size=None)
pool = ThreadPool(20)
# map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
threaded_reps = []
mini_batch_size = 1000
while True:
    threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(validation_docs_iterator, mini_batch_size))
    info("Finished: {}".format(str(validation_docs_iterator.curr_index)))
    if threaded_reps_partial:
        threaded_reps.extend(threaded_reps_partial)
    else:
        break

In [126]:
def g():
    """Yield the integers 0 through 49 in increasing order."""
    el = 0
    while el < 50:
        yield el
        el += 1

# islice takes only the first N items and leaves the remainder in the generator
go = g()
result = []
N = 10
for i in itertools.islice(go, N):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [73]:
validation_docs_iterator = DocumentBatchGenerator(validation_preprocessed_files_prefix, 
                                                  validation_preprocessed_docids_files_prefix, batch_size=None)
doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
i = 0
val_docs_reps = {}
for (doc_id, doc_contents_array) in validation_docs_iterator:
    i += 1
    %time val_docs_reps[doc_id] = doc2vec_model.infer_vector(doc_contents_array, steps=15)
    if i > 2:
        break
    
print val_docs_reps

2016-12-28 18:35:17,009 : INFO : Loading new batch
2016-12-28 18:35:21,915 : INFO : Finished loading new batch


CPU times: user 848 ms, sys: 8 ms, total: 856 ms
Wall time: 852 ms
CPU times: user 448 ms, sys: 12 ms, total: 460 ms
Wall time: 457 ms
CPU times: user 248 ms, sys: 4 ms, total: 252 ms
Wall time: 250 ms
{u'08774433': array([-0.24957576, -0.44971526, -0.56399536,  0.26340374, -0.72280848,
        0.07701001,  0.53518784,  0.47400331,  0.00103833, -0.21523826,
        0.5090881 , -0.66132861, -0.54403561,  0.44076878,  0.00470332,
       -0.60674977,  0.25171441,  0.01955804, -0.42058256,  0.12502177,
       -0.16908602, -0.77621526,  0.66039973,  0.22638585, -0.14937432,
        0.18073724, -0.22520301, -0.01612019, -0.94866085, -0.56705993,
       -0.31372947, -0.61444628,  0.36096638,  0.05321291,  0.31520829,
       -0.78794104, -0.02634728,  0.27075273, -0.69757801, -0.11887208,
        0.24548931, -0.37358913,  0.30241317, -0.02301121, -0.16444607,
        0.32210201, -0.49894542,  0.47786587, -0.7696104 ,  0.57316011,
        0.75851572, -0.29875955, -0.39299953,  0.29592195,  0.01

In [196]:
X, y = get_training_data(doc2vec_model, classifications)

In [202]:
yc = np.array(y)

In [203]:
yc.shape

(49789, 8)

In [None]:
from sklearn.svm import SVC

In [219]:
# Linear SVM trained with SGD (hinge loss, L2 penalty), wrapped so that one binary
# classifier is built per label.
sgd_svm = linear_model.SGDClassifier(
    loss='hinge', penalty='l2',
    # alpha is the 1/C parameter
    alpha=0.001, fit_intercept=True, n_iter=10,
    # n_jobs=-1 means use all cpus
    shuffle=True, verbose=1, epsilon=0.1, n_jobs=-1,
    # eta0 is the learning rate when we use constant configuration
    random_state=SVM_SEED, learning_rate='optimal', eta0=0.0,
    class_weight=None, warm_start=False)
clf = OneVsRestClassifier(estimator=sgd_svm, n_jobs=1)

# clf = OneVsRestClassifier(estimator=SVC(kernel='linear'), n_jobs=-1)

In [222]:
# Predict on the training matrix itself and compute metrics on those predictions.
# NOTE(review): no clf.fit(...) call is visible in this part of the notebook; this cell
# relies on `clf` having been fitted by a cell that is not shown.
yp = clf.predict(X)

print yp

# NOTE(review): yp (hard 0/1 predictions) is passed as both the second and third argument;
# if the third argument of get_metrics is meant to be continuous scores, something like
# clf.decision_function(X) may be intended -- confirm against get_metrics' signature.
training_metrics = get_metrics(np.array(y), yp, yp)

[[0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [173]:
# Toy sanity check: fit an SGDClassifier with default parameters on four 2-D points
# labelled 'a' / 'b'.
# NOTE: this rebinds the notebook-level names `X` and `clf`, which other cells also use
# (e.g. `yp = clf.predict(X)`); running it in between those cells changes their inputs.
import numpy as np
from sklearn import linear_model
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
Y = np.array(['a', 'a', 'b', 'b'])
clf = linear_model.SGDClassifier()
clf.fit(X, Y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [None]:
clf.predict([[-0.8, -1]])

In [26]:
best_svm_epoch = 13

In [29]:
# Load the doc2vec model that was saved for the epoch selected in `best_svm_epoch`.
GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(best_svm_epoch)
best_model_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)
doc2vec_model = Doc2Vec.load(best_model_path)

2016-11-26 18:25:14,192 : INFO : loading Doc2Vec object from /mnt/data2/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_13/model
2016-11-26 18:25:14,584 : INFO : loading docvecs recursively from /mnt/data2/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_13/model.docvecs.* with mmap=None
2016-11-26 18:25:14,585 : INFO : loading doctag_syn0 from /mnt/data2/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_13/model.docvecs.doctag_syn0.npy with mmap=None
2016-11-26 18:25:14,814 : INFO : loading syn1neg from /mnt/data2/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_13/model.syn1neg.npy with mmap

#### Prepare Training Data

In [31]:
%%time
classifications = sections

NN_OUTPUT_NEURONS = len(classifications)
one_hot_encoder = OneHotEncoder(classifications)
training_data = []
training_labels = []
for doc_id in training_docs_list:
    # converting from memmap to a normal array
    normal_array = []
    normal_array[:] = doc2vec_model.docvecs[doc_id][:]
    training_data.append(normal_array)
    eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
    training_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))

CPU times: user 1.61 s, sys: 520 ms, total: 2.13 s
Wall time: 1.93 s


#### Prepare Validation Data

In [32]:
%%time

validation_labels = []
validation_data = pickle.load(open(
        os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)
))
print validation_data.shape

for validation_doc_id in validation_docs_list:
    eligible_classifications = [clssf for clssf in doc_classification_map[validation_doc_id] if clssf in classifications]
    validation_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))

(1969, 3000)
CPU times: user 2.58 s, sys: 360 ms, total: 2.94 s
Wall time: 2.93 s


#### Create Keras NN

In [173]:
import itertools
from keras.layers import Dropout

In [None]:
%%time
history_list = []
hidden_layer_sizes = [500,1500,3000,4500,6000]
activations = ['relu','sigmoid', 'tanh', 'linear', 'softmax']
input_dropout_options = [False, True]
hidden_dropout_options = [False, True]

params = list(itertools.product(hidden_layer_sizes, activations, input_dropout_options, hidden_dropout_options))
for layer_size, activation_func, input_dropout_do, hidden_dropout_do in params:
    print "===================================================================================\n" + \
          "========== Layer Size: {}, Activation: {}, Input Dropout: {}, Hidden Dropout: {} ==========================""".format(layer_size, activation_func, input_dropout_do, hidden_dropout_do)
    doc_input = Input(shape=(DOC2VEC_SIZE,), name='doc_input')
    if input_dropout_do:
        hidden = Dropout(0.7)(doc_input)
    hidden = Dense(layer_size, activation=activation_func, name='hidden_layer_{}'.format(activation_func))(doc_input if not input_dropout_do else hidden)
    if hidden_dropout_do:
        hidden = Dropout(0.5)(hidden)
    softmax_output = Dense(NN_OUTPUT_NEURONS, activation='sigmoid', name='softmax_output')(hidden)
    model = Model(input=doc_input, output=softmax_output)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', 
                  metrics=['accuracy', 'fbeta_score', theano_coverage_error])
    model.summary()
    history = model.fit(x=training_data, y=training_labels, 
          validation_data=(validation_data, validation_labels), 
          nb_epoch=NN_EPOCHS, verbose=0)
    history_list.append(history)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
doc_input (InputLayer)           (None, 3000)          0                                            
____________________________________________________________________________________________________
hidden_layer_relu (Dense)        (None, 500)           1500500     doc_input[0][0]                  
____________________________________________________________________________________________________
softmax_output (Dense)           (None, 8)             4008        hidden_layer_relu[0][0]          
Total params: 1504508
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Co

In [186]:
# Persist the collected training histories. The context manager guarantees the file is
# flushed and closed; the original left the handle to be closed by garbage collection.
with open('/mnt/data2/shalaby/history_list_sample_0.0001.pickle', 'wb') as history_file:
    pickle.dump(history_list, history_file)

In [181]:
print len(history_list)

100


In [185]:
# Print, for every trained configuration, the largest validation coverage error seen
# across its epochs.
for history in history_list:
    hist = history.history
    # NOTE(review): despite its name, this variable holds the maximum of the
    # 'val_coverage error' series, not an f-beta score. For coverage error lower is
    # better, so max() reports the worst epoch -- confirm whether min() was intended.
    max_val_fbeta = max(hist['val_coverage error'])
    print max_val_fbeta

2.66835957339
2.51193499238
2.40680548502
2.281361097
2.76536312849
2.42001015744
2.3844591163
2.3092940579
3.03910614525
2.50482478415
2.52412392077
2.38547486034
2.89588623667
2.85017775521
2.46267140681
2.45962417471
2.91010665312
2.81513458608
3.01218892839
3.13052310818
2.61097003555
2.64144235653
2.39918740477
2.29456576943
2.89893346877
2.5876079228
2.49212798375
2.42153377349
3.01422041646
2.63484002031
2.50837988827
2.39969527679
2.87455561199
2.94261046216
2.50431691214
2.52615540884
2.71203656679
2.96901980701
2.96597257491
3.17013712544
2.81665820213
2.87760284408
2.39918740477
2.3001523616
3.03098019299
2.67394616557
2.64347384459
2.6719146775
2.95226003047
2.68359573388
2.50279329609
2.49771457593
2.89233113255
2.9939055358
2.54647028949
2.63941086846
2.93143727781
2.97206703911
3.06094464195
3.22346368715
2.93905535805
3.12341289995
2.39969527679
2.32453021838
3.03859827324
2.76231589639
2.66124936516
2.64550533266
2.96800406298
2.75825292026
2.5281868969
2.60589131539
2

In [147]:
# Single-hidden-layer network: doc2vec vector -> ReLU hidden layer -> one output unit
# per classification.
doc_input = Input(shape=(DOC2VEC_SIZE,), name='doc_input')
hidden = Dense(NN_HIDDEN_NEURONS, activation='relu', name='hidden_layer')(doc_input)
# The layer is named 'softmax_output' but uses a sigmoid activation, i.e. each label gets
# an independent score, which is what the binary_crossentropy loss below operates on.
softmax_output = Dense(NN_OUTPUT_NEURONS, activation='sigmoid', name='softmax_output')(hidden)
model = Model(input=doc_input, output=softmax_output)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy', 'fbeta_score', theano_coverage_error])

In [148]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
doc_input (InputLayer)           (None, 3000)          0                                            
____________________________________________________________________________________________________
hidden_layer (Dense)             (None, 4500)          13504500    doc_input[0][0]                  
____________________________________________________________________________________________________
softmax_output (Dense)           (None, 8)             36008       hidden_layer[0][0]               
Total params: 13540508
____________________________________________________________________________________________________


In [162]:
model.fit(x=training_data, y=training_labels, 
          validation_data=(validation_data, validation_labels), 
          nb_epoch=1, verbose=1)

Train on 8979 samples, validate on 1969 samples
Epoch 1/1


In [143]:
val_prediction = model.predict(validation_data)

In [144]:
val_prediction[:2]

array([[  2.70263344e-01,   3.49998474e-02,   3.13610617e-05,
          1.08150870e-03,   3.11665332e-07,   7.09001958e-01,
          4.97711152e-02,   2.63609409e-01],
       [  1.70166213e-02,   5.36046147e-01,   2.61311390e-04,
          4.87348643e-06,   1.27638310e-01,   9.57404263e-03,
          3.39831635e-02,   2.13784515e-03]], dtype=float32)

In [146]:
validation_labels[:2]

[[0, 0, 0, 0, 0, 0, 1, 1], [0, 1, 0, 0, 0, 0, 0, 0]]

In [124]:
# Symbolic (Theano) computation of a row-wise coverage measure, compiled into
# theano_coverage_err_func(y_true, y_score).
# Assumes y_true is a 0/1 label-indicator matrix and y_score the predicted scores, with
# one row per sample and one column per label -- TODO confirm against callers.
y_true = T.matrix('y_true')
y_score = T.matrix('y_score')

y_score_true = y_true * y_score # mark the scores of actually true labels
# index positions of the labels that are NOT true
zero_true_elem = T.eq(y_true, 0).nonzero()
# Overwrite those positions with a large sentinel so they cannot win the row-wise min
# below. This only works while every real score is below 100.
# NOTE(review): a row without any true label ends up with min == 100 -- confirm that
# such rows cannot occur or that a per-row count of 0 is acceptable for them.
y_score_masked = T.set_subtensor(y_score_true[zero_true_elem], 100)
#zero_elements = T.eq(true_scores,0)
min_true_scores = T.min(y_score_masked, axis=1, keepdims=True) # we do keepdims in order to keep the broadcastable columns
# per row: how many labels are scored at least as high as the lowest-scored true label
coverage_per_row = (y_score >= min_true_scores).sum(axis=1)
# average of the per-row counts
coverage = T.mean(coverage_per_row)
theano_coverage_err_func = function(inputs=[y_true, y_score], outputs=coverage)

In [109]:
uu = np.array([[1,0,1],[0,0,1]], dtype='float32')

In [118]:
yc = T.set_subtensor(y_true[T.eq(y_true,0).nonzero()], 100)

In [119]:
yc.eval({y_true: uu})

array([[   1.,  100.,    1.],
       [ 100.,  100.,    1.]], dtype=float32)

In [None]:
# Scratch expression completed to the working form used when building `yc`: replace every
# zero entry of y_true with 100. (The original `y_true[]` was a syntax error.)
T.set_subtensor(y_true[T.eq(y_true, 0).nonzero()], 100)

In [81]:
validation_labels[:1]

[[0, 0, 0, 0, 0, 0, 1, 1]]

In [82]:
val_prediction[:1]

array([[  8.08178342e-08,   1.02713175e-05,   4.41441728e-28,
          1.19779872e-36,   0.00000000e+00,   1.00000000e+00,
          2.75591228e-05,   2.31643662e-06]], dtype=float32)

In [136]:
%%time
theano_coverage_err_func(validation_labels, val_prediction)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 8.01 ms


array(2.5129507364144237)

In [98]:
np.multiply(validation_labels[:1] , 100) + val_prediction[:1]

array([[  8.08178342e-08,   1.02713175e-05,   4.41441728e-28,
          1.19779872e-36,   0.00000000e+00,   1.00000000e+00,
          1.00000028e+02,   1.00000002e+02]])

In [97]:
val_prediction[0,7] + 100

100.00000231643662

In [101]:
(np.multiply(validation_labels[:1] , -100) + val_prediction[:1]).min(axis=1) 

array([-99.99999768])

In [137]:
%%time
coverage_error(validation_labels, val_prediction)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.51 ms


2.5129507364144237

In [47]:
model.predict(training_data[:5])

array([[  1.43536258e-08,   2.79983366e-03,   1.15342544e-08,
          7.79373941e-17,   5.66347130e-03,   9.99988794e-01,
          6.53333089e-04,   5.85054120e-20],
       [  2.85919495e-02,   4.81873751e-02,   1.16070651e-05,
          2.11304723e-04,   9.95886266e-01,   7.23218254e-05,
          2.44005350e-03,   9.50191250e-08],
       [  1.57324195e-01,   4.55876261e-01,   1.18607618e-01,
          1.59025192e-02,   5.29134236e-02,   3.10803294e-01,
          7.60788023e-02,   4.09732945e-02],
       [  8.80629957e-01,   8.06344330e-01,   8.66507888e-01,
          8.60296586e-06,   8.48201476e-03,   3.66728357e-03,
          4.01142472e-03,   6.63176891e-09],
       [  3.67009136e-07,   2.43742179e-04,   4.91571154e-05,
          3.91778943e-16,   2.10585220e-08,   9.19317733e-03,
          2.35959844e-04,   9.99994278e-01]], dtype=float32)

In [69]:
training_labels[:5]

[[0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1]]

In [60]:
model.predict(validation_data[:5])

array([[  1.07762730e-06,   4.37068702e-05,   8.50037267e-16,
          3.26617112e-20,   5.99386426e-25,   9.99954700e-01,
          4.03022398e-08,   5.06598099e-07],
       [  7.84522370e-02,   7.51568982e-03,   3.02292941e-10,
          1.27776785e-27,   8.58146071e-01,   5.19246235e-02,
          3.96136660e-03,   9.01197339e-09],
       [  8.13455582e-01,   3.37661535e-04,   1.86206713e-01,
          2.93652289e-25,   1.42611062e-11,   8.11548398e-11,
          1.56509191e-13,   3.87454735e-10],
       [  9.48790824e-10,   4.76847440e-02,   2.83465356e-01,
          2.84471139e-27,   8.79647612e-15,   5.65862817e-15,
          4.19551939e-01,   2.49298021e-01],
       [  1.14569569e-10,   1.01953819e-12,   1.14190914e-01,
          4.46660243e-30,   3.54068044e-18,   8.85809124e-01,
          1.19672533e-14,   2.22166152e-09]], dtype=float32)

In [61]:
validation_labels[:5]

[[0, 0, 0, 0, 0, 0, 1, 1],
 [0, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0]]

In [None]:
%%time
%matplotlib notebook
graph = MetricsGraph()
graph.init_graph()
# when resuming, resume from an epoch with a previously created doc2vec model to get the learning rate right
start_from = 1
for epoch in range(start_from,DOC2VEC_MAX_EPOCHS+1):
    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    info("****************** Epoch {} --- Working on {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))
    
    # if we have the model, just load it, otherwise train the previous model
    if os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)):
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
    else:
        # train the doc2vec model
        doc2vec_model.train(sentences=StochasticDocumentGenerator(training_file, training_docs_list, line_positions), 
                            report_delay=REPORT_DELAY)
        #doc2vec_model.alpha -= 0.001  # decrease the learning rate
        #doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
        doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
        GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

#     # Training and validation of SVMs using those docvecs
#     train_classifications(sections)
#     validation_vectors_matrix = get_validation_docs_with_inference(doc2vec_model, doc_classification_map)
#     metrics = do_validation(validation_vectors_matrix, doc_classification_map, sections, "sections")
#     ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
#                                              GLOBAL_VARS.SVM_MODEL_NAME))
#     pickle.dump(metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, GLOBAL_VARS.SVM_MODEL_NAME, METRICS), 'w'))
#     print "Coverage Error: {}, Average No of Labels: {}, Top 1: {}, Top 3: {}, Top 5: {}, F1 Micro: {}, Total Positive: {}".format(
#         metrics['coverage_error'], metrics['average_num_of_labels'], metrics['top_1'], metrics['top_3'], metrics['top_5'], 
#         metrics['f1_micro'], metrics['total_positive'])
                                                                                     
#     epoch_metrics.append(metrics)
#     graph.add_metrics_to_graph(metrics, epoch)


<IPython.core.display.Javascript object>

2016-11-27 12:06:37,744 : INFO : ****************** Epoch 1 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_1 *******************
2016-11-27 12:06:37,755 : INFO : training model with 12 workers on 243681 vocabulary and 51000 features, using sg=0 hs=0 sample=1e-05 negative=10
2016-11-27 12:06:37,756 : INFO : expecting 49789 sentences, matching count from corpus used for vocabulary survey
2016-11-27 12:07:05,285 : INFO : PROGRESS: at 0.00% examples, 40 words/s, in_qsize 23, out_qsize 0
2016-11-27 12:08:05,390 : INFO : PROGRESS: at 0.18% examples, 1894 words/s, in_qsize 24, out_qsize 0
2016-11-27 12:09:05,994 : INFO : PROGRESS: at 0.37% examples, 2579 words/s, in_qsize 24, out_qsize 0
2016-11-27 12:10:06,043 : INFO : PROGRESS: at 0.61% examples, 2862 words/s, in_qsize 24, out_qsize 0
2016-11-27 12:11:06,643 : INFO : PROGRESS: at 0.83% examples, 2995 words/s, in_qsize 24, out_qsize 0
2016-11-27 12:12:07,595 : INFO : PROGRESS: at

## Plot loaded metrics

In [48]:
doc2vec_model_save_location

'/home/local/shalaby/parameter_search_doc2vec_models/sample_0.0001'

In [32]:
%matplotlib notebook
graph = MetricsGraph()
graph.init_graph()
print placeholder_model_name + "_" + GLOBAL_VARS.SVM_MODEL_NAME

for epoch in range(1,DOC2VEC_MAX_EPOCHS+1):
    try:
        model_name = placeholder_model_name.format(epoch)
        metrics = pickle.load(open(os.path.join(doc2vec_model_save_location, model_name, GLOBAL_VARS.SVM_MODEL_NAME, METRICS)))
        print "Epoch {:02d}: Coverage Error -> {:.2f}".format(epoch, metrics['coverage_error'])
        graph.add_metrics_to_graph(metrics, epoch)
    except IOError:
        break

<IPython.core.display.Javascript object>

doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None_curriter_{}_iter_1000_reg_0.001
Epoch 01: Coverage Error -> 6.56
Epoch 02: Coverage Error -> 6.51
Epoch 03: Coverage Error -> 6.56
Epoch 04: Coverage Error -> 6.45
Epoch 05: Coverage Error -> 6.49
Epoch 06: Coverage Error -> 6.39
Epoch 07: Coverage Error -> 6.48
Epoch 08: Coverage Error -> 6.55
Epoch 09: Coverage Error -> 6.51
Epoch 10: Coverage Error -> 6.54
Epoch 11: Coverage Error -> 6.59
Epoch 12: Coverage Error -> 6.50
Epoch 13: Coverage Error -> 4.30
Epoch 14: Coverage Error -> 4.44
Epoch 15: Coverage Error -> 4.58
Epoch 16: Coverage Error -> 4.75
Epoch 17: Coverage Error -> 4.79
Epoch 18: Coverage Error -> 4.75
Epoch 19: Coverage Error -> 4.68
Epoch 20: Coverage Error -> 4.69


In [119]:
[metric['coverage_error'] for metric in epoch_metrics]

[6.0518029456576938,
 4.3138649060436771,
 3.7851701371254443,
 1.4565769426104622,
 1.3819197562214323]