In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import logging
from logging import info
from functools import partial

In [2]:
# Reset logging: remove every handler already attached to the root logger so that
# re-running this cell does not leave previously attached handlers in place.
root = logging.getLogger()
for handler in root.handlers[:]:  # iterate over a copy because handlers are removed inside the loop
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
# When True, a "sample_<SAMPLE_RATIO>/" sub-directory is appended to save_parent_location.
IS_SAMPLE = True

In [4]:
# Random seeds. DOC2VEC_SEED is passed to the Doc2Vec constructor;
# NOTE(review): SVM_SEED is not referenced anywhere in the visible part of the notebook.
SVM_SEED = 1234
DOC2VEC_SEED = 1234

In [5]:
# Placeholder tokens that stemtokenizer() substitutes for numbers, currency amounts and
# chemical names. The "inidicator" spelling is part of the emitted token value, so
# changing it would change the tokens produced by stemtokenizer().
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_WORD_COUNT = 5  # passed to Doc2Vec as min_count
MIN_SIZE = 0  # minimum token length kept by stemtokenizer()
NUM_CORES = 20  # passed to Doc2Vec as workers

In [6]:
# Used as a mutable global namespace: later cells assign attributes directly on this
# namedtuple *class* (e.g. GLOBAL_VARS.DOC2VEC_MODEL = ...) rather than creating instances.
# NOTE(review): MODEL_NAME is read by several functions but is never assigned in the
# visible part of the notebook - confirm it is set before training/validation runs.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL', 'SVM_MODEL_NAME'])

In [7]:
# Sample ratio; its string form is embedded in the input file names and output directories below.
SAMPLE_RATIO = 0.0001

In [8]:
# File and directory names used underneath doc2vec_model_save_location.
VOCAB_MODEL = "vocab_model"  # sub-directory holding the vocabulary-only Doc2Vec model
MODEL_PREFIX = "model"  # file name of the saved Doc2Vec model inside that directory
VALIDATION_MATRIX = "validation_matrix.pkl"  # cached inferred validation vectors
METRICS = "metrics.pkl"  # NOTE(review): not referenced in the visible part of the notebook

In [9]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"

# HDFS location under which the per-classification SVM models are stored.
# NOTE(review): hosts and absolute paths are hardcoded in this cell; consider moving
# them to a single configurable base directory.
save_parent_location = "hdfs://deka.cip.ifi.lmu.de/pg-vectors/"
if IS_SAMPLE: 
    save_parent_location = save_parent_location + "sample_" + str(SAMPLE_RATIO) + "/"


# Local-disk locations for the exported inputs and the doc2vec models.
root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

# Create the doc2vec model directory and its vocabulary sub-directory when missing.
doc2vec_model_save_location = os.path.join(root_location, "parameter_search_doc2vec_models", "sample_" + str(SAMPLE_RATIO))
if not os.path.exists(doc2vec_model_save_location):
    os.makedirs(doc2vec_model_save_location)
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL)):
    os.makedirs(os.path.join(doc2vec_model_save_location, VOCAB_MODEL))

#training_file = root_location + "docs_output.json"
# Text file read line by line by the Training/Validation document generators.
training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)

# Pickled inputs exported by an earlier step.
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"

In [10]:
%%time
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes
    (the previous ``pickle.load(open(path))`` form never closed the handle).
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Inputs exported by an earlier step: the document -> classifications map, the label
# lists for each hierarchy level, and the training / validation document id lists.
doc_classification_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
classes = _load_pickle(classes_file)
subclasses = _load_pickle(subclasses_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)

CPU times: user 28.6 s, sys: 1.7 s, total: 30.3 s
Wall time: 31.1 s


In [11]:
# Sanity check: number of training document ids loaded.
len(training_docs_list)

8979

In [12]:
# Sanity check: number of validation document ids loaded.
len(validation_docs_list)

1969

In [13]:
def stemtokenizer(text):
    """Split ``text`` on whitespace and return a list of normalised tokens.

    Every token is lower-cased and stripped of leading/trailing punctuation.
    Tokens that look like a currency amount, a number or a chemical name are
    replaced by CURRENCY_INDICATOR, NUMBER_INDICATOR or CHEMICAL_INDICATOR
    respectively. Tokens that are empty after stripping, or shorter than
    MIN_SIZE, are dropped.

    Note: "$" is itself a member of ``string.punctuation``. Stripping all
    punctuation before calling is_currency() (as this function used to do)
    removed the leading "$", so the currency branch could never be taken and
    "$5" was reported as a plain number. Currency is now detected on a copy
    of the token that keeps its "$". Vocabulary models cached with the old
    tokenisation should be rebuilt if currency tokens matter.

    :param text: the raw document text
    :return: list of token strings, in document order
    """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    punctuation_without_dollar = string.punctuation.replace("$", "")
    stems = []  # result
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        stem = lowered.strip(string.punctuation)
        if not stem:
            # nothing but punctuation (this includes a lone "$")
            continue
        # Same token, but with a leading "$" preserved for the currency test.
        currency_candidate = lowered.strip(punctuation_without_dollar)
        if currency_candidate and is_currency(currency_candidate):
            stem = CURRENCY_INDICATOR
        elif is_number(stem):
            stem = NUMBER_INDICATOR
        elif is_chemical(stem):
            stem = CHEMICAL_INDICATOR
        if len(stem) >= MIN_SIZE:
            # extract uni-grams
            stems.append(stem)
    return stems

def is_number(str):
    """Return True if the given string is a number (float or int).

    Thousands separators (",") are ignored. The string must contain at least
    one digit: float() on its own also accepts the words "nan", "inf" and
    "infinity", which would otherwise be treated as numbers.

    :param str: the token to test (the name shadows the builtin; kept for
        backward compatibility with existing callers)
    :return: bool
    """
    candidate = str.replace(",", "")
    if not any(ch.isdigit() for ch in candidate):
        return False
    try:
        float(candidate)
        return True
    except ValueError:
        return False

def is_currency(str):
    """Return True if the given string starts with a dollar sign.

    Uses startswith() so that an empty string yields False instead of the
    IndexError raised by indexing ``str[0]``.
    """
    return str.startswith("$")

def is_chemical(str):
    """Heuristic check for chemical names: True when the string holds more than three hyphens."""
    hyphen_total = sum(1 for ch in str if ch == "-")
    return hyphen_total > 3

In [14]:
def get_sparse_training_vector(classification, term_list, classifications, number_of_terms):
    """Build a binary LabeledPoint with sparse features for one document.

    This helper used to be called ``get_training_vector``. A later definition
    in this cell reuses that name with a three-argument (dense vector)
    signature, which rebinds the name and made the four-argument call in
    train_level_new() fail with a TypeError. The sparse variant therefore has
    its own name.

    :param classification: the label being trained (one-vs-rest)
    :param term_list: sparse feature entries, passed as-is to SparseVector
    :param classifications: the labels assigned to the document
    :param number_of_terms: size of the sparse vector
    :return: LabeledPoint with label 1 if ``classification`` is among the
        document's labels, otherwise 0
    """
    clss = 1 if classification in classifications else 0
    return LabeledPoint(clss, SparseVector(number_of_terms, term_list))

def train_level_new(docs_index, classification, doc_classification_map, number_of_terms):
    """Train a binary SVM for ``classification`` from an RDD of sparse postings.

    :param docs_index: RDD of ``(doc_id, postings)`` pairs
    :param classification: the label being trained (one-vs-rest)
    :param doc_classification_map: mapping doc_id -> labels of that document
    :param number_of_terms: size of the sparse feature vectors
    :return: tuple ``(training_vectors, svm)`` - the RDD of LabeledPoints and
        the trained model
    """
    def to_labeled_point(doc_entry):
        # Explicit unpacking instead of a tuple-parameter lambda (Python 2 only syntax).
        doc_id, postings = doc_entry
        return get_sparse_training_vector(classification, postings,
                                          doc_classification_map[doc_id], number_of_terms)

    training_vectors = docs_index.map(to_labeled_point)
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return training_vectors, svm

def get_binary(scores):
    """Threshold scores at zero: 1 where the score is > 0, otherwise 0.

    Implemented as a single array comparison. The previous
    ``np.vectorize(lambda ...)`` version looped over the elements in Python
    and raised on empty input, because np.vectorize cannot infer the output
    type from a size-0 array.

    :param scores: scalar or array-like of numeric scores
    :return: integer ndarray with the same shape as ``scores``
    """
    return np.asarray(np.asarray(scores) > 0).astype(int)


def get_top_N_percentage(y_score, y_true, max_N=3):
    """
    Get percentage of correct labels that are in the top N scores

    For every row, the labels are ranked by descending score and the true
    labels (entries equal to 1 in ``y_true``) found within the first
    ``max(max_N, number of true labels in the row)`` ranks are counted. The
    depth is widened for rows with more than ``max_N`` true labels so that
    every true label of such a row can still be found.

    :param y_score: 2-D array of scores, one row per document
    :param y_true: 2-D array of the same shape holding 1 for true labels
    :param max_N: ranking depth to inspect
    :return: found true labels divided by all true labels, as a float;
        0.0 when ``y_true`` contains no true label at all (this used to raise
        ZeroDivisionError)
    """
    num_all_true = 0
    num_found_in_max_N = 0
    for row in range(y_score.shape[0]):
        y_true_row = y_true[row, :]
        desc_score_indices = np.argsort(y_score[row, :])[::-1]

        num_true_in_row = int(np.sum(y_true_row == 1))
        num_all_true += num_true_in_row

        # only look at the score list till depth N, but make sure to also account for the
        # case where the number of true labels for the current row is higher than N
        depth = max(max_N, num_true_in_row)
        if depth > 0:
            top_indices = desc_score_indices[:depth]
            num_found_in_max_N += int(np.sum(y_true_row[top_indices] == 1))
    if num_all_true == 0:
        return 0.0
    return float(num_found_in_max_N) / num_all_true


def _per_label_scores(metric_fn, y_true, y_binary_score):
    """Apply ``metric_fn(true_column, predicted_column)`` to every label column; return a list of floats."""
    return [float(metric_fn(y_true[:, col], y_binary_score[:, col]))
            for col in range(y_true.shape[1])]


def get_metrics(y_true, y_score, y_binary_score):
    """Compute the multi-label evaluation metrics for one validation run.

    :param y_true: 2-D array (documents x labels) with 1 for true labels
    :param y_score: 2-D array of raw decision scores, same shape
    :param y_binary_score: 2-D array of thresholded (0/1) predictions, same shape
    :return: dict with the input arrays, aggregate metrics (coverage error,
        average precision, precision / recall / F1 in micro and macro
        flavours), per-label precision / recall / F1 lists and the
        top-1 / top-3 / top-5 hit ratios
    """
    metrics = {}
    metrics['total_positive'] = np.sum(np.sum(y_binary_score))
    #TODO remove these three arrays when running on the whole set to avoid excessive storage costs
    metrics['y_true'] = y_true
    metrics['y_score'] = y_score
    metrics['y_binary_score'] = y_binary_score
    # NOTE(review): coverage_error and average_precision_score are ranking metrics, yet they
    # are fed the thresholded predictions rather than y_score. Left unchanged so the values
    # stay comparable with earlier runs - confirm whether y_score was intended.
    metrics['coverage_error'] = coverage_error(y_true, y_binary_score)
    # float() guards against integer division under Python 2 when y_true has an integer dtype
    metrics['average_num_of_labels'] = float(np.sum(y_true)) / y_true.shape[0]
    metrics['average_precision_micro'] = sklearn.metrics.average_precision_score(y_true, y_binary_score, average='micro')
    metrics['average_precision_macro'] = sklearn.metrics.average_precision_score(y_true, y_binary_score, average='macro')
    metrics['precision_micro'] = sklearn.metrics.precision_score(y_true, y_binary_score, average='micro')
    metrics['precision_macro'] = sklearn.metrics.precision_score(y_true, y_binary_score, average='macro')
    metrics['recall_micro'] = sklearn.metrics.recall_score(y_true, y_binary_score, average='micro')
    metrics['recall_macro'] = sklearn.metrics.recall_score(y_true, y_binary_score, average='macro')
    metrics['f1_micro'] = sklearn.metrics.f1_score(y_true, y_binary_score, average='micro')
    metrics['f1_macro'] = sklearn.metrics.f1_score(y_true, y_binary_score, average='macro')

    # one value per label column
    metrics['precision_scores_array'] = _per_label_scores(sklearn.metrics.precision_score, y_true, y_binary_score)
    metrics['recall_scores_array'] = _per_label_scores(sklearn.metrics.recall_score, y_true, y_binary_score)
    metrics['f1_scores_array'] = _per_label_scores(sklearn.metrics.f1_score, y_true, y_binary_score)

    metrics['top_1'] = get_top_N_percentage(y_score, y_true, max_N=1)
    metrics['top_3'] = get_top_N_percentage(y_score, y_true, max_N=3)
    metrics['top_5'] = get_top_N_percentage(y_score, y_true, max_N=5)

    return metrics

def model_exists(path):
    """Return True if an SVM model can be loaded from ``path``, otherwise False.

    Existence is probed by actually loading the model with the global
    SparkContext ``sc``; any ordinary exception raised by the load is treated
    as "no model". Only ``Exception`` is caught, so KeyboardInterrupt and
    SystemExit are no longer swallowed as they were by the bare ``except``.
    """
    try:
        SVMModel.load(sc, path)
        return True
    except Exception:
        return False
    
def get_training_vector(classification, dense_vector, classifications):
    """Wrap a dense feature vector in a binary LabeledPoint.

    The label is 1 when ``classification`` is one of ``classifications``
    (the labels of the document) and 0 otherwise.
    """
    is_member = classification in classifications
    label = 1 if is_member else 0
    return LabeledPoint(label, dense_vector)

def train_level_doc2vec(classification, doc_classification_map):
    """Train a binary SVM for ``classification`` on the doc2vec vectors of the training documents.

    Reads the model from GLOBAL_VARS.DOC2VEC_MODEL and the document ids from
    the global ``training_docs_list``; uses the global SparkContext ``sc``.

    :return: tuple ``(training_vectors, svm)`` - the RDD of LabeledPoints and
        the trained model
    """
    model = GLOBAL_VARS.DOC2VEC_MODEL
    # Each document vector is copied into a plain Python list, because spark is
    # unable to convert a memmap to a spark Vector.
    labeled_points = [
        get_training_vector(classification,
                            list(model.docvecs[doc_id][:]),
                            doc_classification_map[doc_id])
        for doc_id in training_docs_list
    ]
    info("Finished getting training vectors")
    labeled_rdd = sc.parallelize(labeled_points)
    info("Finished parallelization")
    svm = SVMWithSGD.train(labeled_rdd, iterations=SVM_ITERATIONS,
                           convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return labeled_rdd, svm

In [15]:
def ensure_hdfs_location_exists(location):
    """Run ``hdfs dfs -mkdir -p <location>`` so that the HDFS directory exists.

    The command is invoked with an argument list rather than through a
    shell-interpreted string, so paths containing spaces or shell
    metacharacters are passed through verbatim. This stays a best-effort
    call: a missing ``hdfs`` executable or a non-zero exit status is logged
    as a warning instead of being silently ignored, and nothing is raised.

    NOTE(review): the directory created is ``location`` itself (the unused
    ``parent`` local that was computed here suggests the parent directory may
    have been intended) - confirm against how the model path is saved.
    """
    import subprocess

    command = ["hdfs", "dfs", "-mkdir", "-p", location]
    try:
        exit_code = subprocess.call(command)
    except OSError as err:
        logging.warning("Could not run '%s': %s", " ".join(command), err)
        return
    if exit_code != 0:
        logging.warning("'%s' exited with status %s", " ".join(command), exit_code)

def ensure_disk_location_exists(location):
    """Create ``location`` (including missing parent directories) on the local disk when it is absent."""
    if os.path.exists(location):
        return
    os.makedirs(location)

In [16]:
def train_classifications(classifications):
    """Train and save one binary SVM per classification, skipping those whose model already loads.

    Models are stored under the path returned by get_svm_model_path() for
    GLOBAL_VARS.MODEL_NAME. Any failure is reported on stdout and re-raised.
    """
    info("====== Doing Training")
    for classification in classifications:
        print(classification)
        try:
            svm_path = get_svm_model_path(GLOBAL_VARS.MODEL_NAME, classification)
            if model_exists(svm_path):
                print("Model Exists")
                continue
            _, svm = train_level_doc2vec(classification, doc_classification_map)
            svm.save(sc, svm_path)
        except:
            # report which classification failed, then propagate the original error
            print("Problem creating: %s: %s" % (classification, GLOBAL_VARS.MODEL_NAME))
            raise

In [17]:
def do_validation(validation_vectors_matrix, doc_classification_map, classifications, classifications_name):
    """Score the validation documents with every saved per-classification SVM and compute metrics.

    :param validation_vectors_matrix: 2-D array, one row per entry of the
        global ``validation_docs_list``
    :param doc_classification_map: mapping doc_id -> labels of that document
    :param classifications: the labels to evaluate; one saved SVM model is
        loaded per label
    :param classifications_name: currently unused; kept for interface
        compatibility
    :return: the metrics dict produced by get_metrics()
    """
    info("====== Doing Validation")

    doc_count = validation_vectors_matrix.shape[0]
    y_score = np.zeros((doc_count, len(classifications)))
    y_true = np.zeros((doc_count, len(classifications)))

    for column, classification in enumerate(classifications):
        print(classification)

        validation_vectors = get_validation_doc2vec_spark_vectors(validation_vectors_matrix,
                                                                  classification, doc_classification_map)
        binarySvm = SVMModel.load(sc, get_svm_model_path(GLOBAL_VARS.MODEL_NAME, classification))
        info("Loaded the model, Doing the prediction now....")
        # without a threshold, predict() returns the raw score instead of a 0/1 label
        binarySvm.clearThreshold()
        binarySvmB = sc.broadcast(binarySvm)
        try:
            # using the broadcasted binarySvm variable, fixes global name 'binarySvm' is not defined as this
            # variable was not available in the workers, so we pass it explicitly to the mapper using partial
            labels_predictions = validation_vectors.map(
                partial(lambda svm, p: (p.label, svm.value.predict(p.features)), binarySvmB)
            ).collect()
        finally:
            # one broadcast is created per classification; release its cached copies on the
            # executors once the predictions are collected
            binarySvmB.unpersist()
        y_true[:, column] = [label_pred[0] for label_pred in labels_predictions]
        y_score[:, column] = [label_pred[1] for label_pred in labels_predictions]
    y_binary_score = get_binary(y_score)
    metrics = get_metrics(y_true, y_score, y_binary_score)
    return metrics


In [18]:
def get_validation_docs_with_inference(doc2vec_model, doc_classification_map):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    The resulting matrix is cached as VALIDATION_MATRIX inside the directory of
    the current model (GLOBAL_VARS.MODEL_NAME); when that file exists it is
    loaded instead of running inference again.

    :param doc2vec_model: model whose ``infer_vector`` is applied to each
        validation document
    :param doc_classification_map: currently unused; kept for interface
        compatibility
    :return: 2-D array with one row per entry of ``validation_docs_list``, in
        that order
    """
    model_dir = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
    matrix_path = os.path.join(model_dir, VALIDATION_MATRIX)

    if os.path.exists(matrix_path):
        info("===== Loading validation vectors")
        # binary mode + context manager: pickles are binary data and the handle must be closed
        with open(matrix_path, 'rb') as matrix_file:
            return pickle.load(matrix_file)

    info("===== Getting validation vectors with inference")

    # do inference and store results in dict
    validation_documents_reps = {}
    documents = ValidationDocumentGenerator(training_file, validation_docs_list)
    for i, (doc_id, doc_contents_array) in enumerate(documents, 1):
        if i % 1000 == 0: info("Finished: {}".format(str(i)))
        validation_documents_reps[doc_id] = doc2vec_model.infer_vector(doc_contents_array)

    # create matrix for the validation vectors, ordered like validation_docs_list
    validation_vectors_matrix = np.array(
        [validation_documents_reps[validation_doc_id] for validation_doc_id in validation_docs_list])

    # the model directory may not exist yet the first time a model is evaluated
    ensure_disk_location_exists(model_dir)
    with open(matrix_path, 'wb') as matrix_file:
        pickle.dump(validation_vectors_matrix, matrix_file, pickle.HIGHEST_PROTOCOL)

    return validation_vectors_matrix

In [19]:
def get_validation_doc2vec_spark_vectors(validation_vectors_matrix, classification, doc_classification_map):
    """Turn the validation matrix into an RDD of binary LabeledPoints for ``classification``.

    Row ``k`` of ``validation_vectors_matrix`` is paired with the ``k``-th id
    of the global ``validation_docs_list``; the label is derived from that
    document's entry in ``doc_classification_map``.
    """
    labeled_points = [
        get_training_vector(classification,
                            validation_vectors_matrix[row],
                            doc_classification_map[doc_id])
        for row, doc_id in enumerate(validation_docs_list)
    ]
    labeled_rdd = sc.parallelize(labeled_points)
    info("Finished getting validation vectors")
    return labeled_rdd

In [20]:
class MetricsGraph:
    """Accumulates per-epoch metrics and plots them on a two-panel matplotlib figure.

    The left panel shows coverage error and average number of labels, the
    right panel shows precision / recall / F1 (micro and macro) and the top-N
    ratios. init_graph() must be called before draw(), since the figure and
    axes are only created there.
    """

    def __init__(self):
        # left panel series
        self.coverage_errors = []
        self.average_num_labels = []

        # right panel series: micro / macro scores
        self.f1_micros = []
        self.precision_micros = []
        self.recall_micros = []
        self.f1_macros = []
        self.precision_macros = []
        self.recall_macros = []

        # right panel series: top-N ratios
        self.top_1s = []
        self.top_3s = []
        self.top_5s = []

        # x values shared by all series
        self.epochs = []

        # created by init_graph()
        self.fig = None
        self.ax = None
        self.ax2 = None

    def init_graph(self):
        """Create the figure and its two side-by-side axes."""
        self.fig = plt.figure(figsize=(12,6), dpi=80)
        self.ax = plt.subplot(121)
        self.ax2 = plt.subplot(122)
        self.fig.subplots_adjust(top=0.72, bottom=0.1, left=0.05, right=0.95)
        for axes in (self.ax, self.ax2):
            axes.set_xlabel("Epochs")

    def _add_metrics(self, metrics, epoch):
        """Append the values of one metrics dict (and its epoch) to the stored series."""
        direct_series = (
            (self.coverage_errors, 'coverage_error'),
            (self.average_num_labels, 'average_num_of_labels'),
            (self.f1_micros, 'f1_micro'),
            (self.precision_micros, 'precision_micro'),
            (self.recall_micros, 'recall_micro'),
            (self.f1_macros, 'f1_macro'),
            (self.precision_macros, 'precision_macro'),
            (self.recall_macros, 'recall_macro'),
        )
        for series, key in direct_series:
            series.append(metrics[key])

        # top-N values are recomputed from the stored arrays when the dict lacks them
        top_series = (
            (self.top_1s, 'top_1', 1),
            (self.top_3s, 'top_3', 3),
            (self.top_5s, 'top_5', 5),
        )
        for series, key, depth in top_series:
            if key in metrics:
                value = metrics[key]
            else:
                value = get_top_N_percentage(metrics['y_score'], metrics['y_true'], max_N=depth)
            series.append(value)

        self.epochs.append(epoch)

    def add_metrics_to_graph(self, metrics, epoch, draw_now=True):
        """Record the metrics of ``epoch`` and, unless ``draw_now`` is false, redraw the figure."""
        self._add_metrics(metrics, epoch)
        if draw_now:
            self.draw()

    def draw(self):
        """Plot every stored series on the two axes and refresh the canvas."""
        x_start = self.epochs[0]
        x_end = self.epochs[-1]
        legend_layout = dict(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2,
                             mode="expand", borderaxespad=0.)

        # ---- left panel ----
        left_handles = []
        for values, fmt, label in ((self.coverage_errors, 'r-', 'Coverage Error'),
                                   (self.average_num_labels, 'g-', 'Avg Num. of Labels')):
            line, = self.ax.plot(self.epochs, values, fmt, label=label)
            left_handles.append(line)
        self.ax.legend(handles=left_handles, **legend_layout)
        self.ax.axis([x_start, x_end, 0, 10])
        # show the average number of labels as a separate y-tick, plus the best coverage
        # error so far (the previous best tick is dropped first)
        best_coverage = [min(self.coverage_errors)]
        if len(self.coverage_errors) > 1:
            stale_coverage = [min(self.coverage_errors[:-1])]
        else:
            stale_coverage = []
        kept_ticks = list(set(self.ax.get_yticks()) - set(stale_coverage))
        self.ax.set_yticks(kept_ticks + [self.average_num_labels[0]] + best_coverage)

        # ---- right panel ----
        right_handles = []
        for values, fmt, label in ((self.f1_micros, 'g-', 'F1 Micro'),
                                   (self.precision_micros, 'r-', 'Precision Micro'),
                                   (self.recall_micros, 'b-', 'Recall Micro'),
                                   (self.f1_macros, 'g--', 'F1 Macro'),
                                   (self.precision_macros, 'r--', 'Precision Macro'),
                                   (self.recall_macros, 'b--', 'Recall Macro'),
                                   (self.top_1s, 'g-.', 'Top 1 %'),
                                   (self.top_3s, 'r-.', 'Top 3 %'),
                                   (self.top_5s, 'b-.', 'Top 5 %')):
            line, = self.ax2.plot(self.epochs, values, fmt, label=label)
            right_handles.append(line)
        self.ax2.legend(handles=right_handles, **legend_layout)
        self.ax2.axis([x_start, x_end, 0, 1])
        # mark the best micro F1 so far with its own y-tick
        best_f1_micro = [max(self.f1_micros)]
        if len(self.f1_micros) > 1:
            stale_f1_micro = [max(self.f1_micros[:-1])]
        else:
            stale_f1_micro = []
        kept_ticks2 = list(set(self.ax2.get_yticks()) - set(stale_f1_micro))
        self.ax2.set_yticks(kept_ticks2 + best_f1_micro)

        self.fig.canvas.draw()

### Doc2vec and SVM Parameters

In [21]:
# Doc2Vec hyper-parameters; most are passed to the Doc2Vec constructor in a later cell
# and all but the sampling/epoch settings are embedded in the model name.
DOC2VEC_SIZE = 3000  # passed as size
DOC2VEC_WINDOW = 8  # passed as window
DOC2VEC_MAX_VOCAB_SIZE = None  # passed as max_vocab_size
DOC2VEC_SAMPLE = 1e-5  # passed as sample
DOC2VEC_TYPE = 1  # passed as dm; 1 is labelled 'dm' in the model name, anything else 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 1  # passed as hs
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 0  # passed as negative
DOC2VEC_CONCAT = 1  # passed as dm_concat
DOC2VEC_MEAN = 0  # NOTE(review): only appears in the model name; it is not passed to the Doc2Vec constructor
DOC2VEC_TRAIN_WORDS = 0  # passed as dbow_words
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 20  # NOTE(review): not referenced in the visible part of the notebook
REPORT_DELAY = 60 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 1000 # report the progress every x terms

In [22]:
# SVMWithSGD training parameters (iterations, convergenceTol and regParam respectively).
SVM_ITERATIONS = 1000
SVM_CONVERGENCE = 0.001
SVM_REG = 0.1
# Stored as an attribute on the GLOBAL_VARS class; same "iter_<n>_reg_<r>" pattern
# that get_svm_model_path() uses for its directory name.
GLOBAL_VARS.SVM_MODEL_NAME = 'iter_{}_reg_{}'.format(SVM_ITERATIONS, SVM_REG)

In [23]:
def get_svm_model_path(method, classification, reg=SVM_REG, iterations=SVM_ITERATIONS):
    """Return the HDFS path of the SVM model for ``classification`` under ``method``.

    The path has the form
    ``<save_parent_location>/models/<method>/iter_<iterations>_reg_<reg>/<classification>_model.svm``.
    As a side effect, ensure_hdfs_location_exists() is invoked on the
    resulting path every time this function is called. The defaults for
    ``reg`` and ``iterations`` are the values SVM_REG / SVM_ITERATIONS had
    when this function was defined.
    """
    params_dir = "_".join(["iter", str(iterations), "reg", str(reg)])
    model_file = classification + "_model.svm"
    location = os.path.join(save_parent_location, "models", method, params_dir, model_file)
    ensure_hdfs_location_exists(location)
    return location

class TrainingDocumentGenerator(object):
    """Re-iterable stream of LabeledSentence objects for the training documents.

    Every non-blank line of ``filename`` is evaluated to a ``(doc_id, text)``
    pair; documents whose id is in ``training_docs_list`` are tokenised with
    stemtokenizer() and yielded with ``tags=[doc_id]``.
    """
    def __init__(self, filename, training_docs_list):
        self.filename = filename
        self.training_docs_list = training_docs_list
    def __iter__(self):
        # Set membership is O(1); testing against the list was O(len(list)) for every line.
        # Built per iteration so later changes to training_docs_list are picked up.
        wanted_doc_ids = frozenset(self.training_docs_list)
        # The context manager closes the file when iteration ends.
        with open(self.filename) as source:
            for line in source:
                if not line.strip(): continue
                # NOTE(review): eval() executes arbitrary code found in the file. Only use with
                # trusted input; ast.literal_eval is a safer choice if every line is a plain literal.
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])
                
class ValidationDocumentGenerator(object):
    """Re-iterable stream of ``(doc_id, tokens)`` pairs for the validation documents.

    Every non-blank line of ``filename`` is evaluated to a ``(doc_id, text)``
    pair; documents whose id is in ``validation_docs_list`` are tokenised
    with stemtokenizer() and yielded.
    """
    def __init__(self, filename, validation_docs_list):
        self.filename = filename
        self.validation_docs_list = validation_docs_list
    def __iter__(self):
        # Set membership is O(1); testing against the list was O(len(list)) for every line.
        # Built per iteration so later changes to validation_docs_list are picked up.
        wanted_doc_ids = frozenset(self.validation_docs_list)
        # The context manager closes the file when iteration ends.
        with open(self.filename) as source:
            for line in source:
                if not line.strip(): continue
                # NOTE(review): eval() executes arbitrary code found in the file. Only use with
                # trusted input; ast.literal_eval is a safer choice if every line is a plain literal.
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield doc_id, stemtokenizer(text)

## Create the Doc2vec model

In [24]:
# Model name template encoding the doc2vec hyper-parameters. The trailing
# "_curriter_{}" placeholder is left unformatted here so it can be filled in later.
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(DOC2VEC_SIZE, 
                                                                DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE))
placeholder_model_name = placeholder_model_name + "_curriter_{}"
placeholder_model_name

'doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_{}'

In [25]:
# Untrained Doc2Vec model configured from the constants above.
doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE , window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT, 
                max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm dm=1 => PV-DM, any other value (conventionally dm=0) => PV-DBOW,
                # PV-DM dictates CBOW for words - see the gensim Doc2Vec documentation
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # would train words with skip-gram on top of cbow, we don't need that for now
                dbow_words=DOC2VEC_TRAIN_WORDS,
                iter=DOC2VEC_EPOCHS)

# Make the model available to train_level_doc2vec() through the GLOBAL_VARS namespace.
GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

In [26]:
%%time
# Build the vocabulary once and cache it on disk; on later runs, load the cached
# vocabulary-only model and copy its state into the freshly configured model.
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX)):
    doc2vec_model.build_vocab(sentences=TrainingDocumentGenerator(training_file, training_docs_list), 
                              progress_per=REPORT_VOCAB_PROGRESS)
    doc2vec_model.save(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
else: 
    doc2vec_model_vocab_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))
    doc2vec_model.reset_from(doc2vec_model_vocab_model)

2016-10-07 17:52:53,155 : INFO : loading Doc2Vec object from /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/vocab_model/model
2016-10-07 17:52:54,155 : INFO : loading docvecs recursively from /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/vocab_model/model.docvecs.* with mmap=None
2016-10-07 17:52:54,156 : INFO : loading syn1neg from /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/vocab_model/model.syn1neg.npy with mmap=None
2016-10-07 17:54:22,216 : INFO : loading syn0 from /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/vocab_model/model.syn0.npy with mmap=None
2016-10-07 17:54:22,392 : INFO : setting ignored attribute syn0norm to None
2016-10-07 17:54:22,393 : INFO : setting ignored attribute cum_table to None
2016-10-07 17:54:22,746 : INFO : using concatenative 51000-dimensional layer1
2016-10-07 17:54:22,747 : INFO : resetting layer weights


CPU times: user 12.9 s, sys: 5.91 s, total: 18.8 s
Wall time: 1min 42s


## Actual Training, validation and Metrics Loop

In [27]:
# NOTE(review): 0.025 looks like it is meant to pin the learning rate for the manual
# one-epoch-at-a-time training - confirm against the training loop, which is not visible here.
doc2vec_model.min_alpha = 0.025
epoch_metrics = []  # presumably one metrics dict per epoch is collected here - confirm against the loop

In [34]:
%%time
# NOTE(review): this repeats the vocabulary build done by the cached build-or-load cell
# above, and the cell's execution count (34) is out of sequence with the preceding
# cell (27) - check whether this cell is still needed for a clean top-to-bottom run.
doc2vec_model.build_vocab(sentences=TrainingDocumentGenerator(training_file, training_docs_list), 
                              progress_per=REPORT_VOCAB_PROGRESS)
#doc2vec_model.save(os.path.join(doc2vec_model_save_location, VOCAB_MODEL + "_hs", MODEL_PREFIX))

2016-10-03 19:00:38,690 : INFO : collecting all words and their counts
2016-10-03 19:00:38,740 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2016-10-03 19:01:37,770 : INFO : PROGRESS: at example #1000, processed 6730672 words (114024/s), 101429 word types, 1000 tags
2016-10-03 19:02:36,288 : INFO : PROGRESS: at example #2000, processed 13377669 words (113592/s), 173358 word types, 2000 tags
2016-10-03 19:03:32,512 : INFO : PROGRESS: at example #3000, processed 19721952 words (112841/s), 237449 word types, 3000 tags
2016-10-03 19:04:28,242 : INFO : PROGRESS: at example #4000, processed 26105730 words (114552/s), 283534 word types, 4000 tags
2016-10-03 19:05:25,009 : INFO : PROGRESS: at example #5000, processed 32474320 words (112191/s), 327931 word types, 5000 tags
2016-10-03 19:06:23,327 : INFO : PROGRESS: at example #6000, processed 39183207 words (115041/s), 377546 word types, 6000 tags
2016-10-03 19:07:20,243 : INFO : PROGRESS: at example #7000, pro

CPU times: user 8min 58s, sys: 6.32 s, total: 9min 4s
Wall time: 8min 59s


In [35]:
# Display the model's syn0 array as a quick sanity check after the vocabulary build
# (presumably the word-vector weight matrix -- TODO confirm against the gensim docs).
doc2vec_model.syn0

array([[  3.12148368e-05,   1.63139775e-04,   1.35370967e-04, ...,
         -9.58132441e-05,  -2.63783822e-05,  -2.96910421e-05],
       [ -2.87129933e-05,  -7.38100207e-05,   1.01923499e-04, ...,
          8.03500297e-05,  -8.48640993e-05,   1.07045984e-04],
       [  5.01288669e-05,   4.39545511e-05,   1.37585710e-04, ...,
         -5.13341147e-05,  -1.55190297e-04,  -1.65183010e-05],
       ..., 
       [ -1.21397643e-04,  -8.22615621e-05,  -1.23961741e-04, ...,
         -5.87554168e-05,   4.34706417e-05,   1.66552727e-05],
       [ -1.63632085e-05,   4.42360943e-05,   4.06701747e-06, ...,
          1.64604600e-04,  -1.00427724e-05,   3.35616169e-05],
       [ -1.03512852e-04,   5.39240791e-05,   4.61474156e-05, ...,
         -7.31005866e-06,  -1.27907493e-04,  -2.00724173e-07]], dtype=float32)

In [None]:
%%time
%matplotlib notebook
graph = MetricsGraph()
graph.init_graph()
# when resuming, resume from an epoch with a previously created doc2vec model to get the learning rate right
start_from = 1
for epoch in range(start_from,DOC2VEC_MAX_EPOCHS+1):
    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    info("****************** Epoch {} --- Working on {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))
    
    if not os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)):
        # if we have the model, just load it, otherwise train the previous model
        if os.path.exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)):
            doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
            GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model
        else:
            # train the doc2vec model
            doc2vec_model.train(sentences=TrainingDocumentGenerator(training_file, training_docs_list), 
                                report_delay=REPORT_DELAY)
            doc2vec_model.alpha -= 0.001  # decrease the learning rate
            doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
            ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME))
            doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))
            GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

    # Training and validation of SVMs using those docvecs
    train_classifications(sections)
    validation_vectors_matrix = get_validation_docs_with_inference(doc2vec_model, doc_classification_map)
    metrics = do_validation(validation_vectors_matrix, doc_classification_map, sections, "sections")
    ensure_disk_location_exists(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
                                             GLOBAL_VARS.SVM_MODEL_NAME))
    pickle.dump(metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, GLOBAL_VARS.SVM_MODEL_NAME, METRICS), 'w'))
    print "Coverage Error: {}, Average No of Labels: {}, Top 1: {}, Top 3: {}, Top 5: {}, F1 Micro: {}, Total Positive: {}".format(
        metrics['coverage_error'], metrics['average_num_of_labels'], metrics['top_1'], metrics['top_3'], metrics['top_5'], 
        metrics['f1_micro'], metrics['total_positive'])
                                                                  
    epoch_metrics.append(metrics)
    graph.add_metrics_to_graph(metrics, epoch)


<IPython.core.display.Javascript object>

2016-10-07 18:46:48,963 : INFO : ****************** Epoch 1 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_1 *******************


A
Model Exists
B
Model Exists
C
Model Exists
D
Model Exists
E
Model Exists
F
Model Exists
G
Model Exists
H


2016-10-07 18:47:22,577 : INFO : ===== Loading validation vectors


Model Exists




A


2016-10-07 18:47:27,131 : INFO : Finished getting validation vectors
2016-10-07 18:47:30,225 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 18:47:32,742 : INFO : Finished getting validation vectors
2016-10-07 18:47:35,846 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 18:47:39,287 : INFO : Finished getting validation vectors
2016-10-07 18:47:42,825 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 18:47:45,255 : INFO : Finished getting validation vectors
2016-10-07 18:47:48,788 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 18:47:52,342 : INFO : Finished getting validation vectors
2016-10-07 18:47:56,214 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 18:47:58,135 : INFO : Finished getting validation vectors
2016-10-07 18:48:01,477 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 18:48:03,203 : INFO : Finished getting validation vectors
2016-10-07 18:48:06,640 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 18:48:08,195 : INFO : Finished getting validation vectors
2016-10-07 18:48:11,493 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.90452006094, Average No of Labels: 1.35195530726, Top 1: 0.220510894065, Top 3: 0.385800150263, Top 5: 0.602930127724, F1 Micro: 0.248520710059, Total Positive: 7478


in singular transformations; automatically expanding.
left=1, right=1
  'left=%s, right=%s') % (left, right))
2016-10-07 18:48:12,972 : INFO : ****************** Epoch 2 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_2 *******************


A


2016-10-07 18:48:23,608 : INFO : Finished getting training vectors
2016-10-07 18:48:28,253 : INFO : Finished parallelization


B


2016-10-07 18:48:53,235 : INFO : Finished getting training vectors
2016-10-07 18:48:57,301 : INFO : Finished parallelization


C


2016-10-07 18:49:17,772 : INFO : Finished getting training vectors
2016-10-07 18:49:22,149 : INFO : Finished parallelization


D


2016-10-07 18:49:42,779 : INFO : Finished getting training vectors
2016-10-07 18:49:47,075 : INFO : Finished parallelization


E


2016-10-07 18:50:07,438 : INFO : Finished getting training vectors
2016-10-07 18:50:11,675 : INFO : Finished parallelization


F


2016-10-07 18:50:32,440 : INFO : Finished getting training vectors
2016-10-07 18:50:36,345 : INFO : Finished parallelization


G


2016-10-07 18:50:57,101 : INFO : Finished getting training vectors
2016-10-07 18:51:01,375 : INFO : Finished parallelization


H


2016-10-07 18:51:21,801 : INFO : Finished getting training vectors
2016-10-07 18:51:26,199 : INFO : Finished parallelization
2016-10-07 18:51:36,303 : INFO : ===== Loading validation vectors


A


2016-10-07 18:51:41,402 : INFO : Finished getting validation vectors
2016-10-07 18:51:44,906 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 18:51:46,375 : INFO : Finished getting validation vectors
2016-10-07 18:51:49,479 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 18:51:51,152 : INFO : Finished getting validation vectors
2016-10-07 18:51:54,638 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 18:51:56,132 : INFO : Finished getting validation vectors
2016-10-07 18:51:59,742 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 18:52:01,426 : INFO : Finished getting validation vectors
2016-10-07 18:52:04,919 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 18:52:06,390 : INFO : Finished getting validation vectors
2016-10-07 18:52:09,479 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 18:52:11,056 : INFO : Finished getting validation vectors
2016-10-07 18:52:14,459 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 18:52:16,061 : INFO : Finished getting validation vectors
2016-10-07 18:52:19,490 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.84662265109, Average No of Labels: 1.35195530726, Top 1: 0.20848985725, Top 3: 0.37490608565, Top 5: 0.605184072126, F1 Micro: 0.246994325286, Total Positive: 7735


2016-10-07 18:52:20,577 : INFO : ****************** Epoch 3 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_3 *******************


A


2016-10-07 18:52:31,002 : INFO : Finished getting training vectors
2016-10-07 18:52:35,188 : INFO : Finished parallelization


B


2016-10-07 18:52:56,199 : INFO : Finished getting training vectors
2016-10-07 18:53:00,481 : INFO : Finished parallelization


C


2016-10-07 18:53:21,716 : INFO : Finished getting training vectors
2016-10-07 18:53:25,668 : INFO : Finished parallelization


D


2016-10-07 18:53:46,199 : INFO : Finished getting training vectors
2016-10-07 18:53:50,422 : INFO : Finished parallelization


E


2016-10-07 18:54:11,241 : INFO : Finished getting training vectors
2016-10-07 18:54:15,235 : INFO : Finished parallelization


F


2016-10-07 18:54:35,205 : INFO : Finished getting training vectors
2016-10-07 18:54:39,491 : INFO : Finished parallelization


G


2016-10-07 18:54:59,799 : INFO : Finished getting training vectors
2016-10-07 18:55:03,957 : INFO : Finished parallelization


H


2016-10-07 18:55:24,334 : INFO : Finished getting training vectors
2016-10-07 18:55:28,359 : INFO : Finished parallelization
2016-10-07 18:55:38,506 : INFO : ===== Loading validation vectors


A


2016-10-07 18:55:44,025 : INFO : Finished getting validation vectors
2016-10-07 18:55:47,500 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 18:55:49,103 : INFO : Finished getting validation vectors
2016-10-07 18:55:52,436 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 18:55:54,061 : INFO : Finished getting validation vectors
2016-10-07 18:55:57,105 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 18:55:58,624 : INFO : Finished getting validation vectors
2016-10-07 18:56:02,052 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 18:56:03,900 : INFO : Finished getting validation vectors
2016-10-07 18:56:07,295 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 18:56:08,930 : INFO : Finished getting validation vectors
2016-10-07 18:56:12,223 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 18:56:13,971 : INFO : Finished getting validation vectors
2016-10-07 18:56:17,456 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 18:56:19,022 : INFO : Finished getting validation vectors
2016-10-07 18:56:22,240 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.85170137125, Average No of Labels: 1.35195530726, Top 1: 0.22539444027, Top 3: 0.393313298272, Top 5: 0.607438016529, F1 Micro: 0.247257464466, Total Positive: 7821


2016-10-07 18:56:23,229 : INFO : ****************** Epoch 4 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_4 *******************


A


2016-10-07 18:56:33,716 : INFO : Finished getting training vectors
2016-10-07 18:56:37,831 : INFO : Finished parallelization


B


2016-10-07 18:56:58,533 : INFO : Finished getting training vectors
2016-10-07 18:57:02,478 : INFO : Finished parallelization


C


2016-10-07 18:57:22,310 : INFO : Finished getting training vectors
2016-10-07 18:57:26,006 : INFO : Finished parallelization


D


2016-10-07 18:57:46,492 : INFO : Finished getting training vectors
2016-10-07 18:57:50,412 : INFO : Finished parallelization


E


2016-10-07 18:58:11,042 : INFO : Finished getting training vectors
2016-10-07 18:58:15,343 : INFO : Finished parallelization


F


2016-10-07 18:58:36,775 : INFO : Finished getting training vectors
2016-10-07 18:58:41,162 : INFO : Finished parallelization


G


2016-10-07 18:59:01,628 : INFO : Finished getting training vectors
2016-10-07 18:59:06,018 : INFO : Finished parallelization


H


2016-10-07 18:59:26,214 : INFO : Finished getting training vectors
2016-10-07 18:59:30,392 : INFO : Finished parallelization
2016-10-07 18:59:40,556 : INFO : ===== Loading validation vectors


A


2016-10-07 18:59:45,971 : INFO : Finished getting validation vectors
2016-10-07 18:59:49,503 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 18:59:51,128 : INFO : Finished getting validation vectors
2016-10-07 18:59:54,409 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 18:59:55,965 : INFO : Finished getting validation vectors
2016-10-07 18:59:59,305 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:00:00,834 : INFO : Finished getting validation vectors
2016-10-07 19:00:04,150 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:00:05,860 : INFO : Finished getting validation vectors
2016-10-07 19:00:09,113 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:00:10,774 : INFO : Finished getting validation vectors
2016-10-07 19:00:14,227 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:00:15,894 : INFO : Finished getting validation vectors
2016-10-07 19:00:19,265 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:00:20,943 : INFO : Finished getting validation vectors
2016-10-07 19:00:24,240 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.79431183342, Average No of Labels: 1.35195530726, Top 1: 0.229151014275, Top 3: 0.400826446281, Top 5: 0.611570247934, F1 Micro: 0.250596430957, Total Positive: 7817


2016-10-07 19:00:25,338 : INFO : ****************** Epoch 5 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_5 *******************


A


2016-10-07 19:00:35,914 : INFO : Finished getting training vectors
2016-10-07 19:00:39,769 : INFO : Finished parallelization


B


2016-10-07 19:01:01,679 : INFO : Finished getting training vectors
2016-10-07 19:01:06,091 : INFO : Finished parallelization


C


2016-10-07 19:01:27,172 : INFO : Finished getting training vectors
2016-10-07 19:01:31,385 : INFO : Finished parallelization


D


2016-10-07 19:01:51,645 : INFO : Finished getting training vectors
2016-10-07 19:01:55,898 : INFO : Finished parallelization


E


2016-10-07 19:02:16,753 : INFO : Finished getting training vectors
2016-10-07 19:02:21,141 : INFO : Finished parallelization


F


2016-10-07 19:02:42,697 : INFO : Finished getting training vectors
2016-10-07 19:02:46,880 : INFO : Finished parallelization


G


2016-10-07 19:03:08,812 : INFO : Finished getting training vectors
2016-10-07 19:03:12,879 : INFO : Finished parallelization


H


2016-10-07 19:03:33,820 : INFO : Finished getting training vectors
2016-10-07 19:03:37,738 : INFO : Finished parallelization
2016-10-07 19:03:47,949 : INFO : ===== Loading validation vectors


A


2016-10-07 19:03:53,448 : INFO : Finished getting validation vectors
2016-10-07 19:03:56,835 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:03:58,393 : INFO : Finished getting validation vectors
2016-10-07 19:04:01,845 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:04:03,245 : INFO : Finished getting validation vectors
2016-10-07 19:04:06,634 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:04:08,322 : INFO : Finished getting validation vectors
2016-10-07 19:04:11,679 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:04:13,208 : INFO : Finished getting validation vectors
2016-10-07 19:04:16,678 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:04:19,363 : INFO : Finished getting validation vectors
2016-10-07 19:04:22,652 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:04:24,318 : INFO : Finished getting validation vectors
2016-10-07 19:04:27,959 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:04:29,636 : INFO : Finished getting validation vectors
2016-10-07 19:04:33,113 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.76130015236, Average No of Labels: 1.35195530726, Top 1: 0.226521412472, Top 3: 0.389556724267, Top 5: 0.613448534936, F1 Micro: 0.252782493869, Total Positive: 7940


2016-10-07 19:04:34,143 : INFO : ****************** Epoch 6 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_6 *******************


A


2016-10-07 19:04:44,301 : INFO : Finished getting training vectors
2016-10-07 19:04:48,502 : INFO : Finished parallelization


B


2016-10-07 19:05:09,073 : INFO : Finished getting training vectors
2016-10-07 19:05:13,196 : INFO : Finished parallelization


C


2016-10-07 19:05:33,599 : INFO : Finished getting training vectors
2016-10-07 19:05:37,900 : INFO : Finished parallelization


D


2016-10-07 19:05:58,073 : INFO : Finished getting training vectors
2016-10-07 19:06:01,825 : INFO : Finished parallelization


E


2016-10-07 19:06:23,224 : INFO : Finished getting training vectors
2016-10-07 19:06:26,965 : INFO : Finished parallelization


F


2016-10-07 19:06:47,526 : INFO : Finished getting training vectors
2016-10-07 19:06:51,585 : INFO : Finished parallelization


G


2016-10-07 19:07:12,537 : INFO : Finished getting training vectors
2016-10-07 19:07:16,639 : INFO : Finished parallelization


H


2016-10-07 19:07:37,557 : INFO : Finished getting training vectors
2016-10-07 19:07:41,708 : INFO : Finished parallelization
2016-10-07 19:07:52,095 : INFO : ===== Loading validation vectors


A


2016-10-07 19:07:57,452 : INFO : Finished getting validation vectors
2016-10-07 19:08:00,626 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:08:02,243 : INFO : Finished getting validation vectors
2016-10-07 19:08:05,676 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:08:07,228 : INFO : Finished getting validation vectors
2016-10-07 19:08:10,482 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:08:12,019 : INFO : Finished getting validation vectors
2016-10-07 19:08:15,142 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:08:16,621 : INFO : Finished getting validation vectors
2016-10-07 19:08:19,936 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:08:21,569 : INFO : Finished getting validation vectors
2016-10-07 19:08:24,962 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:08:26,788 : INFO : Finished getting validation vectors
2016-10-07 19:08:30,023 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:08:31,435 : INFO : Finished getting validation vectors
2016-10-07 19:08:34,753 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.69730827831, Average No of Labels: 1.35195530726, Top 1: 0.238542449286, Top 3: 0.399323816679, Top 5: 0.612321562735, F1 Micro: 0.258756525866, Total Positive: 7873


2016-10-07 19:08:35,765 : INFO : ****************** Epoch 7 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_7 *******************


A


2016-10-07 19:08:46,581 : INFO : Finished getting training vectors
2016-10-07 19:08:50,894 : INFO : Finished parallelization


B


2016-10-07 19:09:11,474 : INFO : Finished getting training vectors
2016-10-07 19:09:15,699 : INFO : Finished parallelization


C


2016-10-07 19:09:35,890 : INFO : Finished getting training vectors
2016-10-07 19:09:40,142 : INFO : Finished parallelization


D


2016-10-07 19:09:59,889 : INFO : Finished getting training vectors
2016-10-07 19:10:04,037 : INFO : Finished parallelization


E


2016-10-07 19:10:24,403 : INFO : Finished getting training vectors
2016-10-07 19:10:28,092 : INFO : Finished parallelization


F


2016-10-07 19:10:47,537 : INFO : Finished getting training vectors
2016-10-07 19:10:51,824 : INFO : Finished parallelization


G


2016-10-07 19:11:11,341 : INFO : Finished getting training vectors
2016-10-07 19:11:15,495 : INFO : Finished parallelization


H


2016-10-07 19:11:35,089 : INFO : Finished getting training vectors
2016-10-07 19:11:39,224 : INFO : Finished parallelization
2016-10-07 19:11:48,948 : INFO : ===== Loading validation vectors


A


2016-10-07 19:11:53,817 : INFO : Finished getting validation vectors
2016-10-07 19:11:56,942 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:11:58,411 : INFO : Finished getting validation vectors
2016-10-07 19:12:01,403 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:12:02,972 : INFO : Finished getting validation vectors
2016-10-07 19:12:06,148 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:12:07,639 : INFO : Finished getting validation vectors
2016-10-07 19:12:10,613 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:12:12,139 : INFO : Finished getting validation vectors
2016-10-07 19:12:15,194 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:12:16,787 : INFO : Finished getting validation vectors
2016-10-07 19:12:19,938 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:12:21,564 : INFO : Finished getting validation vectors
2016-10-07 19:12:24,681 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:12:26,380 : INFO : Finished getting validation vectors
2016-10-07 19:12:29,592 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.72320975114, Average No of Labels: 1.35195530726, Top 1: 0.247933884298, Top 3: 0.409090909091, Top 5: 0.611945905334, F1 Micro: 0.25840236123, Total Positive: 7841


2016-10-07 19:12:30,614 : INFO : ****************** Epoch 8 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_8 *******************


A


2016-10-07 19:12:40,747 : INFO : Finished getting training vectors
2016-10-07 19:12:45,174 : INFO : Finished parallelization


B


2016-10-07 19:13:05,306 : INFO : Finished getting training vectors
2016-10-07 19:13:09,228 : INFO : Finished parallelization


C


2016-10-07 19:13:30,434 : INFO : Finished getting training vectors
2016-10-07 19:13:34,537 : INFO : Finished parallelization


D


2016-10-07 19:13:54,889 : INFO : Finished getting training vectors
2016-10-07 19:13:58,741 : INFO : Finished parallelization


E


2016-10-07 19:14:19,114 : INFO : Finished getting training vectors
2016-10-07 19:14:23,282 : INFO : Finished parallelization


F


2016-10-07 19:14:42,842 : INFO : Finished getting training vectors
2016-10-07 19:14:46,768 : INFO : Finished parallelization


G


2016-10-07 19:15:07,887 : INFO : Finished getting training vectors
2016-10-07 19:15:12,226 : INFO : Finished parallelization


H


2016-10-07 19:15:32,664 : INFO : Finished getting training vectors
2016-10-07 19:15:36,850 : INFO : Finished parallelization
2016-10-07 19:15:46,295 : INFO : ===== Loading validation vectors


A


2016-10-07 19:15:51,422 : INFO : Finished getting validation vectors
2016-10-07 19:15:54,869 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:15:56,441 : INFO : Finished getting validation vectors
2016-10-07 19:15:59,393 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:16:00,891 : INFO : Finished getting validation vectors
2016-10-07 19:16:04,146 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:16:05,847 : INFO : Finished getting validation vectors
2016-10-07 19:16:09,344 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:16:10,951 : INFO : Finished getting validation vectors
2016-10-07 19:16:14,198 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:16:15,661 : INFO : Finished getting validation vectors
2016-10-07 19:16:19,093 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:16:20,644 : INFO : Finished getting validation vectors
2016-10-07 19:16:23,985 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:16:25,512 : INFO : Finished getting validation vectors
2016-10-07 19:16:28,882 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.76282376841, Average No of Labels: 1.35195530726, Top 1: 0.242299023291, Top 3: 0.393313298272, Top 5: 0.609316303531, F1 Micro: 0.255889676307, Total Positive: 7780


2016-10-07 19:16:29,992 : INFO : ****************** Epoch 9 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_9 *******************


A


2016-10-07 19:16:40,689 : INFO : Finished getting training vectors
2016-10-07 19:16:45,064 : INFO : Finished parallelization


B


2016-10-07 19:17:06,991 : INFO : Finished getting training vectors
2016-10-07 19:17:11,094 : INFO : Finished parallelization


C


2016-10-07 19:17:31,365 : INFO : Finished getting training vectors
2016-10-07 19:17:35,623 : INFO : Finished parallelization


D


2016-10-07 19:17:56,151 : INFO : Finished getting training vectors
2016-10-07 19:18:00,090 : INFO : Finished parallelization


E


2016-10-07 19:18:20,419 : INFO : Finished getting training vectors
2016-10-07 19:18:24,686 : INFO : Finished parallelization


F


2016-10-07 19:18:45,034 : INFO : Finished getting training vectors
2016-10-07 19:18:49,330 : INFO : Finished parallelization


G


2016-10-07 19:19:11,432 : INFO : Finished getting training vectors
2016-10-07 19:19:15,778 : INFO : Finished parallelization


H


2016-10-07 19:19:35,861 : INFO : Finished getting training vectors
2016-10-07 19:19:40,077 : INFO : Finished parallelization
2016-10-07 19:19:49,839 : INFO : ===== Loading validation vectors


A


2016-10-07 19:19:55,336 : INFO : Finished getting validation vectors
2016-10-07 19:19:58,842 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:20:00,767 : INFO : Finished getting validation vectors
2016-10-07 19:20:04,230 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:20:05,693 : INFO : Finished getting validation vectors
2016-10-07 19:20:09,635 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:20:11,259 : INFO : Finished getting validation vectors
2016-10-07 19:20:14,952 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:20:16,413 : INFO : Finished getting validation vectors
2016-10-07 19:20:20,956 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:20:22,662 : INFO : Finished getting validation vectors
2016-10-07 19:20:25,958 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:20:27,599 : INFO : Finished getting validation vectors
2016-10-07 19:20:31,330 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:20:33,118 : INFO : Finished getting validation vectors
2016-10-07 19:20:37,222 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.75977653631, Average No of Labels: 1.35195530726, Top 1: 0.235161532682, Top 3: 0.396694214876, Top 5: 0.606686701728, F1 Micro: 0.253928945614, Total Positive: 7837


2016-10-07 19:20:38,214 : INFO : ****************** Epoch 10 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_10 *******************


A


2016-10-07 19:20:49,034 : INFO : Finished getting training vectors
2016-10-07 19:20:53,467 : INFO : Finished parallelization


B


2016-10-07 19:21:13,554 : INFO : Finished getting training vectors
2016-10-07 19:21:17,578 : INFO : Finished parallelization


C


2016-10-07 19:21:37,320 : INFO : Finished getting training vectors
2016-10-07 19:21:41,514 : INFO : Finished parallelization


D


2016-10-07 19:22:01,834 : INFO : Finished getting training vectors
2016-10-07 19:22:06,239 : INFO : Finished parallelization


E


2016-10-07 19:22:27,197 : INFO : Finished getting training vectors
2016-10-07 19:22:31,772 : INFO : Finished parallelization


F


2016-10-07 19:22:52,688 : INFO : Finished getting training vectors
2016-10-07 19:22:57,269 : INFO : Finished parallelization


G


2016-10-07 19:23:17,547 : INFO : Finished getting training vectors
2016-10-07 19:23:21,679 : INFO : Finished parallelization


H


2016-10-07 19:23:42,218 : INFO : Finished getting training vectors
2016-10-07 19:23:46,350 : INFO : Finished parallelization
2016-10-07 19:23:57,475 : INFO : ===== Loading validation vectors


A


2016-10-07 19:24:03,043 : INFO : Finished getting validation vectors
2016-10-07 19:24:06,080 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:24:07,838 : INFO : Finished getting validation vectors
2016-10-07 19:24:12,609 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:24:14,167 : INFO : Finished getting validation vectors
2016-10-07 19:24:17,419 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:24:18,972 : INFO : Finished getting validation vectors
2016-10-07 19:24:22,265 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:24:23,810 : INFO : Finished getting validation vectors
2016-10-07 19:24:27,759 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:24:29,523 : INFO : Finished getting validation vectors
2016-10-07 19:24:32,566 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:24:34,173 : INFO : Finished getting validation vectors
2016-10-07 19:24:37,993 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:24:39,539 : INFO : Finished getting validation vectors
2016-10-07 19:24:42,862 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.79024885729, Average No of Labels: 1.35195530726, Top 1: 0.232156273479, Top 3: 0.393313298272, Top 5: 0.619834710744, F1 Micro: 0.258348968105, Total Positive: 7998


2016-10-07 19:24:44,012 : INFO : ****************** Epoch 11 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_11 *******************


A


2016-10-07 19:24:55,507 : INFO : Finished getting training vectors
2016-10-07 19:24:59,679 : INFO : Finished parallelization


B


2016-10-07 19:25:21,581 : INFO : Finished getting training vectors
2016-10-07 19:25:26,117 : INFO : Finished parallelization


C


2016-10-07 19:25:47,437 : INFO : Finished getting training vectors
2016-10-07 19:25:52,010 : INFO : Finished parallelization


D


2016-10-07 19:26:13,514 : INFO : Finished getting training vectors
2016-10-07 19:26:17,883 : INFO : Finished parallelization


E


2016-10-07 19:26:39,102 : INFO : Finished getting training vectors
2016-10-07 19:26:43,242 : INFO : Finished parallelization


F


2016-10-07 19:27:03,530 : INFO : Finished getting training vectors
2016-10-07 19:27:07,208 : INFO : Finished parallelization


G


2016-10-07 19:27:26,853 : INFO : Finished getting training vectors
2016-10-07 19:27:31,241 : INFO : Finished parallelization


H


2016-10-07 19:27:51,311 : INFO : Finished getting training vectors
2016-10-07 19:27:55,122 : INFO : Finished parallelization
2016-10-07 19:28:04,597 : INFO : ===== Loading validation vectors


A


2016-10-07 19:28:09,836 : INFO : Finished getting validation vectors
2016-10-07 19:28:13,034 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:28:14,630 : INFO : Finished getting validation vectors
2016-10-07 19:28:18,137 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:28:19,689 : INFO : Finished getting validation vectors
2016-10-07 19:28:22,861 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:28:24,444 : INFO : Finished getting validation vectors
2016-10-07 19:28:28,037 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:28:29,564 : INFO : Finished getting validation vectors
2016-10-07 19:28:32,660 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:28:34,278 : INFO : Finished getting validation vectors
2016-10-07 19:28:38,188 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:28:39,844 : INFO : Finished getting validation vectors
2016-10-07 19:28:43,764 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:28:45,346 : INFO : Finished getting validation vectors
2016-10-07 19:28:49,131 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.74606399187, Average No of Labels: 1.35195530726, Top 1: 0.231404958678, Top 3: 0.405334335086, Top 5: 0.610443275733, F1 Micro: 0.256934582978, Total Positive: 7901


2016-10-07 19:28:50,180 : INFO : ****************** Epoch 12 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_12 *******************


A


2016-10-07 19:29:00,447 : INFO : Finished getting training vectors
2016-10-07 19:29:04,708 : INFO : Finished parallelization


B


2016-10-07 19:29:27,092 : INFO : Finished getting training vectors
2016-10-07 19:29:31,465 : INFO : Finished parallelization


C


2016-10-07 19:29:51,465 : INFO : Finished getting training vectors
2016-10-07 19:29:55,932 : INFO : Finished parallelization


D


2016-10-07 19:30:16,010 : INFO : Finished getting training vectors
2016-10-07 19:30:20,175 : INFO : Finished parallelization


E


2016-10-07 19:30:40,502 : INFO : Finished getting training vectors
2016-10-07 19:30:44,777 : INFO : Finished parallelization


F


2016-10-07 19:31:05,116 : INFO : Finished getting training vectors
2016-10-07 19:31:09,384 : INFO : Finished parallelization


G


2016-10-07 19:31:29,928 : INFO : Finished getting training vectors
2016-10-07 19:31:34,452 : INFO : Finished parallelization


H


2016-10-07 19:31:54,890 : INFO : Finished getting training vectors
2016-10-07 19:31:59,161 : INFO : Finished parallelization
2016-10-07 19:32:09,762 : INFO : ===== Loading validation vectors


A


2016-10-07 19:32:16,447 : INFO : Finished getting validation vectors
2016-10-07 19:32:19,914 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:32:21,445 : INFO : Finished getting validation vectors
2016-10-07 19:32:25,161 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:32:26,698 : INFO : Finished getting validation vectors
2016-10-07 19:32:30,630 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:32:32,285 : INFO : Finished getting validation vectors
2016-10-07 19:32:36,356 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:32:37,986 : INFO : Finished getting validation vectors
2016-10-07 19:32:41,556 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:32:43,237 : INFO : Finished getting validation vectors
2016-10-07 19:32:46,962 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:32:48,539 : INFO : Finished getting validation vectors
2016-10-07 19:32:52,171 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:32:53,726 : INFO : Finished getting validation vectors
2016-10-07 19:32:57,168 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.76841036059, Average No of Labels: 1.35195530726, Top 1: 0.214500375657, Top 3: 0.387302779865, Top 5: 0.601803155522, F1 Micro: 0.253707376972, Total Positive: 7925


2016-10-07 19:32:58,163 : INFO : ****************** Epoch 13 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_13 *******************


A


2016-10-07 19:33:08,672 : INFO : Finished getting training vectors
2016-10-07 19:33:12,875 : INFO : Finished parallelization


B


2016-10-07 19:33:34,291 : INFO : Finished getting training vectors
2016-10-07 19:33:38,642 : INFO : Finished parallelization


C


2016-10-07 19:33:58,745 : INFO : Finished getting training vectors
2016-10-07 19:34:03,084 : INFO : Finished parallelization


D


2016-10-07 19:34:22,772 : INFO : Finished getting training vectors
2016-10-07 19:34:27,276 : INFO : Finished parallelization


E


2016-10-07 19:34:46,330 : INFO : Finished getting training vectors
2016-10-07 19:34:50,648 : INFO : Finished parallelization


F


2016-10-07 19:35:10,140 : INFO : Finished getting training vectors
2016-10-07 19:35:14,620 : INFO : Finished parallelization


G


2016-10-07 19:35:34,548 : INFO : Finished getting training vectors
2016-10-07 19:35:39,164 : INFO : Finished parallelization


H


2016-10-07 19:35:59,100 : INFO : Finished getting training vectors
2016-10-07 19:36:03,074 : INFO : Finished parallelization
2016-10-07 19:36:11,857 : INFO : ===== Loading validation vectors


A


2016-10-07 19:36:16,685 : INFO : Finished getting validation vectors
2016-10-07 19:36:19,609 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:36:22,388 : INFO : Finished getting validation vectors
2016-10-07 19:36:25,324 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:36:26,852 : INFO : Finished getting validation vectors
2016-10-07 19:36:29,827 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:36:31,354 : INFO : Finished getting validation vectors
2016-10-07 19:36:34,284 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:36:35,902 : INFO : Finished getting validation vectors
2016-10-07 19:36:38,906 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:36:40,427 : INFO : Finished getting validation vectors
2016-10-07 19:36:43,373 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:36:44,964 : INFO : Finished getting validation vectors
2016-10-07 19:36:48,889 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:36:50,418 : INFO : Finished getting validation vectors
2016-10-07 19:36:53,997 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.78720162519, Average No of Labels: 1.35195530726, Top 1: 0.225770097671, Top 3: 0.393313298272, Top 5: 0.605935386927, F1 Micro: 0.252175558078, Total Positive: 7910


2016-10-07 19:36:55,234 : INFO : ****************** Epoch 14 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_14 *******************


A


2016-10-07 19:37:05,864 : INFO : Finished getting training vectors
2016-10-07 19:37:10,174 : INFO : Finished parallelization


B


2016-10-07 19:37:29,680 : INFO : Finished getting training vectors
2016-10-07 19:37:33,968 : INFO : Finished parallelization


C


2016-10-07 19:37:54,181 : INFO : Finished getting training vectors
2016-10-07 19:37:58,752 : INFO : Finished parallelization


D


2016-10-07 19:38:18,252 : INFO : Finished getting training vectors
2016-10-07 19:38:22,215 : INFO : Finished parallelization


E


2016-10-07 19:38:41,958 : INFO : Finished getting training vectors
2016-10-07 19:38:46,389 : INFO : Finished parallelization


F


2016-10-07 19:39:06,464 : INFO : Finished getting training vectors
2016-10-07 19:39:10,604 : INFO : Finished parallelization


G


2016-10-07 19:39:29,987 : INFO : Finished getting training vectors
2016-10-07 19:39:34,333 : INFO : Finished parallelization


H


2016-10-07 19:39:53,961 : INFO : Finished getting training vectors
2016-10-07 19:39:58,372 : INFO : Finished parallelization
2016-10-07 19:40:07,185 : INFO : ===== Loading validation vectors


A


2016-10-07 19:40:12,549 : INFO : Finished getting validation vectors
2016-10-07 19:40:15,571 : INFO : Loaded the model, Doing the prediction now....


B


2016-10-07 19:40:17,195 : INFO : Finished getting validation vectors
2016-10-07 19:40:20,789 : INFO : Loaded the model, Doing the prediction now....


C


2016-10-07 19:40:22,317 : INFO : Finished getting validation vectors
2016-10-07 19:40:25,872 : INFO : Loaded the model, Doing the prediction now....


D


2016-10-07 19:40:27,242 : INFO : Finished getting validation vectors
2016-10-07 19:40:30,310 : INFO : Loaded the model, Doing the prediction now....


E


2016-10-07 19:40:31,879 : INFO : Finished getting validation vectors
2016-10-07 19:40:35,452 : INFO : Loaded the model, Doing the prediction now....


F


2016-10-07 19:40:37,021 : INFO : Finished getting validation vectors
2016-10-07 19:40:40,844 : INFO : Loaded the model, Doing the prediction now....


G


2016-10-07 19:40:42,426 : INFO : Finished getting validation vectors
2016-10-07 19:40:45,643 : INFO : Loaded the model, Doing the prediction now....


H


2016-10-07 19:40:47,146 : INFO : Finished getting validation vectors
2016-10-07 19:40:50,632 : INFO : Loaded the model, Doing the prediction now....


Coverage Error: 6.81005586592, Average No of Labels: 1.35195530726, Top 1: 0.232156273479, Top 3: 0.410593538693, Top 5: 0.620586025545, F1 Micro: 0.257107303532, Total Positive: 7785


2016-10-07 19:40:51,725 : INFO : ****************** Epoch 15 --- Working on doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_15 *******************
2016-10-07 19:40:51,727 : INFO : training model with 20 workers on 104358 vocabulary and 51000 features, using sg=0 hs=1 sample=1e-05 negative=0
2016-10-07 19:40:51,728 : INFO : expecting 8979 sentences, matching count from corpus used for vocabulary survey
Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/gensim/models/word2vec.py", line 756, in worker_loop
    tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages

In [37]:
# Display the current value of the notebook-level `epoch` variable.
# NOTE(review): presumably this is the epoch the training loop had reached when it
# stopped (the log above ends at "Epoch 15") — confirm against the training cell.
epoch

15

In [60]:
 # Persist the in-memory Doc2Vec model under
 # <doc2vec_model_save_location>/<GLOBAL_VARS.MODEL_NAME>/<MODEL_PREFIX>.
 # NOTE(review): the recorded output of this cell is from an earlier session
 # (different date and a different model name than the run logged above), and its
 # execution count is out of order — confirm this cell is still meant to be run here.
 doc2vec_model.save(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX))

2016-09-24 17:07:46,689 : INFO : saving Doc2Vec object under /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_12_type_dm_concat_1_mean_0_trainwords_1_hs_0_neg_10_vocabsize_None_curriter_9/model, separately None
2016-09-24 17:07:46,691 : INFO : storing numpy array 'doctag_syn0' to /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_12_type_dm_concat_1_mean_0_trainwords_1_hs_0_neg_10_vocabsize_None_curriter_9/model.docvecs.doctag_syn0.npy
2016-09-24 17:07:48,636 : INFO : storing numpy array 'syn1neg' to /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_12_type_dm_concat_1_mean_0_trainwords_1_hs_0_neg_10_vocabsize_None_curriter_9/model.syn1neg.npy
2016-09-24 17:13:31,627 : INFO : not storing attribute syn0norm
2016-09-24 17:13:31,629 : INFO : storing numpy array 'syn0' to /big/s/shalaby/parameter_search_doc2vec_models/sample_0.0001/doc2vec_size_3000_w_12_type_dm_concat_1_mean_0_trainwords_1_hs_0_ne

In [39]:
# Display the model name currently stored on GLOBAL_VARS (used to build save/load paths).
GLOBAL_VARS.MODEL_NAME

'doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_15'

## Plot loaded metrics

In [40]:
%matplotlib notebook
graph = MetricsGraph()
graph.init_graph()
print placeholder_model_name + "_" + GLOBAL_VARS.SVM_MODEL_NAME

for epoch in range(1,DOC2VEC_MAX_EPOCHS+1):
    try:
        model_name = placeholder_model_name.format(epoch)
        metrics = pickle.load(open(os.path.join(doc2vec_model_save_location, model_name, GLOBAL_VARS.SVM_MODEL_NAME, METRICS)))
        graph.add_metrics_to_graph(metrics, epoch)
    except IOError:
        break

<IPython.core.display.Javascript object>

doc2vec_size_3000_w_8_type_dm_concat_1_mean_0_trainwords_0_hs_1_neg_0_vocabsize_None_curriter_{}_iter_1000_reg_0.001


In [118]:
# Collect the 'f1_micro' entry of every metrics dict in `epoch_metrics`, in order.
# NOTE(review): `epoch_metrics` is not defined in the cells visible here and this cell's
# execution count is out of sequence — confirm it still runs on a fresh kernel.
[metric['f1_micro'] for metric in epoch_metrics]

[0.54857142857142849,
 0.5620838229533881,
 0.54473738841602659,
 0.34408602150537637,
 0.32672983060579963]