In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random

from multiprocessing.dummy import Pool as ThreadPool
import itertools
from collections import defaultdict

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

import keras
from keras.layers import Input, Dense, Dropout
from keras.models import Model

from thesis.utils.metrics import *

Using gpu device 0: Tesla K40m (CNMeM is disabled, cuDNN 5105)
Using Theano backend.


In [2]:
# Start from a clean root logger: drop every handler that is currently attached (iterating over a copy,
# because removeHandler mutates root.handlers) and then install the notebook's own format at INFO level.
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s') # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
# NOTE(review): not referenced in the visible cells; presumably switches between a sample and the full corpus -- confirm.
IS_SAMPLE = False

In [4]:
# Random seeds, all fixed to the same value. None of them is used by live code in the visible cells
# (DOC2VEC_SEED only appears in commented-out lines of the inference helpers).
SVM_SEED = 1234
DOC2VEC_SEED = 1234
WORD2VEC_SEED = 1234

In [5]:
# Placeholder token values.
# NOTE(review): "inidicator" is misspelled, but these strings are runtime values (presumably matching tokens
# in the already preprocessed corpus) -- do not correct the spelling without checking the data.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
# NOTE(review): MIN_WORD_COUNT and MIN_SIZE are not referenced in the visible cells.
MIN_WORD_COUNT = 100
MIN_SIZE = 0
# Size of the thread pool used for doc2vec inference on the validation documents.
NUM_CORES = 14

In [6]:
# Shared notebook state. The namedtuple class is never instantiated in the visible cells: attributes are
# assigned directly on the class (e.g. GLOBAL_VARS.MODEL_NAME = ...), so it acts as a mutable namespace.
# Reading an attribute before it has been assigned yields the namedtuple field descriptor, not a value.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 
                                         'SVM_MODEL_NAME', 'NN_MODEL_NAME'])

In [7]:
# File / directory names used inside doc2vec_model_save_location.
VOCAB_MODEL = "vocab_model"  # subdirectory created next to the saved models
MODEL_PREFIX = "model"  # file name of a saved doc2vec model inside its epoch directory
VALIDATION_MATRIX = "validation_matrix.pkl"  # cache of the inferred validation vectors for one model
# NOTE(review): METRICS and CLASSIFIER are not referenced in the visible cells.
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"

In [8]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"

# Every input and output path below is derived from this hard-coded root directory.
root_location = "/mnt/data2/shalaby/"
exports_location = root_location + "exported_data/"

# Directory holding the saved doc2vec models; it and its VOCAB_MODEL subdirectory are created below if missing.
doc2vec_model_save_location = os.path.join(root_location, "parameter_search_doc2vec_models_new", "full")
# NOTE(review): nn_parameter_search_location is defined here but neither created nor used in the visible cells.
nn_parameter_search_location = os.path.join(root_location, "nn_parameter_search")
if not os.path.exists(doc2vec_model_save_location):
    os.makedirs(doc2vec_model_save_location)
if not os.path.exists(os.path.join(doc2vec_model_save_location, VOCAB_MODEL)):
    os.makedirs(os.path.join(doc2vec_model_save_location, VOCAB_MODEL))

# Raw corpus file; the eval-based document generators read it one (doc_id, text) line at a time.
training_file = root_location + "docs_output.json"

# Pickled exports (label maps, class lists and train/validation/test document id lists).
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
classification_index_file = exports_location + "classification_index.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

preprocessed_location = root_location + "preprocessed_data/"

# Prefixes of the preprocessed batch files; DocumentBatchGenerator appends a document index to each prefix.
training_preprocessed_files_prefix = preprocessed_location + "training_docs_merged_data_preprocessed-"
training_preprocessed_docids_files_prefix = preprocessed_location + "training_docs_merged_docids_preprocessed-"
validation_preprocessed_files_prefix = preprocessed_location + "validation_docs_merged_data_preprocessed-"
validation_preprocessed_docids_files_prefix = preprocessed_location + "validation_docs_merged_docids_preprocessed-"

training_preprocessed_additional_balanced_file = preprocessed_location + "training_docs_additional_data_preprocessed"
validation_preprocessed_additional_balanced_file = preprocessed_location + "validation_docs_additional_data_preprocessed"

# NOTE(review): the chained assignment also binds the global name `result` to this path; that looks
# unintentional -- confirm nothing relies on `result` before removing it.
word2vec_questions_file = result = root_location + 'tensorflow/word2vec/questions-words.txt'

In [9]:
# Pickled document id lists of the additional ("balanced") training and validation documents.
training_docs_additional_list_file = exports_location + "balanced_additional_training_docs_list.pkl"
validation_docs_additional_list_file = exports_location + "balanced_additional_validation_docs_list.pkl"

In [10]:
%%time
def _load_pickle(path):
    """Unpickle `path`, closing the file handle once the object has been read."""
    # NOTE(review): pickle can execute arbitrary code while loading; only use this on trusted local exports.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Label maps, class lists and document id lists exported by the preprocessing step.
doc_classification_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
classes = _load_pickle(classes_file)
subclasses = _load_pickle(subclasses_file)
valid_classes = _load_pickle(valid_classes_file)
valid_subclasses = _load_pickle(valid_subclasses_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
classifications_index = _load_pickle(classification_index_file)
#test_docs_list = pickle.load(open(test_docs_list_file))

additional_training_docs_list = _load_pickle(training_docs_additional_list_file)
additional_validation_docs_list = _load_pickle(validation_docs_additional_list_file)

CPU times: user 24.4 s, sys: 1.13 s, total: 25.5 s
Wall time: 25.5 s


In [11]:
def ensure_hdfs_location_exists(location):
    """
    Create `location` on HDFS by shelling out to `hdfs dfs -mkdir -p`.

    location: HDFS path to create; it is shell-quoted before being appended to the command line so that
              spaces or shell metacharacters in the path cannot alter the command.

    The exit status of the command is not inspected and nothing is returned.
    """
    # imported locally so the cell is self-contained; shlex.quote does not exist on Python 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    os.system("hdfs dfs -mkdir -p " + quote(location))

def ensure_disk_location_exists(location):
    """
    Make sure `location` exists on the local disk, creating it (and missing parents) when needed.

    location: directory path to create.

    Raises OSError when the directory cannot be created and nothing exists at `location` afterwards.
    """
    try:
        os.makedirs(location)
    except OSError:
        # Creating first and checking afterwards avoids the race of "check, then create": if another
        # process created the path in the meantime (or it already existed) this is not an error.
        if not os.path.exists(location):
            raise

In [12]:
def get_validation_docs_with_inference(doc2vec_model, doc_classification_map):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    The matrix is cached as VALIDATION_MATRIX inside the directory of the current model
    (GLOBAL_VARS.MODEL_NAME); when that file exists it is loaded instead of running inference.

    doc2vec_model: model whose infer_vector is called on every validation document
    doc_classification_map: not used by this function; kept so existing calls keep working

    Returns the matrix of inferred vectors, one row per entry of the global validation_docs_list, in that order.
    """
    matrix_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)
    if os.path.exists(matrix_path):
        info("===== Loading validation vectors")
        with open(matrix_path, 'rb') as matrix_file:
            return pickle.load(matrix_file)

    info("===== Getting validation vectors with inference")

    # do inference and store results in dict
    validation_documents_reps = {}
    documents = ValidationDocumentGenerator(training_file, validation_docs_list)
    for i, (doc_id, doc_contents_array) in enumerate(documents, 1):
        if i % 1000 == 0: info("Finished: {}".format(str(i)))
        validation_documents_reps[doc_id] = doc2vec_model.infer_vector(doc_contents_array)

    # create matrix for the validation vectors, ordered like validation_docs_list
    validation_vectors_matrix = np.array([validation_documents_reps[validation_doc_id]
                                          for validation_doc_id in validation_docs_list])
    with open(matrix_path, 'wb') as matrix_file:
        pickle.dump(validation_vectors_matrix, matrix_file)

    return validation_vectors_matrix

In [13]:
def get_validation_docs_with_inference_new(doc2vec_model, doc_classification_map, classifications, 
                                           val_docs_list, val_preprocessed_files_prefix, val_preprocessed_docids_files_prefix):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    The vectors are cached as VALIDATION_MATRIX inside the directory of the current model
    (GLOBAL_VARS.MODEL_NAME). When the cache exists it is loaded; otherwise the documents are read with
    DocumentBatchGenerator from the two given prefixes, inferred on a thread pool and the matrix is cached.

    doc2vec_model: model whose infer_vector is called on each document
    doc_classification_map: maps a doc id to its classifications
    classifications: the target labels; labels of a document outside this collection are dropped
    val_docs_list: doc ids defining the row order of both returned arrays
    val_preprocessed_files_prefix / val_preprocessed_docids_files_prefix: prefixes of the preprocessed
        token files and of the matching pickled doc id files

    Returns (validation_vectors_matrix, validation_labels); labels are one multi-hot row per document.
    NOTE(review): a cached matrix is assumed to have been built with the same val_docs_list order -- confirm.
    """

    def infer_one_doc(doc_tuple):
        #doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
        doc_id, doc_tokens = doc_tuple
        rep = doc2vec_model.infer_vector(doc_tokens)
        return (doc_id, rep)

    matrix_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)
    if os.path.exists(matrix_path):
        info("===== Loading validation vectors")
        with open(matrix_path, 'rb') as matrix_file:
            validation_vectors_matrix = pickle.load(matrix_file)
    else:
        info("===== Getting validation vectors with inference")

        # Multi-threaded inference, reading from the prefixes passed by the caller
        validation_docs_iterator = DocumentBatchGenerator(val_preprocessed_files_prefix, 
                                                          val_preprocessed_docids_files_prefix, batch_size=None)
        generator_func = iter(validation_docs_iterator)
        validation_documents_reps = {}
        mini_batch_size = 1000
        pool = ThreadPool(NUM_CORES)
        try:
            # map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
            while True:
                threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(generator_func, mini_batch_size))
                info("Finished: {}".format(str(validation_docs_iterator.curr_index)))
                if not threaded_reps_partial:
                    break
                validation_documents_reps.update(threaded_reps_partial)
        finally:
            # release the worker threads even when inference fails
            pool.close()
            pool.join()

        # create matrix for the validation vectors
        validation_vectors_matrix = np.array([validation_documents_reps[validation_doc_id]
                                              for validation_doc_id in val_docs_list])
        with open(matrix_path, 'wb') as matrix_file:
            pickle.dump(validation_vectors_matrix, matrix_file)

    # the labels are never cached, so they are rebuilt the same way for both branches
    one_hot_encoder = OneHotEncoder(classifications)
    eligible_classifications = frozenset(classifications)  # constant-time membership tests
    validation_labels = []
    for validation_doc_id in val_docs_list:
        val_labels = [classf for classf in doc_classification_map[validation_doc_id] if classf in eligible_classifications]
        validation_labels.append(one_hot_encoder.get_label_vector(val_labels))
    validation_labels = np.array(validation_labels)

    return validation_vectors_matrix, validation_labels

In [14]:
def get_validation_docs_with_inference_new_additional(doc2vec_model, doc_classification_map, classifications, 
                                           val_docs_list, val_preprocessed_additional_file, val_preprocessed_additional_docids):
    """
    Use the trained doc2vec model to get the paragraph vector representations of the validation documents

    Loads the cached VALIDATION_MATRIX of the current model (GLOBAL_VARS.MODEL_NAME), infers vectors for the
    additional documents on a thread pool and stacks them (and their labels) below the cached ones.

    doc2vec_model: model whose infer_vector is called on each additional document
    doc_classification_map: maps a doc id to its classifications
    classifications: the target labels; labels of a document outside this collection are dropped
    val_docs_list: doc ids matching the rows of the cached matrix
    val_preprocessed_additional_file: preprocessed token file of the additional documents
    val_preprocessed_additional_docids: doc ids of the additional documents (row order of the added rows)

    Returns (validation_vectors_matrix, validation_labels) covering the cached plus the additional documents.
    Raises IOError when the cached validation matrix does not exist: this function never builds it, so the
    check is made before the expensive inference starts.
    """

    def infer_one_doc(doc_tuple):
        #doc2vec_model.random = np.random.RandomState(DOC2VEC_SEED)
        doc_id, doc_tokens = doc_tuple
        rep = doc2vec_model.infer_vector(doc_tokens)
        return (doc_id, rep)

    def build_labels(doc_ids):
        # one multi-hot label row per doc id, keeping only the target classifications
        return [one_hot_encoder.get_label_vector(
                    [classf for classf in doc_classification_map[doc_id] if classf in eligible_classifications])
                for doc_id in doc_ids]

    matrix_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_MATRIX)
    if not os.path.exists(matrix_path):
        raise IOError("Cached validation matrix not found, create it first: {}".format(matrix_path))

    one_hot_encoder = OneHotEncoder(classifications)
    eligible_classifications = frozenset(classifications)  # constant-time membership tests

    # First load the already computed vectors
    info("===== Loading validation vectors")
    with open(matrix_path, 'rb') as matrix_file:
        validation_vectors_matrix = pickle.load(matrix_file)
    validation_labels = np.array(build_labels(val_docs_list))

    # Now infer the additionals
    info("===== Getting validation vectors with inference")

    # Multi-threaded inference
    validation_docs_iterator = AdditionalBalancedDocumentGenerator(val_preprocessed_additional_file, 
                                                      val_preprocessed_additional_docids, batch_size=None)
    generator_func = iter(validation_docs_iterator)
    validation_documents_reps_additional = {}
    mini_batch_size = 100
    pool = ThreadPool(NUM_CORES)
    try:
        # map consumes the whole iterator on the spot, so we have to use itertools.islice to fake mini-batching
        while True:
            threaded_reps_partial = pool.map(infer_one_doc, itertools.islice(generator_func, mini_batch_size))
            if not threaded_reps_partial:
                break
            validation_documents_reps_additional.update(threaded_reps_partial)
            # report the running total instead of the constant mini batch size
            info("Finished: {}".format(str(len(validation_documents_reps_additional))))
    finally:
        # release the worker threads even when inference fails
        pool.close()
        pool.join()

    # nothing to add: np.vstack cannot stack an empty array below the cached matrix
    if len(val_preprocessed_additional_docids) == 0:
        return validation_vectors_matrix, validation_labels

    # create matrix for the additional validation vectors
    validation_vectors_matrix_additional = np.array([validation_documents_reps_additional[validation_doc_id]
                                                     for validation_doc_id in val_preprocessed_additional_docids])
    validation_labels_additional = np.array(build_labels(val_preprocessed_additional_docids))

    # stack the old validation vectors matrix and labels with the additional ones
    validation_vectors_matrix = np.vstack((validation_vectors_matrix, validation_vectors_matrix_additional))
    validation_labels = np.vstack((validation_labels, validation_labels_additional))

    return validation_vectors_matrix, validation_labels

In [15]:
class OneHotEncoder(object):
    """
    Turns the labels of one instance into a fixed-length 0/1 vector.

    Position i of the vector belongs to the i-th element produced by iterating over `classifications`.
    """
    
    def __init__(self, classifications):
        """
        classifications: sized iterable of hashable labels; its iteration order fixes the vector positions
        """
        self.classifications = classifications
        # label -> position in the output vector
        self.one_hot_indices = {clssf: i for i, clssf in enumerate(classifications)}
    
    def get_label_vector(self, labels):
        """
        labels: iterable with the labels assigned to the instance

        Returns a list of len(classifications) ints with a 1 at the position of every given label.
        Raises KeyError for a label that is not part of `classifications`.
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            output_vector[self.one_hot_indices[label]] = 1
            
        return output_vector

def _get_training_data_for_docs(doc2vec_model, classifications, doc_ids):
    """Shared implementation: build the (vectors, multi-hot labels) arrays for `doc_ids`, in that order."""
    one_hot_encoder = OneHotEncoder(classifications)
    eligible = frozenset(classifications)  # constant-time membership tests
    training_data = []
    training_labels = []
    for doc_id in doc_ids:
        # converting from memmap to a normal array
        training_data.append(list(doc2vec_model.docvecs[doc_id]))
        eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in eligible]
        training_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
    return np.array(training_data), np.array(training_labels)

def get_training_data(doc2vec_model, classifications):
    """
    Collect the trained document vectors and labels of the documents in the global training_docs_list.

    doc2vec_model: model whose docvecs are looked up by doc id
    classifications: the target labels; labels of a document outside this collection are dropped

    Returns (training_data, training_labels) as numpy arrays with one row per document.
    """
    return _get_training_data_for_docs(doc2vec_model, classifications, training_docs_list)

def get_training_data_with_additional(doc2vec_model, classifications):
    """
    Same as get_training_data, but over training_docs_list followed by additional_training_docs_list.

    Returns (training_data, training_labels) as numpy arrays with one row per document.
    """
    return _get_training_data_for_docs(doc2vec_model, classifications,
                                       training_docs_list + additional_training_docs_list)

In [16]:
class TrainingDocumentGenerator(object):
    """
    Re-iterable stream of LabeledSentence objects for the training documents found in `filename`.

    Every non-blank line of the file is evaluated to a (doc_id, text) pair; documents whose id is in
    training_docs_list are tokenized with stemtokenizer and yielded tagged with their doc id.
    """
    def __init__(self, filename, training_docs_list):
        self.filename = filename
        self.training_docs_list = training_docs_list
    def __iter__(self):
        # build the lookup once per pass: testing membership in a list costs a full scan per file line
        wanted_doc_ids = frozenset(self.training_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # NOTE(review): eval executes whatever the line contains; only safe on trusted files
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])
                    
class DocumentBatchGenerator(object):
    """
    Streams preprocessed documents from a series of files named <prefix><index of their first document>.

    Each data file holds one space-separated document per line; the matching docids file is a pickled
    sequence of the doc ids of those lines. When a file has been consumed, the next one is looked up under
    the running document index; iteration ends when no file exists for that index.
    """
    def __init__(self, filename_prefix, filename_docids_prefix, batch_size=10000 ):
        """
        batch_size cant be > 10,000 due to a limitation in doc2vec training, 
        None means no batching (only use for inference)
        """
        # test for None first: ordering None against an int only works by accident on Python 2
        assert batch_size is None or batch_size <= 10000
        self.filename_prefix = filename_prefix
        self.filename_docids_prefix = filename_docids_prefix
        self.curr_lines = []
        self.curr_docids = []
        self.batch_size = batch_size
        self.curr_index = 0
        self.batch_end = -1
    def load_new_batch_in_memory(self):
        """
        Load the data and docids files for self.curr_index into curr_lines / curr_docids.

        Raises StopIteration when one of the two files cannot be opened (taken as "no more batches").
        NOTE(review): line.split(" ") leaves the trailing newline on the last token of every document;
        kept as is because already trained models presumably saw the same tokens -- confirm.
        """
        self.curr_lines, self.curr_docids = [], []
        info("Loading new batch for index: {}".format(self.curr_index) )
        try:
            with open(self.filename_prefix + str(self.curr_index)) as preproc_file:
                for line in preproc_file:
                    self.curr_lines.append(line.split(" "))
            with open(self.filename_docids_prefix + str(self.curr_index), "rb") as docids_file:
                self.curr_docids = pickle.load(docids_file)
            self.batch_end = self.curr_index + len(self.curr_lines) -1 
            info("Finished loading new batch")
        except IOError:
            info("No more batches to load, exiting at index: {}".format(self.curr_index))
            raise StopIteration()
    def __iter__(self):
        """
        Yield (doc_id, tokens) when batch_size is None, otherwise one LabeledSentence per slice of at most
        batch_size tokens. self.curr_index counts the documents consumed so far.
        """
        while True:
            if self.curr_index > self.batch_end:
                try:
                    self.load_new_batch_in_memory()
                except StopIteration:
                    # end the generator explicitly: a StopIteration escaping a generator body becomes
                    # a RuntimeError on newer Python versions (PEP 479)
                    return
            for (doc_id, tokens) in zip(self.curr_docids, self.curr_lines):
                if self.batch_size is not None:
                    curr_batch_iter = 0
                    # divide the document to batches according to the batch size
                    while curr_batch_iter < len(tokens):
                        yield LabeledSentence(words=tokens[curr_batch_iter: curr_batch_iter + self.batch_size], tags=[doc_id])
                        curr_batch_iter += self.batch_size
                else:
                    yield doc_id, tokens
                self.curr_index += 1

class AdditionalBalancedDocumentGenerator(object):
    """
    In-memory document stream over a single preprocessed file (one space-separated document per line).

    The whole file is read on construction. Line k is paired with docids[k] when iterating.
    """
    def __init__(self, filename, docids, batch_size=10000):
        self.filename = filename
        self.curr_docids = docids
        self.batch_size = batch_size
        self.curr_lines = []
        self.num_total_batches = 0
        self.load_file_in_memory()
        
    def load_file_in_memory(self):
        """Append the token list of every line to curr_lines and add its batch count to num_total_batches."""
        with open(self.filename) as preproc_file:
            for raw_line in preproc_file:
                words = raw_line.split(" ")
                self.curr_lines.append(words)
                # get the number of total batches as this will be used when training for Doc2vec to calculate the learning rate decay (which we dont do, but is still needed)
                if self.batch_size is None:
                    batches_for_line = 1
                else:
                    batches_for_line = math.ceil(float(len(words))/ self.batch_size)
                self.num_total_batches += batches_for_line
                
    def __iter__(self):
        """
        Yield (doc_id, tokens) when batch_size is None, otherwise one LabeledSentence per slice of at most
        batch_size tokens.
        """
        for (doc_id, words) in zip(self.curr_docids, self.curr_lines):
            if self.batch_size is None:
                yield doc_id, words
                continue
            # divide the document to batches according to the batch size
            offset = 0
            while offset < len(words):
                yield LabeledSentence(words=words[offset: offset + self.batch_size], tags=[doc_id])
                offset += self.batch_size
                
class Word2VecTrainingDocumentGenerator(object):
    """
    Re-iterable stream of token lists for the training documents found in `filename`.

    Every non-blank line of the file is evaluated to a (doc_id, text) pair; for documents whose id is in
    training_docs_list the result of stemtokenizer(text) is yielded.
    """
    def __init__(self, filename, training_docs_list):
        self.filename = filename
        self.training_docs_list = training_docs_list
    def __iter__(self):
        # build the lookup once per pass: testing membership in a list costs a full scan per file line
        wanted_doc_ids = frozenset(self.training_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # NOTE(review): eval executes whatever the line contains; only safe on trusted files
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield stemtokenizer(text)
                
class ValidationDocumentGenerator(object):
    """
    Re-iterable stream of (doc_id, tokens) pairs for the validation documents found in `filename`.

    Every non-blank line of the file is evaluated to a (doc_id, text) pair; for documents whose id is in
    validation_docs_list the doc id and stemtokenizer(text) are yielded.
    """
    def __init__(self, filename, validation_docs_list):
        self.filename = filename
        self.validation_docs_list = validation_docs_list
    def __iter__(self):
        # build the lookup once per pass: testing membership in a list costs a full scan per file line
        wanted_doc_ids = frozenset(self.validation_docs_list)
        with open(self.filename) as file_obj:
            for line in file_obj:
                if not line.strip(): continue
                # NOTE(review): eval executes whatever the line contains; only safe on trusted files
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield doc_id, stemtokenizer(text)
                    
class StochasticDocumentGenerator(object):
    """
    Randomly shuffle rows while reading them

    line_positions maps a line key to the file offset at which that line starts. The keys not yet visited
    are kept in self.lines and removed as they are read, so an instance yields its documents only once.
    """
    def __init__(self, filename, training_docs_list, line_positions):
        self.filename = filename
        self.training_docs_list = training_docs_list
        self.line_positions = line_positions
        self.lines = set(line_positions.keys())
    def __iter__(self):
        # build the lookup once per pass: testing membership in a list costs a full scan per file line
        wanted_doc_ids = frozenset(self.training_docs_list)
        # One shuffle gives a uniformly random visiting order. Drawing with random.sample from the set
        # copied the whole set on every draw and is rejected for sets by Python 3.11+.
        visiting_order = list(self.lines)
        random.shuffle(visiting_order)
        with open(self.filename) as file_obj:
            for random_line in visiting_order:
                self.lines.remove(random_line)
                file_obj.seek(self.line_positions[random_line])
                line = file_obj.readline()
                if not line.strip(): continue
                # NOTE(review): eval executes whatever the line contains; only safe on trusted files
                (doc_id, text) = eval(line)
                if doc_id in wanted_doc_ids:
                    yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])


In [17]:
class ClassificationMetricsCallback(keras.callbacks.Callback):
    """
    Keras callback that remembers the model weights of the epoch with the lowest validation loss.

    After training, best_weights holds those weights (None if no epoch improved on the initial value),
    best_val_loss the matching loss and val_loss_reductions the number of improvements seen.
    """
    
    # NOTE(review): not referenced anywhere in this class
    EPOCHS_BEFORE_VALIDATION = 10
    
    def on_train_begin(self, logs=None):
        """Reset the bookkeeping at the start of every fit call."""
        self.epoch_index = 0
        self.val_loss_reductions = 0
        # any real loss is lower than infinity, so the first epoch always counts as an improvement
        self.best_val_loss = float('inf')
        self.best_weights = None
    def on_epoch_end(self, epoch, logs=None):
        """
        Record the weights when logs['val_loss'] is lower than the best seen so far.

        Raises KeyError when the logs carry no 'val_loss' (i.e. fit was called without validation data).
        """
        logs = logs or {}
        self.epoch_index += 1
        if logs['val_loss'] < self.best_val_loss:
            self.val_loss_reductions += 1
            self.best_val_loss = logs['val_loss']
            self.best_weights = self.model.get_weights()
            print('\r    \r') # to remove the previous line of verbose output of model fit
            time.sleep(0.2)
            info('Found lower val loss for epoch {} => {}'.format(self.epoch_index, round(logs['val_loss'], 5)))

            
def create_keras_nn_model(input_size, output_size, 
                          first_hidden_layer_size, first_hidden_layer_activation, 
                          second_hidden_layer_size, second_hidden_layer_activation, 
                          input_dropout_do, hidden_dropout_do):
    """
    Build and compile a feed-forward network with one or two hidden layers.

    input_size: length of the input vectors
    output_size: number of output neurons; each gets a sigmoid activation
    first_hidden_layer_size / first_hidden_layer_activation: size and activation of the first hidden layer
    second_hidden_layer_size / second_hidden_layer_activation: same for the optional second hidden layer;
        a size of None leaves the second layer out
    input_dropout_do: when true, Dropout(0.7) is applied to the input
    hidden_dropout_do: when true, Dropout(0.5) is applied after the first hidden layer

    Returns the model compiled with the rmsprop optimizer and binary cross-entropy loss.
    """
    
    # use the size handed in by the caller instead of the global DOC2VEC_SIZE
    doc_input = Input(shape=(input_size,), name='doc_input')
    hidden = doc_input
    if input_dropout_do:
        hidden = Dropout(0.7)(hidden)
    hidden = Dense(first_hidden_layer_size, activation=first_hidden_layer_activation, 
                   name='hidden_layer_{}'.format(first_hidden_layer_activation))(hidden)
    if hidden_dropout_do:
        hidden = Dropout(0.5)(hidden)
    if second_hidden_layer_size is not None:
        hidden = Dense(second_hidden_layer_size, activation=second_hidden_layer_activation, 
                       name='hidden_layer2_{}'.format(second_hidden_layer_activation))(hidden)
    # the layer keeps its historical name although the activation is a sigmoid
    softmax_output = Dense(output_size, activation='sigmoid', name='softmax_output')(hidden)

    model = Model(input=doc_input, output=softmax_output)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy')
    
    return model


def get_binary_0_5(x):
    """
    Threshold values at 0.5: 1 where the value is strictly greater than 0.5, otherwise 0.

    x: scalar or array-like of numbers
    Returns an integer numpy array (numpy integer for a scalar) with the shape of the input; an empty
    input gives an empty result.
    """
    # one vectorised comparison instead of calling a Python function per element
    return (np.asarray(x) > 0.5).astype(int)

In [18]:
# Doc2vec hyper-parameters. They are encoded in the model directory name built in the next cell and thereby
# select which saved model is loaded; DOC2VEC_SIZE is also passed as the input size of the neural network.
DOC2VEC_SIZE = 100
DOC2VEC_WINDOW = 8
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-3
DOC2VEC_TYPE = 0  # 1 is named 'dm' in the model name, anything else 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 1
DOC2VEC_MEAN = 0
DOC2VEC_TRAIN_WORDS = 0
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 20
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 10000 # report vocab progress every x documents

In [19]:
# Directory name of the doc2vec model, derived from the hyper-parameters above.
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(DOC2VEC_SIZE, 
                                                                DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE))
GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
# From here on the name is a template: the trailing "epoch_{}" is filled in with .format(epoch) when loading.
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
placeholder_model_name

'doc2vec_size_100_w_8_type_pv-dbow_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_{}'

In [20]:
# Target label set for this run and the name used as prefix of the metrics file names.
classifications = valid_subclasses
classifications_type = 'subclasses'

In [21]:
# Output file names, prefixed with the classification level chosen above.
# NOTE(review): none of these names is used in the visible cells.
VALIDATION_METRICS_FILENAME= '{}_validation_metrics.pkl'.format(classifications_type)
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'.format(classifications_type)
METRICS_FIG_PNG_FILENAME = '{}_validation_metrics.png'.format(classifications_type)
METRICS_FIG_PDF_FILENAME = '{}_validation_metrics.pdf'.format(classifications_type)
WORD2VEC_METRICS_FILENAME = 'word2vec_metrics.pkl'

### Load Doc2vec Model

In [22]:
epoch = 7
GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
info("****************** Epoch {} --- Loading {} *******************".format(epoch, GLOBAL_VARS.MODEL_NAME))

# This notebook only loads an already trained model. Fail with a clear message when the file is missing
# instead of continuing to a NameError on doc2vec_model further down.
doc2vec_model_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX)
if not os.path.exists(doc2vec_model_path):
    raise IOError("No saved doc2vec model found at {}".format(doc2vec_model_path))
doc2vec_model = Doc2Vec.load(doc2vec_model_path)
GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

# Training vectors come from the model's stored docvecs
info('Getting training Data')
X, y = get_training_data(doc2vec_model, classifications)

# Validation vectors are loaded from the cached matrix or inferred with the model
info('Getting Validation Embeddings')
Xv, yv = get_validation_docs_with_inference_new(doc2vec_model, doc_classification_map, classifications, 
                                                validation_docs_list, validation_preprocessed_files_prefix,
                                                validation_preprocessed_docids_files_prefix)

2017-01-28 01:02:35,375 : INFO : ****************** Epoch 7 --- Loading doc2vec_size_100_w_8_type_pv-dbow_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7 *******************
2017-01-28 01:02:35,380 : INFO : loading Doc2Vec object from /mnt/data2/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_100_w_8_type_pv-dbow_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model
2017-01-28 01:02:43,217 : INFO : loading docvecs recursively from /mnt/data2/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_100_w_8_type_pv-dbow_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model.docvecs.* with mmap=None
2017-01-28 01:02:43,219 : INFO : loading doctag_syn0 from /mnt/data2/shalaby/parameter_search_doc2vec_models_new/full/doc2vec_size_100_w_8_type_pv-dbow_concat_1_mean_0_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_7/model.docvecs.doctag_syn0.npy with mmap=None
2017-01-28 01:02:44,333 : INFO : loading syn1neg from /mnt/data2/shal

### NN Parameters

In [23]:
# One output neuron per target classification.
NN_OUTPUT_NEURONS = len(classifications)

# Settings of the keras EarlyStopping callback that monitors val_loss.
EARLY_STOPPER_MIN_DELTA = 0.00001
EARLY_STOPPER_PATIENCE = 5

# Upper bound on training epochs and the mini batch size passed to model.fit.
NN_MAX_EPOCHS = 100
NN_BATCH_SIZE = 1024
# NOTE(review): the two random search settings are not referenced in the visible cells.
NN_RANDOM_SEARCH_BUDGET = 20
NN_PARAM_SAMPLE_SEED = 1234

In [24]:
# start_time is used at the end of the cell to compute `duration` for the whole build/fit/evaluate run
start_time = time.time()
# Network configuration for this run
first_hidden_layer_size = 200
first_hidden_layer_activation = 'tanh'
second_hidden_layer_size = 1000
second_hidden_layer_activation = 'relu'
input_dropout_do = False
hidden_dropout_do = True

#     print "===================================================================================\n" + \
#           "========== 1st Layer Size: {}, 1st Layer Activation: {}, \n 2nd Layer Size: {}, 2nd Layer Activation: {}, \n" + \
#           "Input Dropout: {}, Hidden Dropout: {} \n" + \
#           "==========================".format(first_hidden_layer_size, first_hidden_layer_activation, 
#                                                 second_hidden_layer_size, second_hidden_layer_activation, 
#                                                 input_dropout_do, hidden_dropout_do)

# The model name encodes the configuration above
GLOBAL_VARS.NN_MODEL_NAME = 'nn_1st-size_{}_1st-act_{}_2nd-size_{}_2nd-act_{}_in-drop_{}_hid-drop_{}'.format(
    first_hidden_layer_size, first_hidden_layer_activation, second_hidden_layer_size, 
    second_hidden_layer_activation, input_dropout_do, hidden_dropout_do
)

info('***************************************************************************************')
info(GLOBAL_VARS.NN_MODEL_NAME)

model = create_keras_nn_model(DOC2VEC_SIZE, NN_OUTPUT_NEURONS, 
                              first_hidden_layer_size, first_hidden_layer_activation, 
                              second_hidden_layer_size, second_hidden_layer_activation, 
                              input_dropout_do, hidden_dropout_do)
model.summary()

# Two callbacks: early_stopper ends training once val_loss stops improving by at least min_delta for
# `patience` epochs; metrics_callback keeps the weights of the epoch with the lowest val_loss.
early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=EARLY_STOPPER_MIN_DELTA, \
                                              patience=EARLY_STOPPER_PATIENCE, verbose=1, mode='auto')
metrics_callback = ClassificationMetricsCallback()

# Model Fitting: train on (X, y), validating on (Xv, yv) after every epoch; %time reports the wall time.
# NOTE(review): `nb_epoch` is presumably the Keras 1.x spelling of the epochs argument -- confirm before upgrading Keras.
%time history = model.fit(x=X, y=y, validation_data=(Xv,yv), batch_size=NN_BATCH_SIZE, \
                          nb_epoch=NN_MAX_EPOCHS, verbose=1, callbacks=[early_stopper, metrics_callback])

# info('Evaluating on Training Data')
# yp = model.predict(X, batch_size=NN_BATCH_SIZE)
# yp_binary = get_binary_0_5(yp)
# #print yp
# info('Generating Training Metrics')
# training_metrics = get_metrics(y, yp, yp_binary)
# print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
#     training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
#     training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
#     training_metrics['f1_micro'],training_metrics['f1_macro'],  training_metrics['total_positive'])

def _evaluate_on_validation(nn_model, weights_description):
    """
    Predict on the validation set (Xv, yv) with the current weights of nn_model, then log and print the metrics.

    weights_description: text appended to the 'Evaluating on Validation Data using ' log message
    Returns (predicted scores, scores thresholded at 0.5, metrics dict from get_metrics).
    """
    info('Evaluating on Validation Data using ' + weights_description)
    predicted = nn_model.predict(Xv)
    predicted_binary = get_binary_0_5(predicted)
    info('Generating Validation Metrics')
    metrics = get_metrics(yv, predicted, predicted_binary)
    print("****** Validation Metrics: Cov Err: {:.3f} | Top 3: {:.3f} | Top 5: {:.3f} | F1 Micro: {:.3f} | F1 Macro: {:.3f}".format(
        metrics['coverage_error'], metrics['top_3'], metrics['top_5'], 
        metrics['f1_micro'], metrics['f1_macro']))
    return predicted, predicted_binary, metrics

# metrics of the weights the model ended training with
yvp, yvp_binary, validation_metrics = _evaluate_on_validation(model, 'last weights')
last_validation_metrics = validation_metrics

# using the recorded weights of the best recorded validation loss
last_model_weights = model.get_weights()
if metrics_callback.best_weights is not None:
    model.set_weights(metrics_callback.best_weights)
else:
    # no epoch improved on the initial loss, so there is nothing to restore; keep the last weights
    info('No best weights were recorded, evaluating the last weights again')
yvp, yvp_binary, validation_metrics = _evaluate_on_validation(model, 'saved best weights')
best_validation_metrics = validation_metrics

duration = time.time() - start_time


2017-01-28 01:06:08,323 : INFO : ***************************************************************************************
2017-01-28 01:06:08,324 : INFO : nn_1st-size_200_1st-act_tanh_2nd-size_1000_2nd-act_relu_in-drop_False_hid-drop_True


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
doc_input (InputLayer)           (None, 100)           0                                            
____________________________________________________________________________________________________
hidden_layer_tanh (Dense)        (None, 200)           20200       doc_input[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 200)           0           hidden_layer_tanh[0][0]          
____________________________________________________________________________________________________
hidden_layer2_relu (Dense)       (None, 1000)          201000      dropout_1[0][0]                  
___________________________________________________________________________________________

2017-01-28 01:06:46,172 : INFO : Found lower val loss for epoch 1 => 0.00449


Epoch 2/100
    


2017-01-28 01:07:20,908 : INFO : Found lower val loss for epoch 2 => 0.00428


Epoch 3/100
    


2017-01-28 01:07:54,956 : INFO : Found lower val loss for epoch 3 => 0.00414


Epoch 4/100
    


2017-01-28 01:08:30,419 : INFO : Found lower val loss for epoch 4 => 0.004


Epoch 5/100
    


2017-01-28 01:09:03,391 : INFO : Found lower val loss for epoch 5 => 0.00396


Epoch 6/100
    


2017-01-28 01:09:37,720 : INFO : Found lower val loss for epoch 6 => 0.00396


Epoch 7/100
    


2017-01-28 01:10:12,590 : INFO : Found lower val loss for epoch 7 => 0.00394


Epoch 8/100
Epoch 9/100
    


2017-01-28 01:11:21,594 : INFO : Found lower val loss for epoch 9 => 0.00383


Epoch 10/100
Epoch 11/100
Epoch 12/100
    


2017-01-28 01:13:06,778 : INFO : Found lower val loss for epoch 12 => 0.00382


Epoch 13/100
Epoch 14/100
Epoch 15/100

2017-01-28 01:14:47,014 : INFO : Evaluating on Validation Data using last weights



Epoch 00014: early stopping
CPU times: user 3min 55s, sys: 5min 54s, total: 9min 50s
Wall time: 8min 37s


2017-01-28 01:15:53,630 : INFO : Generating Validation Metrics
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
2017-01-28 01:23:12,922 : INFO : Evaluating on Validation Data using saved best weights


****** Validation Metrics: Cov Err: 8.777 | Top 3: 0.700 | Top 5: 0.785 | F1 Micro: 0.456 | F1 Macro: 0.069


2017-01-28 01:24:22,307 : INFO : Generating Validation Metrics


****** Validation Metrics: Cov Err: 8.728 | Top 3: 0.702 | Top 5: 0.787 | F1 Micro: 0.463 | F1 Macro: 0.071


In [25]:
# Iterator over the additional (balanced) preprocessed training documents; it is
# passed as `sentences` to doc2vec_model.train in a later cell.
# NOTE(review): batch_size=10000 is hard-coded here -- consider a named constant
# next to the other configuration values.
additional_iterator = AdditionalBalancedDocumentGenerator(training_preprocessed_additional_balanced_file, 
                                                          additional_training_docs_list, batch_size=10000)

In [26]:
# Sanity check: number of entries the generator currently holds in `curr_lines`.
len(additional_iterator.curr_lines)

783

In [27]:
# Sanity check: batch count reported by the generator.
# NOTE(review): this value is later passed as `total_examples` to
# doc2vec_model.train -- confirm that a batch count is what is intended there.
additional_iterator.num_total_batches

969.0

In [28]:
# Show the learning rate the doc2vec model currently carries, before the next
# cell resets it.
doc2vec_model.alpha

0.017999999999999995

In [29]:
# Continue training the existing doc2vec model on the additional documents.
# The learning rate is reset to 0.025 and min_alpha is pinned to the same value,
# so the rate stays constant within each train() call; it is lowered by 0.001
# after each of the 8 passes.
# NOTE(review): this cell mutates doc2vec_model in place, so re-running it trains
# the model further rather than reproducing the same state.
# NOTE(review): total_examples receives num_total_batches (a batch count) --
# verify against AdditionalBalancedDocumentGenerator that this matches the number
# of examples the iterator actually yields, since gensim uses it for progress
# reporting.
doc2vec_model.alpha = 0.025
doc2vec_model.min_alpha = 0.025
for i in range(8):
    doc2vec_model.train(sentences=additional_iterator, total_examples=additional_iterator.num_total_batches, report_delay=REPORT_DELAY)
    doc2vec_model.alpha -= 0.001  # decrease the learning rate
    doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay

2017-01-28 01:32:45,762 : INFO : training model with 20 workers on 391521 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=10
2017-01-28 01:32:47,354 : INFO : PROGRESS: at 0.10% examples, 5583 words/s, in_qsize 2, out_qsize 3
2017-01-28 01:32:56,283 : INFO : worker thread finished; awaiting finish of 19 more threads
2017-01-28 01:32:56,476 : INFO : worker thread finished; awaiting finish of 18 more threads
2017-01-28 01:32:56,489 : INFO : worker thread finished; awaiting finish of 17 more threads
2017-01-28 01:32:56,490 : INFO : worker thread finished; awaiting finish of 16 more threads
2017-01-28 01:32:56,491 : INFO : worker thread finished; awaiting finish of 15 more threads
2017-01-28 01:32:56,492 : INFO : worker thread finished; awaiting finish of 14 more threads
2017-01-28 01:32:56,492 : INFO : worker thread finished; awaiting finish of 13 more threads
2017-01-28 01:32:56,493 : INFO : worker thread finished; awaiting finish of 12 more threads
2017-01-28 01:32:56,

### Now get the augmented training and validation data

In [30]:
# Rebuild the classifier inputs from the current doc2vec model: first the
# training matrices, then the validation matrices, both through the
# "additional" helper variants defined in earlier cells.
info('Getting training Data with additionals')
X_add, y_add = get_training_data_with_additional(doc2vec_model, classifications)

info('Getting Validation Embeddings with additionals')
Xv_add, yv_add = get_validation_docs_with_inference_new_additional(
    doc2vec_model,
    doc_classification_map,
    classifications,
    validation_docs_list,
    validation_docs_additional_list_file,
    additional_validation_docs_list,
)

2017-01-28 01:33:16,112 : INFO : Getting training Data with additionals
2017-01-28 01:37:04,459 : INFO : Getting Validation Embeddings with additionals
2017-01-28 01:37:04,465 : INFO : ===== Loading validation vectors
2017-01-28 01:37:51,215 : INFO : ===== Getting validation vectors with inference
2017-01-28 01:37:51,252 : INFO : Finished: 100
2017-01-28 01:37:51,254 : INFO : Finished: 100


In [31]:
# Build a new network with the same hyper-parameters as the previous run and
# print its layer table; only the training/validation arrays differ (the *_add
# variants produced by the preceding cell).
model = create_keras_nn_model(DOC2VEC_SIZE, NN_OUTPUT_NEURONS, 
                              first_hidden_layer_size, first_hidden_layer_activation, 
                              second_hidden_layer_size, second_hidden_layer_activation, 
                              input_dropout_do, hidden_dropout_do)
model.summary()

# Stop training when val_loss has not improved by at least EARLY_STOPPER_MIN_DELTA
# for EARLY_STOPPER_PATIENCE epochs.
early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=EARLY_STOPPER_MIN_DELTA, \
                                              patience=EARLY_STOPPER_PATIENCE, verbose=1, mode='auto')
# Fresh callback instance; the evaluation cell that follows reads its
# `best_weights` attribute after training.
metrics_callback = ClassificationMetricsCallback()

# Model Fitting
# The %time line magic reports CPU and wall-clock time of the fit; `history`
# still receives the return value of model.fit. Keep the statement on one
# logical line (backslash continuation) so the magic sees the whole call.
%time history = model.fit(x=X_add, y=y_add, validation_data=(Xv_add,yv_add), batch_size=NN_BATCH_SIZE, \
                          nb_epoch=NN_MAX_EPOCHS, verbose=1, callbacks=[early_stopper, metrics_callback])



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
doc_input (InputLayer)           (None, 100)           0                                            
____________________________________________________________________________________________________
hidden_layer_tanh (Dense)        (None, 200)           20200       doc_input[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 200)           0           hidden_layer_tanh[0][0]          
____________________________________________________________________________________________________
hidden_layer2_relu (Dense)       (None, 1000)          201000      dropout_2[0][0]                  
___________________________________________________________________________________________

2017-01-28 01:38:53,657 : INFO : Found lower val loss for epoch 1 => 0.00438


Epoch 2/100
    


2017-01-28 01:39:49,046 : INFO : Found lower val loss for epoch 2 => 0.00424


Epoch 3/100
    


2017-01-28 01:40:36,238 : INFO : Found lower val loss for epoch 3 => 0.00411


Epoch 4/100
    


2017-01-28 01:41:30,147 : INFO : Found lower val loss for epoch 4 => 0.00402


Epoch 5/100
    


2017-01-28 01:42:25,027 : INFO : Found lower val loss for epoch 5 => 0.00394


Epoch 6/100
Epoch 7/100
    


2017-01-28 01:44:14,166 : INFO : Found lower val loss for epoch 7 => 0.00391


Epoch 8/100
    


2017-01-28 01:45:09,883 : INFO : Found lower val loss for epoch 8 => 0.00389


Epoch 9/100
    


2017-01-28 01:46:04,332 : INFO : Found lower val loss for epoch 9 => 0.00387


Epoch 10/100
    


2017-01-28 01:47:02,648 : INFO : Found lower val loss for epoch 10 => 0.00387


Epoch 11/100
    


2017-01-28 01:47:56,794 : INFO : Found lower val loss for epoch 11 => 0.00386


Epoch 12/100
    


2017-01-28 01:48:52,095 : INFO : Found lower val loss for epoch 12 => 0.00385


Epoch 13/100
    


2017-01-28 01:49:48,526 : INFO : Found lower val loss for epoch 13 => 0.00382


Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
    


2017-01-28 01:53:02,283 : INFO : Found lower val loss for epoch 17 => 0.00382


Epoch 18/100
Epoch 19/100
    


2017-01-28 01:54:44,082 : INFO : Found lower val loss for epoch 19 => 0.00381


Epoch 00018: early stopping
CPU times: user 6min 26s, sys: 10min 32s, total: 16min 59s
Wall time: 16min 50s


In [32]:

# --- Pass 1: score the augmented validation set with the weights the model
# holds after fit() returned (i.e. the weights of the last epoch that ran).
info('Evaluating on Validation Data using last weights')
yvp = model.predict(Xv_add)
# NOTE(review): get_binary_0_5 comes from the star import of thesis.utils.metrics;
# presumably it thresholds the predicted scores at 0.5 -- confirm in that module.
yvp_binary = get_binary_0_5(yvp)
#print yvp
info('Generating Validation Metrics')
validation_metrics = get_metrics(yv_add, yvp, yvp_binary)
print "****** Validation Metrics: Cov Err: {:.3f} | Top 3: {:.3f} | Top 5: {:.3f} | F1 Micro: {:.3f} | F1 Macro: {:.3f}".format(
    validation_metrics['coverage_error'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'])
last_validation_metrics = validation_metrics

# --- Pass 2: repeat the evaluation with the weights the metrics callback stored
# for the best (lowest) validation loss it recorded.
# Keep a copy of the final weights first, because set_weights() below overwrites
# them; the model is left holding the best weights when this cell finishes.
last_model_weights = model.get_weights()
info('Evaluating on Validation Data using saved best weights')
model.set_weights(metrics_callback.best_weights)
# yvp, yvp_binary and validation_metrics are reused here and overwrite the
# pass-1 values; only last_validation_metrics keeps the pass-1 result.
yvp = model.predict(Xv_add)
yvp_binary = get_binary_0_5(yvp)
#print yvp
info('Generating Validation Metrics')
validation_metrics = get_metrics(yv_add, yvp, yvp_binary)
print "****** Validation Metrics: Cov Err: {:.3f} | Top 3: {:.3f} | Top 5: {:.3f} | F1 Micro: {:.3f} | F1 Macro: {:.3f}".format(
    validation_metrics['coverage_error'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'])
best_validation_metrics = validation_metrics


2017-01-28 01:54:44,133 : INFO : Evaluating on Validation Data using last weights
2017-01-28 01:56:03,888 : INFO : Generating Validation Metrics
2017-01-28 02:04:58,356 : INFO : Evaluating on Validation Data using saved best weights


****** Validation Metrics: Cov Err: 8.808 | Top 3: 0.701 | Top 5: 0.786 | F1 Micro: 0.457 | F1 Macro: 0.068


2017-01-28 02:06:14,680 : INFO : Generating Validation Metrics


****** Validation Metrics: Cov Err: 8.808 | Top 3: 0.701 | Top 5: 0.786 | F1 Micro: 0.457 | F1 Macro: 0.068


In [99]:
# Re-print the stored summaries: first the metrics obtained with the last-epoch
# weights, then those obtained with the best-validation-loss weights. After the
# loop `validation_metrics` refers to best_validation_metrics.
# NOTE(review): this cell was executed out of order (In [99]) and its saved
# output differs from the evaluation cell above it, so the displayed numbers
# come from a later kernel state -- re-run top to bottom before relying on them.
for validation_metrics in (last_validation_metrics, best_validation_metrics):
    print("****** Validation Metrics: Cov Err: {:.3f} | Top 3: {:.3f} | Top 5: {:.3f} | F1 Micro: {:.3f} | F1 Macro: {:.3f}".format(
        validation_metrics['coverage_error'],
        validation_metrics['top_3'],
        validation_metrics['top_5'],
        validation_metrics['f1_micro'],
        validation_metrics['f1_macro'],
    ))

****** Validation Metrics: Cov Err: 8.691 | Top 3: 0.702 | Top 5: 0.788 | F1 Micro: 0.468 | F1 Macro: 0.072
****** Validation Metrics: Cov Err: 8.691 | Top 3: 0.702 | Top 5: 0.788 | F1 Micro: 0.468 | F1 Macro: 0.072
