# SVM on Extended PV Doc2vec vectors

In [23]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *
from thesis.utils.file import *

In [2]:
# Start from a clean root logger: detach every handler that is already
# attached (iterating over a copy, since removeHandler mutates the list).
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)
# With no handlers left on the root logger, basicConfig installs a default
# StreamHandler using the timestamped format below.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

In [3]:
# Module-level switch; not referenced anywhere in the visible code.
# NOTE(review): presumably toggles running on a reduced sample dataset -- confirm.
IS_SAMPLE = False

In [4]:
# Fixed random seeds.
# SVM_SEED is passed as `random_state` to the SGDClassifier in the training loop.
SVM_SEED = 1234
# DOC2VEC_SEED / WORD2VEC_SEED are not referenced in the visible code.
# NOTE(review): presumably consumed where the embeddings are trained -- confirm.
DOC2VEC_SEED = 1234
WORD2VEC_SEED = 1234

In [5]:
# Not referenced in the visible code (the classifier below is built with n_jobs=1).
# NOTE(review): presumably a worker count for parallel steps -- confirm.
NUM_CORES = 8

In [6]:
# Namedtuple *class* used as a mutable namespace for run-wide names: the code
# in this file never instantiates it, it assigns attributes directly on the
# class (MODEL_NAME, DOC2VEC_MODEL_NAME and SVM_MODEL_NAME are set in the
# training loop and MODEL_NAME is read by the data-loading helpers).
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 
                                         'SVM_MODEL_NAME', 'NN_MODEL_NAME'])

In [20]:
# File-name constants and templates.
# VOCAB_MODEL, MODEL_PREFIX and VALIDATION_MATRIX are not referenced in the
# visible code.
VOCAB_MODEL = "vocab_model"
MODEL_PREFIX = "model"
VALIDATION_MATRIX = "validation_matrix.pkl"
# Formatted with the classifications type (e.g. 'sections') to name the
# pickled classifier.
TYPE_CLASSIFIER= "{}_classifier.pkl"

# Matrix file templates: the data matrices are formatted with the parts level,
# the label matrices with the classifications type.
TRAINING_DATA_MATRIX = "X_level_{}.npy"
TRAINING_LABELS_MATRIX = "y_{}.npy"
VALIDATION_DATA_MATRIX = "Xv_level_{}.npy"
VALIDATION_LABELS_MATRIX = "yv_{}.npy"

In [8]:
# Base directory for everything this notebook reads or writes.
root_location = "/home/local/shalaby/"

# Sub-directories (the trailing slash is kept, so the joined values are the
# same strings that plain concatenation would produce).
exports_location = os.path.join(root_location, "exported_data/")
matrices_save_location = os.path.join(root_location, "extended_pv_matrices/")
doc2vec_results_location = os.path.join(root_location, "extended_pv_doc2vec_svm/")

# Pickled inputs living in the exports directory.
doc_classifications_map_file = os.path.join(exports_location, "doc_classification_map.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
classes_file = os.path.join(exports_location, "classes.pkl")
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_output = os.path.join(exports_location, "classifications.pkl")
training_docs_list_file = os.path.join(exports_location, "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location, "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

In [9]:
%%time
def _load_pickle(path):
    """Return the object unpickled from the file at `path`.

    The file is opened in binary mode and closed as soon as loading finishes,
    instead of leaving an unclosed text-mode handle behind.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # files produced by this project.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Classification metadata and the document id lists for each data split.
doc_classification_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
classes = _load_pickle(classes_file)
subclasses = _load_pickle(subclasses_file)
valid_classes = _load_pickle(valid_classes_file)
valid_subclasses = _load_pickle(valid_subclasses_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
test_docs_list = _load_pickle(test_docs_list_file)

CPU times: user 27.8 s, sys: 1.89 s, total: 29.7 s
Wall time: 29.7 s


In [10]:
# Display the number of entries in the training document list.
len(training_docs_list)

1286325

In [11]:
# Display the number of entries in the validation document list.
len(validation_docs_list)

321473

## Utility functions for data loading

In [12]:
def get_training_data(classifications_type, level):
    """Load the training data matrix and its label matrix from disk.

    Both files are read from
    ``matrices_save_location/<GLOBAL_VARS.MODEL_NAME>/``, so
    ``GLOBAL_VARS.MODEL_NAME`` must be set before calling.

    Args:
        classifications_type: value formatted into TRAINING_LABELS_MATRIX to
            pick the label file (e.g. 'sections').
        level: value formatted into TRAINING_DATA_MATRIX to pick the data file.

    Returns:
        Tuple ``(training_data, training_labels)`` as returned by ``np.load``.
    """
    info("Loading Training Data from file")
    matrices_dir = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
    data_path = os.path.join(matrices_dir, TRAINING_DATA_MATRIX.format(level))
    labels_path = os.path.join(matrices_dir,
                               TRAINING_LABELS_MATRIX.format(classifications_type))
    # Binary mode plus `with` so each handle is closed once the array is read.
    with open(data_path, 'rb') as data_file:
        training_data = np.load(data_file)
    with open(labels_path, 'rb') as labels_file:
        training_labels = np.load(labels_file)
    return training_data, training_labels

def get_validation_data(classifications_type, level):
    """Load the validation data matrix and its label matrix from disk.

    Both files are read from
    ``matrices_save_location/<GLOBAL_VARS.MODEL_NAME>/``, so
    ``GLOBAL_VARS.MODEL_NAME`` must be set before calling.

    Args:
        classifications_type: value formatted into VALIDATION_LABELS_MATRIX to
            pick the label file (e.g. 'sections').
        level: value formatted into VALIDATION_DATA_MATRIX to pick the data
            file.

    Returns:
        Tuple ``(validation_data, validation_labels)`` as returned by
        ``np.load``.
    """
    info("Loading Validation Data from file")
    matrices_dir = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
    data_path = os.path.join(matrices_dir, VALIDATION_DATA_MATRIX.format(level))
    labels_path = os.path.join(matrices_dir,
                               VALIDATION_LABELS_MATRIX.format(classifications_type))
    # Binary mode plus `with` so each handle is closed once the array is read.
    with open(data_path, 'rb') as data_file:
        validation_data = np.load(data_file)
    with open(labels_path, 'rb') as labels_file:
        validation_labels = np.load(labels_file)
    return validation_data, validation_labels

## Global Param Loop

In [13]:
# Identifiers for the "parts level" of the input matrices. The chosen level is
# formatted into the X_level_{}/Xv_level_{} file names by the loading helpers.
# Only LEVEL_DOC is used by the parameter sets visible in this file.
# NOTE(review): presumably whole document / divisions / chunks granularity -- confirm.
LEVEL_DOC = 1
LEVEL_DIVISIONS = 2
LEVEL_CHUNKS = 3

In [17]:
# Doc2vec settings. In the visible code they are only used to build the model
# directory name in the training loop (size, window, type, concat, mean,
# train-words, hs, negative-sample size and vocab size); no Doc2Vec model is
# trained here.
# NOTE(review): they presumably must match the settings used when the stored
# matrices were produced -- confirm.
DOC2VEC_SIZE = 200
DOC2VEC_WINDOW = 2
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-3
DOC2VEC_TYPE = 1  # rendered as 'dm' in the model name when 1, 'pv-dbow' otherwise
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 1
DOC2VEC_TRAIN_WORDS = 0
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000 # report vocab progress every x documents

In [25]:
# One parameter set per classification granularity. The three runs share every
# doc2vec/SVM setting and differ only in the label collection and its type name.
GLOBAL_PARMS_TO_RUN = [
    {
        'doc2vec_epoch': 8,
        'classifications': label_collection,
        'classifications_type': label_type,
        'parts_level': LEVEL_DOC,
        'svm_iterations': 10,
        'svm_reg': 0.001,
        'svm_class_weights': None
    }
    for label_collection, label_type in (
        (sections, 'sections'),
        (valid_classes, 'classes'),
        (valid_subclasses, 'subclasses'),
    )
]

## Actual Training, Validation and Metrics Loop

In [None]:

for GLOBAL_PARAMS in GLOBAL_PARMS_TO_RUN:
    
    print '==================================== NEW PARAM SET ============================================'
    print {k:v for k,v in GLOBAL_PARAMS.items() if k != 'classifications'}
    
    classifications = GLOBAL_PARAMS['classifications']
    classifications_type = GLOBAL_PARAMS['classifications_type']
    PARTS_LEVEL = GLOBAL_PARAMS['parts_level']
    
    classifier_file = TYPE_CLASSIFIER.format(classifications_type)
    
    SVM_ITERATIONS = GLOBAL_PARAMS['svm_iterations']
    SVM_REG = GLOBAL_PARAMS['svm_reg']
    SVM_CLASS_WEIGHTS = GLOBAL_PARAMS['svm_class_weights']
    GLOBAL_VARS.SVM_MODEL_NAME = 'svm_iter_{}_reg_{}_classweights_{}'.format(SVM_ITERATIONS, SVM_REG, str(SVM_CLASS_WEIGHTS))
    
    placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(DOC2VEC_SIZE, 
                                                                DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE))
    GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
    placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")

    epoch = GLOBAL_PARAMS['doc2vec_epoch']

    GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    print GLOBAL_VARS.MODEL_NAME
    print GLOBAL_VARS.SVM_MODEL_NAME
    
    time.sleep(0.2)
    
    info("Loading Training Documents")
    X, y = get_training_data(classifications_type, PARTS_LEVEL)
    print X.shape
    print y.shape
    
    info("Loading Validation Documents")
    Xv, yv = get_validation_data(classifications_type, PARTS_LEVEL)
    print Xv.shape
    print yv.shape
    
    info("Reshaping")
    X = np.reshape(X, (X.shape[0], X.shape[1]* X.shape[2]))
    Xv = np.reshape(Xv, (Xv.shape[0], Xv.shape[1]* Xv.shape[2]))
    print X.shape
    print Xv.shape
    
    
    VALIDATION_METRICS_FILENAME= '{}_validation_metrics.pkl'.format(classifications_type)
    TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'.format(classifications_type)

    
    ensure_disk_location_exists(os.path.join(doc2vec_results_location, GLOBAL_VARS.MODEL_NAME, 
                                             GLOBAL_VARS.SVM_MODEL_NAME))

    if not os.path.exists(os.path.join(doc2vec_results_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, classifier_file)):

        info('Training Classifier')
        clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                             #alpha is the 1/C parameter
                                                             alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                             #n_jobs=-1 means use all cpus
                                                             shuffle=True, verbose=0, n_jobs=1,
                                                             #eta0 is the learning rate when we use constant configuration
                                                             random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                             class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)

        # Training of a classifier
        %time clf.fit(X,y)
        pickle.dump(clf, open(os.path.join(doc2vec_results_location, GLOBAL_VARS.MODEL_NAME, 
                                                              GLOBAL_VARS.SVM_MODEL_NAME, classifier_file), 'w'))

        del X, y

    else:
        info('Loading Classifier')
        clf = pickle.load(open(os.path.join(doc2vec_results_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, classifier_file), 'r'))

    info('Evaluating on Validation Data')
    yvp = clf.predict(Xv)
    yvp_score = clf.decision_function(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp_score, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])

    # Saving the metrics
    #     pickle.dump(training_metrics, open(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, 
    #                                                           GLOBAL_VARS.SVM_MODEL_NAME, TRAINING_METRICS_FILENAME), 'w'))
    pickle.dump(validation_metrics, open(os.path.join(doc2vec_results_location, GLOBAL_VARS.MODEL_NAME, 
                                                          GLOBAL_VARS.SVM_MODEL_NAME, VALIDATION_METRICS_FILENAME), 'w'))

    del Xv, yv, yvp, yvp_score

{'classifications_type': 'sections', 'parts_level': 1, 'svm_reg': 0.001, 'svm_iterations': 10, 'svm_class_weights': None, 'doc2vec_epoch': 8}
doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8
svm_iter_10_reg_0.001_classweights_None


2017-04-12 16:11:57,597 : INFO : Loading Training Documents
2017-04-12 16:11:57,600 : INFO : Loading Validation Data from file
2017-04-12 16:11:58,058 : INFO : Loading Validation Documents
2017-04-12 16:11:58,060 : INFO : Loading Validation Data from file
2017-04-12 16:11:58,172 : INFO : Reshaping
2017-04-12 16:11:58,174 : INFO : Loading Classifier
2017-04-12 16:11:58,179 : INFO : Evaluating on Validation Data


(1286325, 1, 200)
(1286325, 8)
(321473, 1, 200)
(321473, 8)
(1286325, 200)
(321473, 200)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 [0 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 1.734, Avg Labels: 1.150, 
		 Top 1: 0.711, Top 3: 0.923, Top 5: 0.981, 
		 F1 Micro: 0.663, F1 Macro: 0.569, Total Pos: 307,991
{'classifications_type': 'classes', 'parts_level': 1, 'svm_reg': 0.001, 'svm_iterations': 10, 'svm_class_weights': None, 'doc2vec_epoch': 8}
doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8
svm_iter_10_reg_0.001_classweights_None


2017-04-12 16:12:25,661 : INFO : Loading Training Documents
2017-04-12 16:12:25,662 : INFO : Loading Validation Data from file
2017-04-12 16:12:26,255 : INFO : Loading Validation Documents
2017-04-12 16:12:26,256 : INFO : Loading Validation Data from file
2017-04-12 16:12:26,399 : INFO : Reshaping
2017-04-12 16:12:26,401 : INFO : Training Classifier


(1286325, 1, 200)
(1286325, 244)
(321473, 1, 200)
(321473, 244)
(1286325, 200)
(321473, 200)


  str(classes[c]))
2017-04-12 16:50:57,590 : INFO : Evaluating on Validation Data


CPU times: user 36min 30s, sys: 2min 2s, total: 38min 32s
Wall time: 38min 31s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


** Validation Metrics: Cov Err: 5.044, Avg Labels: 1.240, 
		 Top 1: 0.467, Top 3: 0.665, Top 5: 0.787, 
		 F1 Micro: 0.541, F1 Macro: 0.127, Total Pos: 257,911
{'classifications_type': 'subclasses', 'parts_level': 1, 'svm_reg': 0.001, 'svm_iterations': 10, 'svm_class_weights': None, 'doc2vec_epoch': 8}
doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8
svm_iter_10_reg_0.001_classweights_None


2017-04-12 16:54:49,738 : INFO : Loading Training Documents
2017-04-12 16:54:49,740 : INFO : Loading Validation Data from file
2017-04-12 16:54:50,693 : INFO : Loading Validation Documents
2017-04-12 16:54:50,694 : INFO : Loading Validation Data from file


(1286325, 1, 200)
(1286325, 940)


2017-04-12 16:54:50,928 : INFO : Reshaping
2017-04-12 16:54:50,930 : INFO : Training Classifier


(321473, 1, 200)
(321473, 940)
(1286325, 200)
(321473, 200)


  str(classes[c]))
  str(classes[c]))
