In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import cPickle as pickle

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model

import logging
from logging import info

from thesis.utils.metrics import *

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

In [4]:
SVM_SEED = 1234

In [22]:
SVM_ITERATIONS = 10
SVM_CONVERGENCE = 0.001
SVM_REG = 0.01
SVM_CLASS_WEIGHTS = None
SVM_MODEL_NAME = 'svm_iter_{}_reg_{}_classweights_{}'.format(SVM_ITERATIONS, SVM_REG, str(SVM_CLASS_WEIGHTS))

CLASSIFIER_FILE = '{}_classifier.pkl'
VALIDATION_METRICS_FILENAME= '{}_validation_metrics.pkl'
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'

In [6]:
root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"
svm_location = root_location + "benchmarking_svm/"


training_file = root_location + "docs_output.json"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"


In [8]:
class OneHotEncoder():
    
    def __init__(self, classifications):
        self.classifications = classifications
        self.one_hot_indices = {}

        # convert character classifications to bit vectors
        for i, clssf in enumerate(classifications):
            bits = [0] * len(classifications)
            bits[i] = 1
            self.one_hot_indices[clssf] = i
    
    def get_label_vector(self, labels):
        """
        classes: array of string with the classes assigned to the instance
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            index = self.one_hot_indices[label]
            output_vector[index] = 1
            
        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    one_hot_encoder = OneHotEncoder(classifications)
    data_labels = []
    for i, doc_id in enumerate(doc_ids):
        #if len(doc_classification_map[doc_id]) > 20: info("PROOOOBBBBBBBBBBBLEM "+  str(doc_classification_map[doc_id]))
        eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
        data_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
        #if i % 1000 == 0: info(i)
    data_labels = np.array(data_labels)
    return data_labels

#### Load Classification Objects

In [9]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))

CPU times: user 39.6 s, sys: 3.85 s, total: 43.5 s
Wall time: 48.6 s


## Training and Validation Loop

In [23]:
classifications = sections
classifications_type = "sections"

In [24]:
data_types = ["bm25", "tf_idf", "sublinear_tf_idf", "sublinear_tf", "tf"]
# data_types = ["tf"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
    data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    %time X = pickle.load(open(data_training_location, "r"))
    training_data_docids = pickle.load(open(data_training_docids_location, "r"))
    %time y = get_label_data(classifications, training_data_docids, doc_classification_map)
    
    print y
    print y.shape

    info('Training Classifier')
    clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                         #alpha is the 1/C parameter
                                                         alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                         #n_jobs=-1 means use all cpus
                                                         shuffle=True, verbose=0, n_jobs=1,
                                                         #eta0 is the learning rate when we use constant configuration
                                                         random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                         class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
    %time clf.fit(X,y)
    
    # Training Metrics
    info('Evaluating on Training Data')
    %time yp = clf.predict(X)
    print yp
    info('Calculating training metrics')
    training_metrics = get_metrics(y, yp, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    # Get the validation data
    %time Xv = pickle.load(open(data_validation_location,'r'))
    validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
    %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Validation Metrics
    info('Evaluating on Validation Data')
    %time yvp = clf.predict(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    # Dump the classifier and metrics
    data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))
    pickle.dump(training_metrics, open(os.path.join(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)), "w"))
    pickle.dump(validation_metrics, open(os.path.join(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)), "w"))



CPU times: user 12min 12s, sys: 3min 44s, total: 15min 57s
Wall time: 17min 17s


2017-01-14 09:40:50,730 : INFO : Training Classifier


CPU times: user 15.1 s, sys: 12.6 s, total: 27.7 s
Wall time: 20min 15s
[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [1 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 1]]
(1286325, 8)


2017-01-14 09:44:40,654 : INFO : Evaluating on Training Data


CPU times: user 3min 17s, sys: 32.3 s, total: 3min 49s
Wall time: 3min 49s


2017-01-14 09:44:55,894 : INFO : Calculating training metrics


CPU times: user 13.7 s, sys: 1.52 s, total: 15.2 s
Wall time: 15.2 s
[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]]
** Training Metrics: Cov Err: 3.521, Avg Labels: 1.150, 
		 Top 1: 0.703, Top 3: 0.835, Top 5: 0.861, 
		 F1 Micro: 0.732, F1 Macro: 0.570, Total Pos: 1,203,912
CPU times: user 2min 59s, sys: 10.8 s, total: 3min 10s
Wall time: 3min 12s


2017-01-14 09:50:03,732 : INFO : Evaluating on Validation Data


CPU times: user 2.44 s, sys: 440 ms, total: 2.88 s
Wall time: 2.87 s
CPU times: user 3.19 s, sys: 152 ms, total: 3.34 s
Wall time: 3.34 s
[[0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]]
** Validation Metrics: Cov Err: 4.737, Avg Labels: 1.150, 
		 Top 1: 0.590, Top 3: 0.741, Top 5: 0.766, 
		 F1 Micro: 0.614, F1 Macro: 0.391, Total Pos: 202,286




CPU times: user 12min 16s, sys: 1min 32s, total: 13min 49s
Wall time: 14min 36s


2017-01-14 10:05:33,128 : INFO : Training Classifier


CPU times: user 11.8 s, sys: 2.66 s, total: 14.4 s
Wall time: 14.7 s
[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 [0 1 0 ..., 1 0 0]
 ..., 
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 [0 1 0 ..., 0 0 0]]
(1286325, 8)


2017-01-14 10:08:32,816 : INFO : Evaluating on Training Data


CPU times: user 2min 54s, sys: 4.73 s, total: 2min 59s
Wall time: 2min 59s


2017-01-14 10:08:45,530 : INFO : Calculating training metrics


CPU times: user 12.6 s, sys: 140 ms, total: 12.7 s
Wall time: 12.7 s
[[0 1 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 1 0 1]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 [0 1 0 ..., 1 0 0]]
** Training Metrics: Cov Err: 3.682, Avg Labels: 1.150, 
		 Top 1: 0.682, Top 3: 0.831, Top 5: 0.855, 
		 F1 Micro: 0.711, F1 Macro: 0.602, Total Pos: 1,200,163
CPU times: user 3min 9s, sys: 6.18 s, total: 3min 15s
Wall time: 3min 27s


2017-01-14 10:14:06,444 : INFO : Evaluating on Validation Data


CPU times: user 2.43 s, sys: 396 ms, total: 2.82 s
Wall time: 2.8 s
CPU times: user 3.09 s, sys: 28 ms, total: 3.12 s
Wall time: 3.12 s
[[0 0 0 ..., 0 0 1]
 [1 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.800, Avg Labels: 1.150, 
		 Top 1: 0.662, Top 3: 0.816, Top 5: 0.841, 
		 F1 Micro: 0.697, F1 Macro: 0.566, Total Pos: 294,836




CPU times: user 11min 10s, sys: 1min 10s, total: 12min 21s
Wall time: 13min 13s


2017-01-14 10:28:18,652 : INFO : Training Classifier


CPU times: user 11.5 s, sys: 2.93 s, total: 14.4 s
Wall time: 14.3 s
[[0 1 0 ..., 0 1 1]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 0 1]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
(1286325, 8)


2017-01-14 10:32:32,905 : INFO : Evaluating on Training Data


CPU times: user 4min 3s, sys: 10.1 s, total: 4min 13s
Wall time: 4min 14s


2017-01-14 10:33:03,900 : INFO : Calculating training metrics


CPU times: user 29.2 s, sys: 1.73 s, total: 30.9 s
Wall time: 31 s
[[0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 0 1]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Training Metrics: Cov Err: 3.911, Avg Labels: 1.150, 
		 Top 1: 0.663, Top 3: 0.804, Top 5: 0.829, 
		 F1 Micro: 0.699, F1 Macro: 0.506, Total Pos: 1,078,561
CPU times: user 2min 46s, sys: 15.5 s, total: 3min 1s
Wall time: 3min 24s


2017-01-14 10:39:40,777 : INFO : Evaluating on Validation Data


CPU times: user 5.48 s, sys: 592 ms, total: 6.07 s
Wall time: 6.15 s
CPU times: user 8.73 s, sys: 88 ms, total: 8.82 s
Wall time: 8.86 s
[[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 3.921, Avg Labels: 1.150, 
		 Top 1: 0.662, Top 3: 0.803, Top 5: 0.828, 
		 F1 Micro: 0.698, F1 Macro: 0.505, Total Pos: 269,447




CPU times: user 12min 11s, sys: 7min 55s, total: 20min 6s
Wall time: 21min 54s


2017-01-14 11:03:04,655 : INFO : Training Classifier


CPU times: user 22.4 s, sys: 6.2 s, total: 28.6 s
Wall time: 28.7 s
[[0 0 0 ..., 1 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
(1286325, 8)


2017-01-14 11:12:08,680 : INFO : Evaluating on Training Data


CPU times: user 7min 9s, sys: 17.9 s, total: 7min 26s
Wall time: 9min 4s


2017-01-14 11:12:49,606 : INFO : Calculating training metrics


CPU times: user 20 s, sys: 776 ms, total: 20.7 s
Wall time: 40.9 s
[[0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
** Training Metrics: Cov Err: 3.892, Avg Labels: 1.150, 
		 Top 1: 0.660, Top 3: 0.803, Top 5: 0.829, 
		 F1 Micro: 0.698, F1 Macro: 0.500, Total Pos: 1,096,052
CPU times: user 3min 3s, sys: 14.4 s, total: 3min 17s
Wall time: 4min 40s


2017-01-14 11:22:31,233 : INFO : Evaluating on Validation Data


CPU times: user 3.23 s, sys: 408 ms, total: 3.64 s
Wall time: 5.87 s
CPU times: user 8.45 s, sys: 148 ms, total: 8.6 s
Wall time: 9.96 s
[[0 0 0 ..., 0 0 1]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 ..., 
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]]
** Validation Metrics: Cov Err: 3.898, Avg Labels: 1.150, 
		 Top 1: 0.660, Top 3: 0.803, Top 5: 0.828, 
		 F1 Micro: 0.698, F1 Macro: 0.499, Total Pos: 274,088




CPU times: user 14min 1s, sys: 14min 21s, total: 28min 22s
Wall time: 43min 35s


2017-01-14 12:07:48,436 : INFO : Training Classifier


CPU times: user 14.6 s, sys: 2.71 s, total: 17.3 s
Wall time: 26.4 s
[[0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 0]]
(1286325, 8)


2017-01-14 12:17:15,860 : INFO : Evaluating on Training Data


CPU times: user 5min 34s, sys: 13.9 s, total: 5min 48s
Wall time: 9min 27s


2017-01-14 12:18:08,051 : INFO : Calculating training metrics


CPU times: user 40 s, sys: 728 ms, total: 40.7 s
Wall time: 52.2 s
[[0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]]
** Training Metrics: Cov Err: 3.784, Avg Labels: 1.150, 
		 Top 1: 0.667, Top 3: 0.837, Top 5: 0.864, 
		 F1 Micro: 0.686, F1 Macro: 0.568, Total Pos: 1,278,646
CPU times: user 3min 34s, sys: 38 s, total: 4min 12s
Wall time: 8min 28s


2017-01-14 12:31:42,216 : INFO : Evaluating on Validation Data


CPU times: user 4.5 s, sys: 468 ms, total: 4.97 s
Wall time: 6.25 s
CPU times: user 7.47 s, sys: 84 ms, total: 7.55 s
Wall time: 12.4 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]]
** Validation Metrics: Cov Err: 3.804, Avg Labels: 1.150, 
		 Top 1: 0.665, Top 3: 0.836, Top 5: 0.863, 
		 F1 Micro: 0.684, F1 Macro: 0.566, Total Pos: 319,296


## Load Training and Validation Data

In [9]:
data_type = "bm25"
classifications = sections
classifications_type = "sections"
data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)

In [10]:
%%time
%time X = pickle.load(open(data_training_location, "r"))
training_data_docids = pickle.load(open(data_training_docids_location, "r"))
%time y = get_label_data(classifications, training_data_docids, doc_classification_map)

CPU times: user 11min 59s, sys: 5min 26s, total: 17min 25s
Wall time: 20min 53s
CPU times: user 22.4 s, sys: 6.44 s, total: 28.9 s
Wall time: 29.5 s
CPU times: user 12min 24s, sys: 5min 33s, total: 17min 58s
Wall time: 21min 27s


In [11]:
data.shape

NameError: name 'data' is not defined

In [11]:
print y
print y.shape

# try class weights
# try warm start and evaluate after every iter

info('Training Classifier')
clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                     #alpha is the 1/C parameter
                                                     alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                     #n_jobs=-1 means use all cpus
                                                     shuffle=True, verbose=1, n_jobs=1,
                                                     #eta0 is the learning rate when we use constant configuration
                                                     random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                     class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)


2017-01-03 23:41:22,437 : INFO : Training Classifier


[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [1 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 1]]
(1286325, 8)


In [12]:
%%time
# Training Metrics
clf.fit(X,y)
info('Evaluating on Training Data')
yp = clf.predict(X)
print yp
info('Calculating training metrics')
training_metrics = get_metrics(y, yp, yp)
print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
    training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
    training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])

-- Epoch 1
Norm: 5.12, NNZs: 9968, Bias: -0.470338, T: 1286325, Avg. loss: 0.943358
Total training time: 6.43 seconds.
-- Epoch 2
Norm: 3.83, NNZs: 9971, Bias: -0.475705, T: 2572650, Avg. loss: 0.615495
Total training time: 9.84 seconds.
-- Epoch 3
Norm: 3.40, NNZs: 9976, Bias: -0.477937, T: 3858975, Avg. loss: 0.496949
Total training time: 13.03 seconds.
-- Epoch 4
Norm: 3.21, NNZs: 9978, Bias: -0.479755, T: 5145300, Avg. loss: 0.435043
Total training time: 16.14 seconds.
-- Epoch 5
Norm: 3.10, NNZs: 9978, Bias: -0.480922, T: 6431625, Avg. loss: 0.396697
Total training time: 18.90 seconds.
-- Epoch 6
Norm: 3.02, NNZs: 9978, Bias: -0.482003, T: 7717950, Avg. loss: 0.370547
Total training time: 21.81 seconds.
-- Epoch 7
Norm: 2.98, NNZs: 9978, Bias: -0.482518, T: 9004275, Avg. loss: 0.351453
Total training time: 26.43 seconds.
-- Epoch 8
Norm: 2.95, NNZs: 9981, Bias: -0.483186, T: 10290600, Avg. loss: 0.336957
Total training time: 29.96 seconds.
-- Epoch 9
Norm: 2.91, NNZs: 9981, Bias: 

2017-01-04 00:14:09,647 : INFO : Evaluating on Training Data


Norm: 2.35, NNZs: 9237, Bias: 0.046628, T: 128632500, Avg. loss: 0.360278
Total training time: 234.53 seconds.


2017-01-04 00:14:23,416 : INFO : Calculating training metrics


[[1 0 1 ..., 0 0 0]
 [1 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 1]
 ..., 
 [1 0 0 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]]


ValueError: Unknown format code 'd' for object of type 'float'

In [13]:
%%time
%time Xv = pickle.load(open(data_validation_location,'r'))
validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
%time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)

CPU times: user 2min 56s, sys: 5.49 s, total: 3min 1s
Wall time: 3min 4s


2017-01-04 00:30:20,277 : INFO : 0
2017-01-04 00:30:20,308 : INFO : 1000
2017-01-04 00:30:20,319 : INFO : PROOOOBBBBBBBBBBBLEM [u'B', u'B-27', u'B-27-B', u'B-27-L', u'B-27-M', u'B-29', u'B-29-C', u'B-32', u'B-32-B', u'B-41', u'B-41-M', u'B-44', u'B-44-C', u'D', u'D-06', u'D-06-N', u'E', u'E-04', u'E-04-B', u'E-04-F', u'G', u'G-03', u'G-03-G']
2017-01-04 00:30:20,321 : INFO : 2000
2017-01-04 00:30:20,352 : INFO : 3000
2017-01-04 00:30:20,360 : INFO : PROOOOBBBBBBBBBBBLEM [u'B', u'B-05', u'B-05-D', u'B-23', u'B-23-K', u'B-32', u'B-32-B', u'C', u'C-05', u'C-05-D', u'C-08', u'C-08-J', u'C-23', u'C-23-C', u'D', u'D-06', u'D-06-N', u'G', u'G-03', u'G-03-B', u'H', u'H-05', u'H-05-H']
2017-01-04 00:30:20,365 : INFO : 4000
2017-01-04 00:30:20,376 : INFO : 5000
2017-01-04 00:30:20,386 : INFO : 6000
2017-01-04 00:30:20,397 : INFO : 7000
2017-01-04 00:30:20,408 : INFO : 8000
2017-01-04 00:30:20,419 : INFO : 9000
2017-01-04 00:30:20,429 : INFO : 10000
2017-01-04 00:30:20,440 : INFO : 11000
2017-01-

CPU times: user 3.02 s, sys: 448 ms, total: 3.47 s
Wall time: 3.2 s
CPU times: user 2min 59s, sys: 6 s, total: 3min 5s
Wall time: 3min 8s


In [16]:
print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
    training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
    training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])

** Training Metrics: Cov Err: 2.600, Avg Labels: 1.150, 
		 Top 1: 0.686, Top 3: 0.931, Top 5: 0.972, 
		 F1 Micro: 0.687, F1 Macro: 0.580, Total Pos: 2,340,112


In [48]:
len(validation_data_docids)

321473

In [14]:
info('Evaluating on Validation Data')
yvp = clf.predict(Xv)
print yvp
validation_metrics = get_metrics(yv, yvp, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
    validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])

2017-01-04 00:36:08,808 : INFO : Evaluating on Validation Data


[[0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 1 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [1 1 1 ..., 0 0 0]
 [0 1 0 ..., 0 0 1]
 [0 0 0 ..., 0 1 0]]
** Validation Metrics: Cov Err: 2.815, Avg Labels: 1.150, 
		 Top 1: 0.656, Top 3: 0.921, Top 5: 0.963, 
		 F1 Micro: 0.650, F1 Macro: 0.578, Total Pos: 617,435


In [20]:
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
os.mkdir(data_folder)
pickle.dump(clf, open(data_folder, CLASSIFIER_FILE.format(classifications_type)))
pickle.dump(training_metrics, open(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)))
pickle.dump(validation_metrics, open(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)))