In [6]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import cPickle as pickle

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model

import logging
from logging import info

from thesis.utils.metrics import *

In [7]:
# Start from a clean root logger: re-running this cell must not stack up
# handlers, otherwise every log line would be emitted several times.
root = logging.getLogger()
for stale_handler in list(root.handlers):
    root.removeHandler(stale_handler)

# With no handlers left on the root logger, basicConfig() installs a single
# default StreamHandler that uses the timestamped format below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [9]:
# Seed Python's built-in `random` module so any use of it is repeatable.
# This does not seed numpy; the classifier gets its own seed through
# `random_state` in the training loop.
RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

In [10]:
# Passed as `random_state` to the SGDClassifier built in the training loop.
SVM_SEED = 1234

In [11]:
# --- SGD-trained linear SVM hyper-parameters --------------------------------
SVM_ITERATIONS = 10        # passed as n_iter to SGDClassifier
SVM_CONVERGENCE = 0.001    # not referenced by the cells visible in this notebook
SVM_REG = 0.001            # passed as alpha to SGDClassifier
SVM_CLASS_WEIGHTS = None   # passed as class_weight to SGDClassifier

# Folder name that identifies this hyper-parameter combination on disk.
SVM_MODEL_NAME = 'svm_iter_%s_reg_%s_classweights_%s' % (
    SVM_ITERATIONS, SVM_REG, SVM_CLASS_WEIGHTS)

# --- Output file-name templates; '{}' is filled with classifications_type ----
CLASSIFIER_FILE = '{}_classifier.pkl'
VALIDATION_METRICS_FILENAME = '{}_validation_metrics.pkl'
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'
TEST_METRICS_FILENAME = '{}_test_metrics.pkl'

In [12]:
# Embedded in the file names of the pickled training/validation/test doc-id lists.
SAMPLE_RATIO = 0.15

In [36]:
# Every path used by the notebook is derived from this root directory.
# The directory strings keep their trailing '/', so joining them yields the
# same paths as plain concatenation.
root_location = "/big/s/shalaby/"
exports_location = os.path.join(root_location, "exported_data/")
svm_location = os.path.join(root_location, "extended_pv_benchmarking_svm/")

training_file = os.path.join(root_location, "docs_output.json")

# Pickled classification metadata, all stored under exports_location.
classifications_index_file = os.path.join(exports_location, "extended_pv_classifications_index.pkl")
doc_classifications_map_file = os.path.join(exports_location, "extended_pv_doc_classification_map.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
classes_file = os.path.join(exports_location, "classes.pkl")
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_output = os.path.join(exports_location, "classifications.pkl")

# Doc-id lists for each split; the file name carries the sample ratio.
training_docs_list_file = os.path.join(
    exports_location, "extended_pv_training_docs_list_{}.pkl".format(SAMPLE_RATIO))
validation_docs_list_file = os.path.join(
    exports_location, "extended_pv_validation_docs_list_{}.pkl".format(SAMPLE_RATIO))
test_docs_list_file = os.path.join(
    exports_location, "extended_pv_test_docs_list_{}.pkl".format(SAMPLE_RATIO))


In [14]:
class OneHotEncoder(object):
    """
    Maps classification labels to positions in a fixed-length 0/1 vector.

    The position of a label is its index in the ``classifications`` sequence
    passed to the constructor.
    """

    def __init__(self, classifications):
        """
        classifications: ordered sequence of hashable labels; its length fixes
                         the length of every vector produced by
                         ``get_label_vector``
        """
        self.classifications = classifications
        # label -> vector position (if a label is repeated, its last position wins)
        self.one_hot_indices = {clssf: i for i, clssf in enumerate(classifications)}

    def get_label_vector(self, labels):
        """
        labels: iterable with the labels assigned to the instance; every label
                must be one the encoder was built with, otherwise a KeyError
                is raised

        Returns a list of ints of length ``len(classifications)`` holding 1 at
        the position of each given label and 0 everywhere else.
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            output_vector[self.one_hot_indices[label]] = 1

        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    """
    Build the multi-label indicator matrix for a sequence of documents.

    classifications: ordered sequence of hashable labels; column j of the
                     result corresponds to classifications[j]
    doc_ids: iterable of document ids; the result has one row per id, in order
    doc_classification_map: mapping of doc id -> iterable of labels assigned to
                            that document; labels that are not part of
                            ``classifications`` are ignored

    Returns an np.int8 array of shape (number of doc ids, len(classifications))
    with 1 where the document carries the label and 0 elsewhere. The shape is
    two-dimensional even when ``doc_ids`` is empty.

    Raises KeyError if a doc id is missing from ``doc_classification_map``.
    """
    doc_ids = list(doc_ids)
    # Same label -> column convention as OneHotEncoder: the index in
    # `classifications` (if a label is repeated, its last index wins).
    column_of = {clssf: col for col, clssf in enumerate(classifications)}

    # Filling a pre-allocated int8 matrix avoids building one Python list per
    # document before converting everything to numpy.
    data_labels = np.zeros((len(doc_ids), len(classifications)), dtype=np.int8)
    for row, doc_id in enumerate(doc_ids):
        for clssf in doc_classification_map[doc_id]:
            col = column_of.get(clssf)
            if col is not None:
                data_labels[row, col] = 1
    return data_labels

#### Load Classification Objects

In [19]:
%%time
def _load_pickle(path):
    """
    Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes.
    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)

# Classification metadata and the doc-id lists of each split.
doc_classification_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
classes = _load_pickle(classes_file)
subclasses = _load_pickle(subclasses_file)
valid_classes = _load_pickle(valid_classes_file)
valid_subclasses = _load_pickle(valid_subclasses_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
test_docs_list = _load_pickle(test_docs_list_file)
classifications_index = _load_pickle(classifications_index_file)

CPU times: user 9.29 s, sys: 416 ms, total: 9.71 s
Wall time: 9.76 s


In [20]:
# Sanity check: number of entries loaded from valid_classes.pkl
len(valid_classes)

244

In [21]:
# Sanity check: number of entries loaded from valid_subclasses.pkl
len(valid_subclasses)

940

## Training and Validation Loop

In [41]:
# Label set the classifier is trained on in the loop below, and the tag that
# is substituted into the names of the saved classifier/metrics files.
classifications = valid_subclasses
classifications_type = "subclasses"

In [42]:
%%time
# data_types = ["sublinear_tf"]
data_types = ["tf", "sublinear_tf", "tf_idf", "sublinear_tf_idf","bm25"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
    data_training_location = exports_location + "extended_pv_benchmarking_data/" + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "extended_pv_benchmarking_data/" + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "extended_pv_benchmarking_data/" + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "extended_pv_benchmarking_data/" + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    info('Getting Training Data')
    %time X = pickle.load(open(data_training_location, "r"))
    training_data_docids = pickle.load(open(data_training_docids_location, "r"))
    %time y = get_label_data(classifications, training_data_docids, doc_classification_map)
    
    print y
    print y.shape

    info('Training Classifier')
    clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                         #alpha is the 1/C parameter
                                                         alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                         #n_jobs=-1 means use all cpus
                                                         shuffle=True, verbose=0, n_jobs=1,
                                                         #eta0 is the learning rate when we use constant configuration
                                                         random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                         class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
    %time clf.fit(X,y)
    
    # Training Metrics
    info('Evaluating on Training Data')
    %time yp = clf.predict(X)
    %time yp_score = clf.decision_function(X)
    print yp
    info('Calculating training metrics')
    training_metrics = get_metrics(y, yp_score, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    # Get the validation data
    info('Getting Valdiation Data')
    %time Xv = pickle.load(open(data_validation_location,'r'))
    validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
    %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Validation Metrics
    info('Evaluating on Validation Data')
    %time yvp = clf.predict(Xv)
    %time yvp_score = clf.decision_function(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp_score, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    # Dump the classifier and metrics
    data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))
#     pickle.dump(training_metrics, open(os.path.join(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)), "w"))
    pickle.dump(validation_metrics, open(os.path.join(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)), "w"))
    
    del X, y, Xv, yv

2017-04-01 21:40:08,341 : INFO : Getting Training Data


CPU times: user 2min 40s, sys: 3.68 s, total: 2min 43s
Wall time: 2min 43s


2017-04-01 21:43:23,523 : INFO : Training Classifier


CPU times: user 29.8 s, sys: 1.11 s, total: 30.9 s
Wall time: 30.9 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(254767, 940)


2017-04-01 22:36:48,030 : INFO : Evaluating on Training Data


CPU times: user 53min 22s, sys: 752 ms, total: 53min 23s
Wall time: 53min 24s
CPU times: user 4min 33s, sys: 336 ms, total: 4min 33s
Wall time: 4min 33s


2017-04-01 22:45:58,449 : INFO : Calculating training metrics


CPU times: user 4min 35s, sys: 1.24 s, total: 4min 36s
Wall time: 4min 36s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


2017-04-01 22:49:38,848 : INFO : Getting Valdiation Data


** Training Metrics: Cov Err: 180.842, Avg Labels: 1.660, 
		 Top 1: 0.348, Top 3: 0.388, Top 5: 0.418, 
		 F1 Micro: 0.394, F1 Macro: 0.236, Total Pos: 430,708
CPU times: user 35.7 s, sys: 596 ms, total: 36.3 s
Wall time: 36.3 s


2017-04-01 22:50:21,657 : INFO : Evaluating on Validation Data


CPU times: user 6.12 s, sys: 256 ms, total: 6.38 s
Wall time: 6.35 s
CPU times: user 1min, sys: 76 ms, total: 1min
Wall time: 1min
CPU times: user 1min, sys: 228 ms, total: 1min
Wall time: 1min
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 171.835, Avg Labels: 1.350, 
		 Top 1: 0.329, Top 3: 0.382, Top 5: 0.411, 
		 F1 Micro: 0.344, F1 Macro: 0.076, Total Pos: 106,078


2017-04-01 22:53:27,329 : INFO : Getting Training Data


CPU times: user 2min 17s, sys: 2.32 s, total: 2min 20s
Wall time: 2min 20s


2017-04-01 22:56:16,639 : INFO : Training Classifier


CPU times: user 27.2 s, sys: 1.33 s, total: 28.6 s
Wall time: 28.5 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(254767, 940)


2017-04-01 23:50:07,307 : INFO : Evaluating on Training Data


CPU times: user 53min 48s, sys: 872 ms, total: 53min 49s
Wall time: 53min 50s
CPU times: user 4min 33s, sys: 352 ms, total: 4min 33s
Wall time: 4min 33s


2017-04-01 23:59:20,656 : INFO : Calculating training metrics


CPU times: user 4min 38s, sys: 1.39 s, total: 4min 39s
Wall time: 4min 39s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


2017-04-02 00:03:00,049 : INFO : Getting Valdiation Data


** Training Metrics: Cov Err: 20.050, Avg Labels: 1.660, 
		 Top 1: 0.288, Top 3: 0.294, Top 5: 0.444, 
		 F1 Micro: 0.419, F1 Macro: 0.042, Total Pos: 144,930
CPU times: user 30.6 s, sys: 444 ms, total: 31 s
Wall time: 31 s


2017-04-02 00:03:38,115 : INFO : Evaluating on Validation Data


CPU times: user 6.73 s, sys: 232 ms, total: 6.96 s
Wall time: 6.93 s
CPU times: user 1min 1s, sys: 76 ms, total: 1min 1s
Wall time: 1min 1s
CPU times: user 1min, sys: 260 ms, total: 1min
Wall time: 1min
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 21.702, Avg Labels: 1.350, 
		 Top 1: 0.302, Top 3: 0.312, Top 5: 0.471, 
		 F1 Micro: 0.438, F1 Macro: 0.038, Total Pos: 33,231


2017-04-02 00:06:44,831 : INFO : Getting Training Data


CPU times: user 2min 11s, sys: 2.4 s, total: 2min 13s
Wall time: 2min 13s


2017-04-02 00:09:30,854 : INFO : Training Classifier


CPU times: user 29.9 s, sys: 2.07 s, total: 32 s
Wall time: 31.9 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(254767, 940)


2017-04-02 01:00:11,759 : INFO : Evaluating on Training Data


CPU times: user 50min 39s, sys: 992 ms, total: 50min 40s
Wall time: 50min 40s
CPU times: user 4min 10s, sys: 356 ms, total: 4min 11s
Wall time: 4min 11s


2017-04-02 01:08:35,637 : INFO : Calculating training metrics


CPU times: user 4min 11s, sys: 1.46 s, total: 4min 12s
Wall time: 4min 12s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


2017-04-02 01:12:16,558 : INFO : Getting Valdiation Data


** Training Metrics: Cov Err: 219.914, Avg Labels: 1.660, 
		 Top 1: 0.366, Top 3: 0.405, Top 5: 0.442, 
		 F1 Micro: 0.421, F1 Macro: 0.234, Total Pos: 407,436
CPU times: user 28.5 s, sys: 440 ms, total: 28.9 s
Wall time: 28.9 s


2017-04-02 01:12:52,030 : INFO : Evaluating on Validation Data


CPU times: user 6.12 s, sys: 320 ms, total: 6.44 s
Wall time: 6.42 s
CPU times: user 55.2 s, sys: 88 ms, total: 55.3 s
Wall time: 55.3 s
CPU times: user 55.1 s, sys: 332 ms, total: 55.5 s
Wall time: 55.5 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 216.948, Avg Labels: 1.350, 
		 Top 1: 0.334, Top 3: 0.388, Top 5: 0.423, 
		 F1 Micro: 0.350, F1 Macro: 0.080, Total Pos: 105,342


2017-04-02 01:15:46,922 : INFO : Getting Training Data


CPU times: user 2min 9s, sys: 2.06 s, total: 2min 11s
Wall time: 2min 11s


2017-04-02 01:18:27,984 : INFO : Training Classifier


CPU times: user 27.5 s, sys: 1.61 s, total: 29.2 s
Wall time: 29.1 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(254767, 940)


2017-04-02 02:09:29,973 : INFO : Evaluating on Training Data


CPU times: user 51min, sys: 932 ms, total: 51min 1s
Wall time: 51min 1s
CPU times: user 4min 10s, sys: 348 ms, total: 4min 10s
Wall time: 4min 10s


2017-04-02 02:17:52,972 : INFO : Calculating training metrics


CPU times: user 4min 10s, sys: 1.31 s, total: 4min 12s
Wall time: 4min 12s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


2017-04-02 02:21:33,340 : INFO : Getting Valdiation Data


** Training Metrics: Cov Err: 24.620, Avg Labels: 1.660, 
		 Top 1: 0.285, Top 3: 0.290, Top 5: 0.438, 
		 F1 Micro: 0.417, F1 Macro: 0.049, Total Pos: 141,224
CPU times: user 28.2 s, sys: 440 ms, total: 28.6 s
Wall time: 28.6 s


2017-04-02 02:22:09,197 : INFO : Evaluating on Validation Data


CPU times: user 6.82 s, sys: 264 ms, total: 7.08 s
Wall time: 7.05 s
CPU times: user 55.1 s, sys: 88 ms, total: 55.2 s
Wall time: 55.2 s
CPU times: user 55.1 s, sys: 332 ms, total: 55.4 s
Wall time: 55.4 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 26.698, Avg Labels: 1.350, 
		 Top 1: 0.292, Top 3: 0.302, Top 5: 0.458, 
		 F1 Micro: 0.427, F1 Macro: 0.037, Total Pos: 32,546


2017-04-02 02:25:04,174 : INFO : Getting Training Data


CPU times: user 2min 11s, sys: 2.22 s, total: 2min 13s
Wall time: 2min 13s


2017-04-02 02:27:49,142 : INFO : Training Classifier


CPU times: user 29.8 s, sys: 1.22 s, total: 31.1 s
Wall time: 31 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(254767, 940)


2017-04-02 03:18:55,170 : INFO : Evaluating on Training Data


CPU times: user 51min 4s, sys: 940 ms, total: 51min 5s
Wall time: 51min 6s
CPU times: user 4min 12s, sys: 316 ms, total: 4min 13s
Wall time: 4min 13s


2017-04-02 03:27:21,100 : INFO : Calculating training metrics


CPU times: user 4min 11s, sys: 1.36 s, total: 4min 12s
Wall time: 4min 12s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


2017-04-02 03:31:01,741 : INFO : Getting Valdiation Data


** Training Metrics: Cov Err: 16.033, Avg Labels: 1.660, 
		 Top 1: 0.371, Top 3: 0.379, Top 5: 0.518, 
		 F1 Micro: 0.513, F1 Macro: 0.203, Total Pos: 185,093
CPU times: user 29.1 s, sys: 664 ms, total: 29.8 s
Wall time: 29.8 s


2017-04-02 03:31:37,813 : INFO : Evaluating on Validation Data


CPU times: user 6 s, sys: 236 ms, total: 6.23 s
Wall time: 6.21 s
CPU times: user 55.4 s, sys: 76 ms, total: 55.4 s
Wall time: 55.5 s
CPU times: user 55.1 s, sys: 348 ms, total: 55.5 s
Wall time: 55.5 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Validation Metrics: Cov Err: 28.304, Avg Labels: 1.350, 
		 Top 1: 0.328, Top 3: 0.343, Top 5: 0.475, 
		 F1 Micro: 0.457, F1 Macro: 0.067, Total Pos: 40,239
CPU times: user 5h 53min 7s, sys: 1min 2s, total: 5h 54min 10s
Wall time: 5h 54min 25s


## Testing

In [10]:
# Switch the label set to `sections` for the evaluation cells that follow.
# NOTE(review): the training loop in this notebook was last run with
# classifications_type = "subclasses"; the classifier loaded below must come
# from a run that used "sections" — confirm that file exists and is current.
classifications = sections
classifications_type = "sections"

In [11]:
# Load the classifier saved for one weighting scheme and the current
# classifications_type.
data_type = "sublinear_tf"
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
# Pickles are binary data: open in "rb" and close the handle when done.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "rb") as classifier_file:
    clf = pickle.load(classifier_file)

In [None]:
data_test_location = exports_location + "{}_test_sparse_data.pkl".format(data_type)
data_test_docids_location = exports_location + "{}_test_sparse_docids.pkl".format(data_type)

# Get the test data
info('Getting Test Data')
%time Xt = pickle.load(open(data_test_location, "r"))
test_data_docids = pickle.load(open(data_test_docids_location, "r"))
%time yt = get_label_data(classifications, test_data_docids, doc_classification_map)


2017-02-14 16:21:10,906 : INFO : Getting Test Data


In [14]:
# Test Metrics
info('Evaluating on Test Data')
%time ytp = clf.predict(Xt)
%time ytp_score = clf.decision_function(Xt)
print ytp
%time test_metrics = get_metrics(yt, ytp_score, ytp)
print "** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    test_metrics['coverage_error'], test_metrics['average_num_of_labels'], 
    test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'], 
    test_metrics['f1_micro'], test_metrics['f1_macro'], test_metrics['total_positive'])

# pickle.dump(test_metrics, open(os.path.join(data_folder, TEST_METRICS_FILENAME.format(classifications_type)), "w"))
    

2017-02-14 16:30:01,480 : INFO : Evaluating on Test Data


CPU times: user 3.96 s, sys: 28 ms, total: 3.99 s
Wall time: 3.99 s
CPU times: user 3.68 s, sys: 8 ms, total: 3.68 s
Wall time: 3.68 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
CPU times: user 33 s, sys: 64 ms, total: 33.1 s
Wall time: 33.1 s
** Test Metrics: Cov Err: 1.571, Avg Labels: 1.150, 
		 Top 1: 0.787, Top 3: 0.949, Top 5: 0.981, 
		 F1 Micro: 0.731, F1 Macro: 0.572, Total Pos: 373,008


In [79]:
# Persist the test metrics next to the classifier they were computed with.
# Binary mode plus a with-block makes sure the file is flushed and closed.
with open(os.path.join(data_folder, TEST_METRICS_FILENAME.format(classifications_type)), "wb") as test_metrics_file:
    pickle.dump(test_metrics, test_metrics_file)
 

In [15]:
# Confusion matrix built from the true (yt) and predicted (ytp) test labels.
# `sections` is the same list `classifications` was set to for this part of
# the notebook.
# NOTE(review): get_formatted_multilabel_confusion_matrix is presumably
# supplied by the star import from thesis.utils.metrics — confirm.
conf_matrix = get_formatted_multilabel_confusion_matrix(yt, ytp, sections)
conf_matrix

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,40408.0,1496.0,613.0,3.0,1.0,102.0,1380.0,349.0,14524.0
B,789.0,23203.0,1495.0,6.0,25.0,1269.0,2335.0,1493.0,27604.0
C,489.0,625.0,30998.0,1.0,21.0,37.0,479.0,717.0,7946.0
D,37.0,302.0,164.0,154.0,0.0,4.0,48.0,27.0,1777.0
E,47.0,839.0,52.0,0.0,1761.0,84.0,121.0,67.0,6456.0
F,177.0,1100.0,82.0,0.0,41.0,10885.0,761.0,686.0,13910.0
G,1077.0,1247.0,478.0,0.0,62.0,430.0,109174.0,9721.0,21527.0
H,284.0,618.0,245.0,0.0,10.0,226.0,9295.0,88342.0,20899.0
,3237.0,1870.0,4090.0,19.0,155.0,562.0,9213.0,6880.0,0.0


In [16]:
# Row totals of the confusion matrix
conf_matrix.sum(axis=1)

A        58876.0
B        58219.0
C        41313.0
D         2513.0
E         9427.0
F        27642.0
G       143716.0
H       119919.0
None     26026.0
dtype: float64

In [23]:
# Each cell as a percentage of its row total, rounded to two decimals
(conf_matrix.div(conf_matrix.sum(axis=1), axis=0).round(4) * 100)

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,68.63,2.54,1.04,0.01,0.0,0.17,2.34,0.59,24.67
B,1.36,39.85,2.57,0.01,0.04,2.18,4.01,2.56,47.41
C,1.18,1.51,75.03,0.0,0.05,0.09,1.16,1.74,19.23
D,1.47,12.02,6.53,6.13,0.0,0.16,1.91,1.07,70.71
E,0.5,8.9,0.55,0.0,18.68,0.89,1.28,0.71,68.48
F,0.64,3.98,0.3,0.0,0.15,39.38,2.75,2.48,50.32
G,0.75,0.87,0.33,0.0,0.04,0.3,75.97,6.76,14.98
H,0.24,0.52,0.2,0.0,0.01,0.19,7.75,73.67,17.43
,12.44,7.19,15.72,0.07,0.6,2.16,35.4,26.44,0.0


In [22]:
def format_perc(x):
    """Render a number as a percentage string with two decimals, e.g. 68.63 -> '68.63%'."""
    return "{:.2f}%".format(x)

# Row-normalised confusion matrix (each row sums to ~100), shown as strings
row_percentages = conf_matrix.div(conf_matrix.sum(axis=1), axis=0).round(4) * 100
row_percentages.applymap(format_perc)

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,68.63%,2.54%,1.04%,0.01%,0.00%,0.17%,2.34%,0.59%,24.67%
B,1.36%,39.85%,2.57%,0.01%,0.04%,2.18%,4.01%,2.56%,47.41%
C,1.18%,1.51%,75.03%,0.00%,0.05%,0.09%,1.16%,1.74%,19.23%
D,1.47%,12.02%,6.53%,6.13%,0.00%,0.16%,1.91%,1.07%,70.71%
E,0.50%,8.90%,0.55%,0.00%,18.68%,0.89%,1.28%,0.71%,68.48%
F,0.64%,3.98%,0.30%,0.00%,0.15%,39.38%,2.75%,2.48%,50.32%
G,0.75%,0.87%,0.33%,0.00%,0.04%,0.30%,75.97%,6.76%,14.98%
H,0.24%,0.52%,0.20%,0.00%,0.01%,0.19%,7.75%,73.67%,17.43%
,12.44%,7.19%,15.72%,0.07%,0.60%,2.16%,35.40%,26.44%,0.00%


In [39]:
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)

# Get the validation data
info('Getting Valdiation Data')
%time Xv = pickle.load(open(data_validation_location,'r'))
validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
%time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)

# Validation Metrics
info('Evaluating on Validation Data')
%time yvp = clf.predict(Xv)
%time yvp_score = clf.decision_function(Xv)
print yvp
%time validation_metrics = get_metrics(yv, yvp_score, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
    validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])


2017-02-01 16:15:34,615 : INFO : Getting Valdiation Data


CPU times: user 2min 47s, sys: 2.84 s, total: 2min 49s
Wall time: 2min 49s


2017-02-01 16:18:47,970 : INFO : Evaluating on Validation Data


CPU times: user 22.1 s, sys: 804 ms, total: 22.9 s
Wall time: 22.8 s
CPU times: user 1min 29s, sys: 152 ms, total: 1min 29s
Wall time: 1min 29s
CPU times: user 1min 28s, sys: 412 ms, total: 1min 28s
Wall time: 1min 28s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
CPU times: user 2min 53s, sys: 3.35 s, total: 2min 56s
Wall time: 2min 56s
** Validation Metrics: Cov Err: 123.907, Avg Labels: 1.240, 
		 Top 1: 0.001, Top 3: 0.003, Top 5: 0.006, 
		 F1 Micro: 0.000, F1 Macro: 0.000, Total Pos: 214,248
