In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import cPickle as pickle

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model

import logging
from logging import info

from thesis.utils.metrics import *

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

In [4]:
SVM_SEED = 1234

In [5]:
SVM_ITERATIONS = 10
SVM_CONVERGENCE = 0.001
SVM_REG = 0.001
SVM_CLASS_WEIGHTS = None
SVM_MODEL_NAME = 'svm_iter_{}_reg_{}_classweights_{}'.format(SVM_ITERATIONS, SVM_REG, str(SVM_CLASS_WEIGHTS))

CLASSIFIER_FILE = '{}_classifier.pkl'
VALIDATION_METRICS_FILENAME= '{}_validation_metrics.pkl'
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'
TEST_METRICS_FILENAME = '{}_test_metrics.pkl'

In [9]:
root_location = "/home/local/shalaby/"
exports_location = root_location + "exported_data/"
svm_location = root_location + "benchmarking_svm/"


training_file = root_location + "docs_output.json"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
classification_index_file = exports_location + "classification_index.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"


In [10]:
class OneHotEncoder():
    
    def __init__(self, classifications):
        self.classifications = classifications
        self.one_hot_indices = {}

        # convert character classifications to bit vectors
        for i, clssf in enumerate(classifications):
            bits = [0] * len(classifications)
            bits[i] = 1
            self.one_hot_indices[clssf] = i
    
    def get_label_vector(self, labels):
        """
        classes: array of string with the classes assigned to the instance
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            index = self.one_hot_indices[label]
            output_vector[index] = 1
            
        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    one_hot_encoder = OneHotEncoder(classifications)
    classifications_set = set(classifications)
    data_labels = []
    for i, doc_id in enumerate(doc_ids):
        #if len(doc_classification_map[doc_id]) > 20: info("PROOOOBBBBBBBBBBBLEM "+  str(doc_classification_map[doc_id]))
#         eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
        eligible_classifications = set(doc_classification_map[doc_id]) & classifications_set
        data_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
        #if i % 1000 == 0: info(i)
    data_labels = np.array(data_labels, dtype=np.int8)
    return data_labels

#### Load Classification Objects

In [11]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))
classifications_index = pickle.load(open(classification_index_file))

CPU times: user 45.1 s, sys: 18.9 s, total: 1min 3s
Wall time: 1min 4s


In [9]:
len(valid_classes)

244

In [10]:
len(valid_subclasses)

940

#### Use this for classes and subclasses as we trained the classes and subclasses classifiers with the unordered list of classes and subclasses

In [62]:
INVALID_CLASSIFICATION_LIMIT = 3
invalid_classes = set()
invalid_subclasses = set()
for clsf in classifications_index.keys():
    if len(classifications_index[clsf]) < INVALID_CLASSIFICATION_LIMIT:
        if clsf in classes:
            invalid_classes.add(clsf)
        if clsf in subclasses:
            invalid_subclasses.add(clsf)
valid_classes = list(set(classes) - invalid_classes)
valid_subclasses = list(set(subclasses) - invalid_subclasses)

## Training and Validation Loop

In [15]:
classifications = sections
classifications_type = "sections"

In [None]:
%%time
data_types = ["sublinear_tf"]
# data_types = ["tf"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
    data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    info('Getting Training Data')
    %time X = pickle.load(open(data_training_location, "r"))
    training_data_docids = pickle.load(open(data_training_docids_location, "r"))
    %time y = get_label_data(classifications, training_data_docids, doc_classification_map)
    
    print y
    print y.shape

    info('Training Classifier')
    clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                         #alpha is the 1/nC parameter
                                                         alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                         #n_jobs=-1 means use all cpus
                                                         shuffle=True, verbose=0, n_jobs=1,
                                                         #eta0 is the learning rate when we use constant configuration
                                                         random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                         class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
    %time clf.fit(X,y)
    
    # Training Metrics
    info('Evaluating on Training Data')
    %time yp = clf.predict(X)
    %time yp_score = clf.decision_function(X)
    print yp
    info('Calculating training metrics')
    training_metrics = get_metrics(y, yp_score, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    # Get the validation data
    info('Getting Valdiation Data')
    %time Xv = pickle.load(open(data_validation_location,'r'))
    validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
    %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Validation Metrics
    info('Evaluating on Validation Data')
    %time yvp = clf.predict(Xv)
    %time yvp_score = clf.decision_function(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp_score, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    # Dump the classifier and metrics
    data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))
#     pickle.dump(training_metrics, open(os.path.join(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)), "w"))
#     pickle.dump(validation_metrics, open(os.path.join(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)), "w"))
    
    del X, y, Xv, yv

2017-01-31 22:30:16,656 : INFO : Getting Training Data


## Training and Validation SVM QP

In [39]:
from sklearn import svm

In [66]:
classifications = valid_classes
classifications_type = "classes"

In [67]:
SVM_C = 0.001
SVM_MODEL_NAME = 'liblinear_svm_iter_C_{}'.format(SVM_C)

In [68]:
data_type = "sublinear_tf"

data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    

In [69]:
%%time

# Get the training data
info('Getting Training Data')
%time X = pickle.load(open(data_training_location, "r"))
training_data_docids = pickle.load(open(data_training_docids_location, "r"))
%time y = get_label_data(classifications, training_data_docids, doc_classification_map)

print y
print y.shape


2017-02-28 19:43:43,142 : INFO : Getting Training Data


CPU times: user 15min 41s, sys: 23min 42s, total: 39min 24s
Wall time: 39min 22s
CPU times: user 34.6 s, sys: 943 ms, total: 35.6 s
Wall time: 35.5 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(1286325, 244)
CPU times: user 16min 18s, sys: 23min 44s, total: 40min 2s
Wall time: 40min


In [None]:
clf = OneVsRestClassifier(svm.LinearSVC(C=SVM_C, verbose=1, loss="squared_hinge"))

In [None]:
%%time
clf.fit(X,y)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

  str(classes[c]))


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

OneVsRestClassifier(estimator=LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=1),
          n_jobs=1)

In [38]:
# Dump the classifier and metrics
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))

In [None]:
%%time
# Training Metrics
info('Evaluating on Training Data')
%time yp = clf.predict(X)
%time yp_score = clf.decision_function(X)
print yp
info('Calculating training metrics')
training_metrics = get_metrics(y, yp_score, yp)
print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
    training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
    training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])


2017-02-28 21:44:43,096 : INFO : Evaluating on Training Data


CPU times: user 4min 42s, sys: 414 ms, total: 4min 42s
Wall time: 4min 42s


2017-02-28 21:54:46,762 : INFO : Calculating training metrics


CPU times: user 4min 41s, sys: 39.5 s, total: 5min 21s
Wall time: 5min 20s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [None]:
%%time
# Get the validation data
info('Getting Valdiation Data')
%time Xv = pickle.load(open(data_validation_location,'r'))
validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
%time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)


In [None]:
%%time
# Validation Metrics
info('Evaluating on Validation Data')
%time yvp = clf.predict(Xv)
%time yvp_score = clf.decision_function(Xv)
print yvp
validation_metrics = get_metrics(yv, yvp_score, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
    validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])


In [75]:
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
    validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])


** Validation Metrics: Cov Err: 4.620, Avg Labels: 1.240, 
		 Top 1: 0.521, Top 3: 0.771, Top 5: 0.877, 
		 F1 Micro: 0.601, F1 Macro: 0.109, Total Pos: 234,459


#### Training Metrics for 1 iter

In [26]:
%%time
# Training Metrics
info('Evaluating on Training Data')
%time yp = clf.predict(X)
%time yp_score = clf.decision_function(X)
print yp
info('Calculating training metrics')
training_metrics = get_metrics(y, yp_score, yp)
print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
    training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
    training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])


2017-02-26 18:36:30,715 : INFO : Evaluating on Training Data


CPU times: user 10.4 s, sys: 809 ms, total: 11.3 s
Wall time: 11.2 s


2017-02-26 18:36:51,800 : INFO : Calculating training metrics


CPU times: user 9.57 s, sys: 265 ms, total: 9.83 s
Wall time: 9.82 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
** Training Metrics: Cov Err: 1.798, Avg Labels: 1.150, 
		 Top 1: 0.698, Top 3: 0.910, Top 5: 0.976, 
		 F1 Micro: 0.628, F1 Macro: 0.566, Total Pos: 1,259,795
CPU times: user 1min 52s, sys: 1.7 s, total: 1min 54s
Wall time: 1min 53s


In [None]:
%%time
data_types = ["sublinear_tf"]
# data_types = ["tf"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
    data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    info('Getting Training Data')
    %time X = pickle.load(open(data_training_location, "r"))
    training_data_docids = pickle.load(open(data_training_docids_location, "r"))
    %time y = get_label_data(classifications, training_data_docids, doc_classification_map)
    
    print y
    print y.shape

    info('Training Classifier')
    clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                         #alpha is the 1/C parameter
                                                         alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                         #n_jobs=-1 means use all cpus
                                                         shuffle=True, verbose=0, n_jobs=1,
                                                         #eta0 is the learning rate when we use constant configuration
                                                         random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                         class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
    %time clf.fit(X,y)
    
    # Training Metrics
    info('Evaluating on Training Data')
    %time yp = clf.predict(X)
    %time yp_score = clf.decision_function(X)
    print yp
    info('Calculating training metrics')
    training_metrics = get_metrics(y, yp_score, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    # Get the validation data
    info('Getting Valdiation Data')
    %time Xv = pickle.load(open(data_validation_location,'r'))
    validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
    %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Validation Metrics
    info('Evaluating on Validation Data')
    %time yvp = clf.predict(Xv)
    %time yvp_score = clf.decision_function(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp_score, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    # Dump the classifier and metrics
    data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))
#     pickle.dump(training_metrics, open(os.path.join(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)), "w"))
#     pickle.dump(validation_metrics, open(os.path.join(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)), "w"))
    
    del X, y, Xv, yv

## Testing

In [10]:
classifications = sections
classifications_type = "sections"

In [11]:
data_type = "sublinear_tf"
data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
clf = pickle.load(open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "r"))

In [None]:
data_test_location = exports_location + "{}_test_sparse_data.pkl".format(data_type)
data_test_docids_location = exports_location + "{}_test_sparse_docids.pkl".format(data_type)

# Get the test data
info('Getting Test Data')
%time Xt = pickle.load(open(data_test_location, "r"))
test_data_docids = pickle.load(open(data_test_docids_location, "r"))
%time yt = get_label_data(classifications, test_data_docids, doc_classification_map)


2017-02-14 16:21:10,906 : INFO : Getting Test Data


In [14]:
# Test Metrics
info('Evaluating on Test Data')
%time ytp = clf.predict(Xt)
%time ytp_score = clf.decision_function(Xt)
print ytp
%time test_metrics = get_metrics(yt, ytp_score, ytp)
print "** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    test_metrics['coverage_error'], test_metrics['average_num_of_labels'], 
    test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'], 
    test_metrics['f1_micro'], test_metrics['f1_macro'], test_metrics['total_positive'])

# pickle.dump(test_metrics, open(os.path.join(data_folder, TEST_METRICS_FILENAME.format(classifications_type)), "w"))
    

2017-02-14 16:30:01,480 : INFO : Evaluating on Test Data


CPU times: user 3.96 s, sys: 28 ms, total: 3.99 s
Wall time: 3.99 s
CPU times: user 3.68 s, sys: 8 ms, total: 3.68 s
Wall time: 3.68 s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]]
CPU times: user 33 s, sys: 64 ms, total: 33.1 s
Wall time: 33.1 s
** Test Metrics: Cov Err: 1.571, Avg Labels: 1.150, 
		 Top 1: 0.787, Top 3: 0.949, Top 5: 0.981, 
		 F1 Micro: 0.731, F1 Macro: 0.572, Total Pos: 373,008


In [79]:
pickle.dump(test_metrics, open(os.path.join(data_folder, TEST_METRICS_FILENAME.format(classifications_type)), "w"))
 

In [15]:
conf_matrix = get_formatted_multilabel_confusion_matrix(yt, ytp, sections)
conf_matrix

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,40408.0,1496.0,613.0,3.0,1.0,102.0,1380.0,349.0,14524.0
B,789.0,23203.0,1495.0,6.0,25.0,1269.0,2335.0,1493.0,27604.0
C,489.0,625.0,30998.0,1.0,21.0,37.0,479.0,717.0,7946.0
D,37.0,302.0,164.0,154.0,0.0,4.0,48.0,27.0,1777.0
E,47.0,839.0,52.0,0.0,1761.0,84.0,121.0,67.0,6456.0
F,177.0,1100.0,82.0,0.0,41.0,10885.0,761.0,686.0,13910.0
G,1077.0,1247.0,478.0,0.0,62.0,430.0,109174.0,9721.0,21527.0
H,284.0,618.0,245.0,0.0,10.0,226.0,9295.0,88342.0,20899.0
,3237.0,1870.0,4090.0,19.0,155.0,562.0,9213.0,6880.0,0.0


In [16]:
conf_matrix.sum(axis=1)

A        58876.0
B        58219.0
C        41313.0
D         2513.0
E         9427.0
F        27642.0
G       143716.0
H       119919.0
None     26026.0
dtype: float64

In [23]:
(conf_matrix.div(conf_matrix.sum(axis=1), axis=0).round(4) * 100)

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,68.63,2.54,1.04,0.01,0.0,0.17,2.34,0.59,24.67
B,1.36,39.85,2.57,0.01,0.04,2.18,4.01,2.56,47.41
C,1.18,1.51,75.03,0.0,0.05,0.09,1.16,1.74,19.23
D,1.47,12.02,6.53,6.13,0.0,0.16,1.91,1.07,70.71
E,0.5,8.9,0.55,0.0,18.68,0.89,1.28,0.71,68.48
F,0.64,3.98,0.3,0.0,0.15,39.38,2.75,2.48,50.32
G,0.75,0.87,0.33,0.0,0.04,0.3,75.97,6.76,14.98
H,0.24,0.52,0.2,0.0,0.01,0.19,7.75,73.67,17.43
,12.44,7.19,15.72,0.07,0.6,2.16,35.4,26.44,0.0


In [22]:
format_perc = lambda x: "{:.2f}%".format(x)
(conf_matrix.div(conf_matrix.sum(axis=1), axis=0).round(4) * 100).applymap(format_perc)

Unnamed: 0,A,B,C,D,E,F,G,H,None
A,68.63%,2.54%,1.04%,0.01%,0.00%,0.17%,2.34%,0.59%,24.67%
B,1.36%,39.85%,2.57%,0.01%,0.04%,2.18%,4.01%,2.56%,47.41%
C,1.18%,1.51%,75.03%,0.00%,0.05%,0.09%,1.16%,1.74%,19.23%
D,1.47%,12.02%,6.53%,6.13%,0.00%,0.16%,1.91%,1.07%,70.71%
E,0.50%,8.90%,0.55%,0.00%,18.68%,0.89%,1.28%,0.71%,68.48%
F,0.64%,3.98%,0.30%,0.00%,0.15%,39.38%,2.75%,2.48%,50.32%
G,0.75%,0.87%,0.33%,0.00%,0.04%,0.30%,75.97%,6.76%,14.98%
H,0.24%,0.52%,0.20%,0.00%,0.01%,0.19%,7.75%,73.67%,17.43%
,12.44%,7.19%,15.72%,0.07%,0.60%,2.16%,35.40%,26.44%,0.00%


## Testing for QP SVM

In [33]:
data_type = "sublinear_tf"

In [34]:
classifications = sections
classifications_type = "sections"

In [76]:
%%time
data_test_location = exports_location + "{}_test_sparse_data.pkl".format(data_type)
data_test_docids_location = exports_location + "{}_test_sparse_docids.pkl".format(data_type)

# Get the test data
info('Getting Test Data')
%time Xt = pickle.load(open(data_test_location, "r"))
test_data_docids = pickle.load(open(data_test_docids_location, "r"))
%time yt = get_label_data(classifications, test_data_docids, doc_classification_map)


2017-02-28 23:07:57,015 : INFO : Getting Test Data


CPU times: user 4min 52s, sys: 2min 46s, total: 7min 38s
Wall time: 7min 38s
CPU times: user 9.35 s, sys: 66 ms, total: 9.41 s
Wall time: 9.39 s
CPU times: user 5min 2s, sys: 2min 46s, total: 7min 48s
Wall time: 7min 48s


In [77]:
%%time
# Test Metrics
info('Evaluating on Test Data')
%time ytp = clf.predict(Xt)
%time ytp_score = clf.decision_function(Xt)
print ytp
%time test_metrics = get_metrics(yt, ytp_score, ytp)
print "** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    test_metrics['coverage_error'], test_metrics['average_num_of_labels'], 
    test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'], 
    test_metrics['f1_micro'], test_metrics['f1_macro'], test_metrics['total_positive'])

# pickle.dump(test_metrics, open(os.path.join(data_folder, TEST_METRICS_FILENAME.format(classifications_type)), "w"))

2017-02-28 23:15:45,347 : INFO : Evaluating on Test Data


CPU times: user 1min 41s, sys: 1.17 s, total: 1min 42s
Wall time: 1min 42s
CPU times: user 1min 36s, sys: 4.3 s, total: 1min 41s
Wall time: 1min 41s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
CPU times: user 1min 36s, sys: 33.3 s, total: 2min 9s
Wall time: 2min 9s
** Test Metrics: Cov Err: 4.625, Avg Labels: 1.240, 
		 Top 1: 0.522, Top 3: 0.771, Top 5: 0.878, 
		 F1 Micro: 0.601, F1 Macro: 0.108, Total Pos: 293,696
CPU times: user 4min 54s, sys: 38.7 s, total: 5min 33s
Wall time: 5min 32s


In [39]:
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)

# Get the validation data
info('Getting Valdiation Data')
%time Xv = pickle.load(open(data_validation_location,'r'))
validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
%time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)

# Validation Metrics
info('Evaluating on Validation Data')
%time yvp = clf.predict(Xv)
%time yvp_score = clf.decision_function(Xv)
print yvp
%time validation_metrics = get_metrics(yv, yvp_score, yvp)
print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
    validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
    validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
    validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])


2017-02-01 16:15:34,615 : INFO : Getting Valdiation Data


CPU times: user 2min 47s, sys: 2.84 s, total: 2min 49s
Wall time: 2min 49s


2017-02-01 16:18:47,970 : INFO : Evaluating on Validation Data


CPU times: user 22.1 s, sys: 804 ms, total: 22.9 s
Wall time: 22.8 s
CPU times: user 1min 29s, sys: 152 ms, total: 1min 29s
Wall time: 1min 29s
CPU times: user 1min 28s, sys: 412 ms, total: 1min 28s
Wall time: 1min 28s
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
CPU times: user 2min 53s, sys: 3.35 s, total: 2min 56s
Wall time: 2min 56s
** Validation Metrics: Cov Err: 123.907, Avg Labels: 1.240, 
		 Top 1: 0.001, Top 3: 0.003, Top 5: 0.006, 
		 F1 Micro: 0.000, F1 Macro: 0.000, Total Pos: 214,248
