In [13]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import cPickle as pickle

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model

import logging
from logging import info

from thesis.utils.metrics import *

In [14]:
# Start from a clean root logger: drop whatever handlers are already attached
# (iterating over a copy, because removeHandler mutates root.handlers).
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)

# basicConfig installs a default StreamHandler on the (now handler-less) root
# logger, so no explicit addHandler call is needed afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [15]:
# Dataset-level thresholds. Neither constant is referenced in the cells shown
# below -- presumably consumed by the preprocessing that produced the exported
# data (TODO confirm).
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

# Seed Python's built-in `random` module so anything drawn from it is repeatable.
# Note: this does not seed numpy; the classifier gets its own seed (SVM_SEED).
RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

In [16]:
# Passed as `random_state` to SGDClassifier so SVM training runs are repeatable.
SVM_SEED = 1234

In [18]:
# File-name templates; the '{}' placeholder is filled with the classification
# level (e.g. "subclasses") when results are written out.
CLASSIFIER_FILE = '{}_classifier.pkl'
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'
VALIDATION_METRICS_FILENAME = '{}_validation_metrics.pkl'

# SGD-SVM hyper-parameters.
SVM_ITERATIONS = 10             # passes over the training data (n_iter)
SVM_CONVERGENCE = 0.001         # not referenced by the cells visible in this notebook
SVM_REG = 0.01                  # regularisation strength (alpha)
SVM_CLASS_WEIGHTS = 'balanced'  # forwarded as class_weight

# Name of the output folder for this configuration; encodes the settings above.
SVM_MODEL_NAME = 'svm_iter_{}_reg_{}_classweights_{}'.format(
    SVM_ITERATIONS,
    SVM_REG,
    SVM_CLASS_WEIGHTS,
)

In [19]:
# Base directories. Each one ends with a trailing slash, so joining a file name
# onto it yields exactly the same string as plain concatenation would.
root_location = "/big/s/shalaby/"
exports_location = os.path.join(root_location, "exported_data/")
svm_location = os.path.join(root_location, "benchmarking_svm/")

# Raw documents export.
training_file = os.path.join(root_location, "docs_output.json")

# Pickled classification metadata and the train/validation/test document splits.
doc_classifications_map_file = os.path.join(exports_location, "doc_classification_map.pkl")
classification_index_file = os.path.join(exports_location, "classification_index.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
classes_file = os.path.join(exports_location, "classes.pkl")
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_output = os.path.join(exports_location, "classifications.pkl")
training_docs_list_file = os.path.join(exports_location, "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location, "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")


In [20]:
class OneHotEncoder(object):
    """Turns sets of classification labels into fixed-length 0/1 vectors.

    Each label's position in the output vector is its index in the
    ``classifications`` sequence passed to the constructor.
    """

    def __init__(self, classifications):
        """
        classifications: sequence of hashable labels; its order defines the
            positions of the output vector
        """
        self.classifications = classifications
        # label -> position in the output vector. If a label occurs more than
        # once, the last occurrence wins (plain dict assignment semantics).
        self.one_hot_indices = {clssf: i for i, clssf in enumerate(classifications)}

    def get_label_vector(self, labels):
        """
        labels: iterable with the labels assigned to one instance

        Returns a list of ints of length len(classifications), with 1 at the
        position of every given label and 0 elsewhere.
        Raises KeyError if a label was not among the constructor's classifications.
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            output_vector[self.one_hot_indices[label]] = 1

        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    """Build the 0/1 label matrix for a list of documents.

    classifications: sequence of labels to encode; its order defines the columns
    doc_ids: iterable of document ids, one output row per id (in this order)
    doc_classification_map: mapping doc_id -> iterable of labels assigned to it

    Labels of a document that are not part of `classifications` are ignored.
    Returns a numpy array built from one label vector per document.
    Raises KeyError if a doc_id is missing from `doc_classification_map`.
    """
    one_hot_encoder = OneHotEncoder(classifications)
    # Test membership against the encoder's label->index dict (constant time)
    # instead of `classifications` itself, which would be rescanned for every
    # single label when it is a list.
    known_labels = one_hot_encoder.one_hot_indices
    data_labels = []
    for doc_id in doc_ids:
        eligible_classifications = [clssf for clssf in doc_classification_map[doc_id]
                                    if clssf in known_labels]
        data_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
    return np.array(data_labels)

#### Load Classification Objects

In [21]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
classifications_index = pickle.load(open(classification_index_file))

CPU times: user 49.6 s, sys: 14.8 s, total: 1min 4s
Wall time: 7min 32s


## Determining Valid Classes and Subclasses

In [22]:
# Number of entries in valid_classes (loaded from valid_classes_file).
len(valid_classes)

244

In [23]:
# Number of entries in valid_subclasses (loaded from valid_subclasses_file).
len(valid_subclasses)

940

## Training and Validation Loop

In [None]:
# Label set used by the training/validation cells below: the valid subclasses.
classifications = valid_subclasses
# Fills the '{}' placeholder of the classifier/metrics output file names.
classifications_type = "subclasses"

In [None]:
%%time
#data_types = ["sublinear_tf", "sublinear_tf_idf", "bm25", "tf_idf", "tf"]
data_types = ["tf"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
    data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    info('Getting Training Data')
    %time X = pickle.load(open(data_training_location, "r"))
    training_data_docids = pickle.load(open(data_training_docids_location, "r"))
    %time y = get_label_data(classifications, training_data_docids, doc_classification_map)
    
    print y
    print y.shape

    info('Training Classifier')
    clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
                                                         #alpha is the 1/C parameter
                                                         alpha=SVM_REG, fit_intercept=True, n_iter=SVM_ITERATIONS,
                                                         #n_jobs=-1 means use all cpus
                                                         shuffle=True, verbose=0, n_jobs=1,
                                                         #eta0 is the learning rate when we use constant configuration
                                                         random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
                                                         class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
    %time clf.fit(X,y)
    
    # Training Metrics
    info('Evaluating on Training Data')
    %time yp = clf.predict(X)
    %time yp_score = clf.decision_function(X)
    print yp
    info('Calculating training metrics')
    training_metrics = get_metrics(y, yp_score, yp)
    print "** Training Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        training_metrics['coverage_error'], training_metrics['average_num_of_labels'], 
        training_metrics['top_1'], training_metrics['top_3'], training_metrics['top_5'], 
        training_metrics['f1_micro'], training_metrics['f1_macro'], training_metrics['total_positive'])
    
    # Get the validation data
    info('Getting Valdiation Data')
    %time Xv = pickle.load(open(data_validation_location,'r'))
    validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
    %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Validation Metrics
    info('Evaluating on Validation Data')
    %time yvp = clf.predict(Xv)
    %time yvp_score = clf.decision_function(Xv)
    print yvp
    validation_metrics = get_metrics(yv, yvp_score, yvp)
    print "** Validation Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}, Total Pos: {:,d}".format(
        validation_metrics['coverage_error'], validation_metrics['average_num_of_labels'], 
        validation_metrics['top_1'], validation_metrics['top_3'], validation_metrics['top_5'], 
        validation_metrics['f1_micro'], validation_metrics['f1_macro'], validation_metrics['total_positive'])
    
    # Dump the classifier and metrics
    data_folder = os.path.join(svm_location, SVM_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(clf, open(os.path.join(data_folder, CLASSIFIER_FILE.format(classifications_type)), "w"))
    pickle.dump(training_metrics, open(os.path.join(data_folder, TRAINING_METRICS_FILENAME.format(classifications_type)), "w"))
    pickle.dump(validation_metrics, open(os.path.join(data_folder, VALIDATION_METRICS_FILENAME.format(classifications_type)), "w"))
    
    del X, y, Xv, yv

2017-01-30 14:34:04,390 : INFO : Getting Training Data


In [37]:
data_training_location = exports_location + "{}_training_sparse_data.pkl".format("bm25")
%time X = pickle.load(open(data_training_location, "r"))

CPU times: user 12min 26s, sys: 43.3 s, total: 13min 10s
Wall time: 13min 40s


In [39]:
data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format("bm25")
%time training_data_docids = pickle.load(open(data_training_docids_location, "r"))
%time y = get_label_data(classifications, training_data_docids, doc_classification_map)

CPU times: user 1min 37s, sys: 10.2 s, total: 1min 48s
Wall time: 1min 47s


In [47]:
# Shape of the first 100 rows of the training matrix loaded above.
X[:100].shape

(100, 10000)

In [None]:
%%time
clf = OneVsRestClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l2', 
             #alpha is the 1/C parameter
             alpha=SVM_REG, fit_intercept=True, n_iter=20,
             #n_jobs=-1 means use all cpus
             shuffle=True, verbose=1, n_jobs=1,
             #eta0 is the learning rate when we use constant configuration
             random_state=SVM_SEED, learning_rate='optimal', eta0=0.0, 
             class_weight=SVM_CLASS_WEIGHTS, warm_start=False), n_jobs=1)
clf.fit(X,y)