In [1]:
import time
import pandas as pd
import numpy as np
import cPickle as pickle

# Suppress convergence warning
import warnings
warnings.simplefilter("ignore")

# Machine Learning
import sklearn
import sklearn.ensemble
import sklearn.svm
import sklearn.preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

In [2]:
# Plot
import matplotlib.pyplot as plt
from pylab import rcParams
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format='retina'
# Default figure size as (width, height); set after the backend magic above.
rcParams['figure.figsize'] = 8, 5.5

def plotGridResults2D(x, y, x_label, y_label, grid_scores, title='Validation accuracy'):
    """Plot the scores of a 2D grid search as a heat map.

    Parameters
    ----------
    x, y : sequence
        Values searched for the two hyper-parameters. ``x`` is laid out along
        the rows (vertical axis) and ``y`` along the columns (horizontal axis).
    x_label, y_label : str
        Axis labels for ``x`` and ``y`` respectively.
    grid_scores : sequence
        One entry per grid point; element ``[1]`` of each entry is taken as the
        score. Entries must be ordered with ``y`` varying fastest so that they
        reshape to ``(len(x), len(y))``.
    title : str, optional
        Figure title. Defaults to the historical ``'Validation accuracy'``;
        pass something else when the search used a different metric.

    Raises
    ------
    ValueError
        If the number of entries in ``grid_scores`` is not
        ``len(x) * len(y)`` (raised by the reshape).
    """
    scores = np.array([s[1] for s in grid_scores]).reshape(len(x), len(y))

    plt.figure()
    # Use a real boolean: the string 'off' is non-empty and therefore truthy,
    # so matplotlib versions that no longer translate 'on'/'off' strings would
    # switch the grid ON instead of off.
    plt.grid(False)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.RdYlGn)
    # Rows of the image follow x and columns follow y, hence the swapped labels.
    plt.xlabel(y_label)
    plt.ylabel(x_label)
    plt.colorbar()
    plt.xticks(np.arange(len(y)), y, rotation=45)
    plt.yticks(np.arange(len(x)), x)
    plt.title(title)


def plotRoC(fpr, tpr):
    """Draw a ROC curve on a new figure.

    ``fpr`` and ``tpr`` are the false/true positive rates of the curve. The
    area under the curve (via ``auc``) is shown in the legend and a dashed
    black diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    plt.figure()

    roc_area = auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.2f)' % roc_area
    plt.plot(fpr, tpr, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')

    plt.title('Receiver operating characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # Slight headroom above 1.0 on the y axis so the top of the curve is visible.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.005])

    plt.legend(loc="lower right")

In [3]:
# import training dataset
start = time.time()
# NOTE: unpickling can execute arbitrary code; only load pickle files from a
# trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('trainingFinal.p', 'rb') as training_file:
    training = pickle.load(training_file)
# Features are every column after the first; the target is the 'label' column
# (presumably 'label' is column 0 -- TODO confirm against the pickled frame).
X_train = training.loc[:, training.columns[1:]]
y_train = training.loc[:, 'label']
end = time.time()
# Elapsed load time in seconds (displayed as the cell output).
end - start

23.800000190734863

In [4]:
# import testing dataset
start = time.time()
# NOTE: unpickling can execute arbitrary code; only load pickle files from a
# trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('testingFinal.p', 'rb') as testing_file:
    testing = pickle.load(testing_file)
# NOTE(review): every test row is given label 1, so y_test holds a single
# class. Metrics that need both classes (e.g. the ROC curve computed later in
# this notebook) are not meaningful on it -- confirm this is intended.
testing.loc[:, 'label'] = 1
# Select the same feature columns, in the same order, as the training frame.
X_test = testing.loc[:, training.columns[1:]]
y_test = testing.loc[:, 'label']
end = time.time()
# Elapsed load time in seconds (displayed as the cell output).
end - start

0.6729998588562012

In [None]:
# SVM CV training
start = time.time()

# Coarse grid: 5 log-spaced candidates for each of C and gamma.
C_range = np.logspace(-2, 9, 5)
gamma_range = np.logspace(-9, 2, 5)

coarse_estimator = sklearn.svm.SVC(C=1.0, kernel='rbf', class_weight='balanced',
                                   verbose=False, max_iter=250)
coarse_param_grid = {'C': C_range, 'gamma': gamma_range}
gridCoarse = GridSearchCV(coarse_estimator, coarse_param_grid,
                          scoring='roc_auc', cv=10, n_jobs=-1)
gridCoarse.fit(X_train, y_train)

# Rounded base-10 exponents of the best coarse C and gamma; these are the
# centres the fine grid below would search around.
coarse_best = gridCoarse.best_params_
C_best = np.round(np.log10(coarse_best['C']))
gamma_best = np.round(np.log10(coarse_best['gamma']))

# Fine grid
# The fine search is currently disabled by wrapping it in a string literal, so
# Cfine_range, gammafine_range and gridFine are NOT defined by this cell.
'''
Cfine_range = np.r_[np.logspace(C_best - 1, C_best + 1, 15)]
gammafine_range = np.r_[np.logspace(gamma_best - 2, gamma_best + 2, 15)]

gridFine = GridSearchCV(sklearn.svm.SVC(C=1.0, kernel='rbf', class_weight='balanced', verbose=False, max_iter=250),
                    {'C' : Cfine_range, 'gamma': gammafine_range},
                   scoring='roc_auc', cv=10, n_jobs=-1)
gridFine.fit(X_train, y_train)

svmbestClf = gridFine.best_estimator_
svmbestClf.probability = True
'''
end = time.time()
# Elapsed search time in seconds (displayed as the cell output).
end - start

In [6]:
# plot coarse grid
# NOTE(review): the recorded AttributeError suggests this cell was run before
# gridCoarse.fit had completed -- presumably grid_scores_ is only set by a
# finished fit; confirm and re-run after the training cell.
plotGridResults2D(x=C_range, y=gamma_range,
                  x_label='C', y_label='gamma',
                  grid_scores=gridCoarse.grid_scores_)

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'

In [None]:
# plot fine grid
# The fine grid search is currently disabled (it is wrapped in a string literal
# in the training cell), so Cfine_range / gammafine_range / gridFine may not
# exist. Skip the plot instead of raising NameError so the notebook still runs
# top to bottom; the plot comes back automatically once the search is enabled.
fine_grid_available = all(
    name in globals() for name in ('Cfine_range', 'gammafine_range', 'gridFine')
)
if fine_grid_available:
    plotGridResults2D(Cfine_range, gammafine_range, 'C', 'gamma', gridFine.grid_scores_)
else:
    print('Fine grid search was not run; skipping fine grid plot.')

In [None]:
# Take the best estimator found by the coarse search and set its `probability`
# flag to True. set_params returns the estimator itself, so svmbestClf is the
# same object as gridCoarse.best_estimator_.
svmbestClf = gridCoarse.best_estimator_.set_params(probability=True)

In [None]:
# Refit the selected SVM on the training set and predict test labels.
svmbestClf.fit(X_train, y_train)
y_pred = svmbestClf.predict(X_test)

# Single-argument call form prints identically under Python 2 and Python 3.
print(classification_report(y_test, y_pred))

# Predict scores
# A ROC curve needs a continuous score per sample; hard class predictions
# collapse it to a single operating point. The SVM decision values are used
# because they do not depend on the `probability` flag. np.ravel guards
# against a (n_samples, 1) result.
y_score = np.ravel(svmbestClf.decision_function(X_test))

# Plot ROC
# Thresholds are unused; a named variable avoids overwriting IPython's `_`.
sfpr, stpr, roc_thresholds = roc_curve(y_test, y_score)

plotRoC(sfpr, stpr)

In [7]:
# Elapsed seconds from the notebook-global `start` and `end`. Both names are
# reassigned by several earlier cells, so the value is only meaningful when
# they were set by the same completed cell run.
# NOTE(review): the recorded negative output suggests `end` was stale relative
# to `start` when this ran -- confirm, or drop this cell.
end-start

-0.0070002079010009766