In [1]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.


In [2]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from lesson_helper_functions import *
import glob
import time

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

cars = glob.glob('vehicles/*/*.png')
notcars = glob.glob('non-vehicles/*/*.png')

color_space = 'YCrCb' # Can be RGB, HSV, LUV, HLS, YUV, YCrCb
orient = 9  # HOG orientations
pix_per_cell = 8 # HOG pixels per cell
cell_per_block = 2 # HOG cells per block
hog_channel = "ALL" # Can be 0, 1, 2, or "ALL"
#spatial_size = (16, 16) # Spatial binning dimensions
#hist_bins = 16    # Number of histogram bins
spatial_size = (32, 32)
hist_bins = 32
spatial_feat = True # Spatial features on or off
hist_feat = True # Histogram features on or off
hog_feat = True # HOG features on or off
y_start_stop = [440, None] # Min and max in y to search in slide_window()



car_features = extract_features(cars, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)
notcar_features = extract_features(notcars, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)  

# Create an array stack of feature vectors
X = np.vstack((car_features, notcar_features)).astype(np.float64)                        
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))


X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# iterate over classifiers
#for name, clf in zip(names, classifiers):
#    t=time.time()    
#    clf.fit(X_train, y_train)
#    t2 = time.time()
#    print(round(t2-t, 2), 'Seconds to train clf...')
#    score = clf.score(X_test, y_test)
#    t3 = time.time()
#    print(round(t3-t2, 2), 'Seconds to score clf...')
#    print("clf: %s score: %.4f" % (clf, score))

Automatically created module for IPython interactive environment


In [None]:
t=time.time()  
clf = KNeighborsClassifier(3)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

In [None]:
t=time.time()  
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

In [None]:
t=time.time()  
clf = SVC(gamma=2, C=1)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

In [None]:
t=time.time()  
clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

In [None]:
t=time.time()  
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

72.01 Seconds to train clf...
0.29 Seconds to score clf...
clf: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') score: 0.9352


In [3]:
t=time.time()  
clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

0.51 Seconds to train clf...
0.39 Seconds to score clf...
clf: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=1, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False) score: 0.8615


In [4]:
t=time.time()  
clf = MLPClassifier(alpha=1)
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

54.16 Seconds to train clf...
0.48 Seconds to score clf...
clf: MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) score: 0.9900


In [5]:
t=time.time()  
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

793.96 Seconds to train clf...
2.78 Seconds to score clf...
clf: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) score: 0.9809


In [6]:
t=time.time()  
clf = GaussianNB()
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))

2.06 Seconds to train clf...
2.01 Seconds to score clf...
clf: GaussianNB(priors=None) score: 0.9585


In [None]:
t=time.time()  
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train clf...')
score = clf.score(X_test, y_test)
t3 = time.time()
print(round(t3-t2, 2), 'Seconds to score clf...')
print("clf: %s score: %.4f" % (clf, score))