# Classification

## Imports

In [44]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz

from sklearn.datasets import load_boston, load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

import seaborn as sns

## Read in Data

In [45]:
data_wine = load_wine()
print("Feature names:", data_wine.feature_names)
print("Target names:", data_wine.target_names)
print("Shape:", data_wine.data.shape)

X_train, X_test, y_train, y_test = train_test_split(data_wine.data, data_wine.target, test_size=0.20)

Feature names: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Target names: ['class_0' 'class_1' 'class_2']
Shape: (178, 13)


In [43]:
data_iris = load_iris()
print("Feature names:", data_iris.feature_names)
print("Target names:", data_iris.target_names)
print("Shape:", data_iris.data.shape)

X_train, X_test, y_train, y_test = train_test_split(data_iris.data, data_iris.target, test_size=0.20)

Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']
Shape: (150, 4)


## Building Models

### Logistic Regression

In [28]:
dt = DecisionTreeClassifier()
parameters = {'criterion': ('gini', 'entropy'), 'min_samples_leaf':[1, 3, 5]}
clf = GridSearchCV(dt, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                            params  mean_fit_time  \
1     {'criterion': 'gini', 'min_samples_leaf': 3}       0.000704   
2     {'criterion': 'gini', 'min_samples_leaf': 5}       0.001102   
0     {'criterion': 'gini', 'min_samples_leaf': 1}       0.000964   
3  {'criterion': 'entropy', 'min_samples_leaf': 1}       0.001164   
4  {'criterion': 'entropy', 'min_samples_leaf': 3}       0.000997   
5  {'criterion': 'entropy', 'min_samples_leaf': 5}       0.001414   

   mean_score_time  mean_test_score  
1         0.000212         0.936620  
2         0.000210         0.936620  
0         0.000467         0.922535  
3         0.000248         0.908451  
4         0.000543         0.908451  
5         0.000435         0.901408  


### Decision Trees

In [29]:
lr = LogisticRegression()
parameters = {'fit_intercept': (True, False), 'penalty':('l1', 'l2')}
clf = GridSearchCV(lr, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                      params  mean_fit_time  mean_score_time  \
0   {'fit_intercept': True, 'penalty': 'l1'}       0.008740         0.000365   
2  {'fit_intercept': False, 'penalty': 'l1'}       0.008785         0.000200   
1   {'fit_intercept': True, 'penalty': 'l2'}       0.001618         0.000567   
3  {'fit_intercept': False, 'penalty': 'l2'}       0.002788         0.000200   

   mean_test_score  
0         0.943662  
2         0.943662  
1         0.936620  
3         0.936620  


### kNN

In [30]:
nbrs = KNeighborsClassifier()
parameters = {'n_neighbors':[1, 3, 5, 10], 'weights': ('uniform', 'distance')}
clf = GridSearchCV(nbrs, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                       params  mean_fit_time  mean_score_time  \
0    {'n_neighbors': 1, 'weights': 'uniform'}       0.000992         0.000962   
1   {'n_neighbors': 1, 'weights': 'distance'}       0.000863         0.000928   
3   {'n_neighbors': 3, 'weights': 'distance'}       0.000613         0.000691   
5   {'n_neighbors': 5, 'weights': 'distance'}       0.000405         0.000545   
7  {'n_neighbors': 10, 'weights': 'distance'}       0.000618         0.000985   
4    {'n_neighbors': 5, 'weights': 'uniform'}       0.000321         0.001191   
6   {'n_neighbors': 10, 'weights': 'uniform'}       0.000426         0.001188   
2    {'n_neighbors': 3, 'weights': 'uniform'}       0.000586         0.000825   

   mean_test_score  
0         0.760563  
1         0.760563  
3         0.753521  
5         0.753521  
7         0.746479  
4         0.697183  
6         0.697183  
2         0.669014  


### Neural Nets

In [36]:
mlp = MLPClassifier(solver='sgd', learning_rate='constant')
parameters = {'hidden_layer_sizes':[(50), (100), (150), (100, 50)], 'learning_rate_init': [0.0005, 0.001, 0.005, 0.01]}
clf = GridSearchCV(mlp, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))



                                               params  mean_fit_time  \
9   {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.013564   
8   {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.018152   
4   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.024933   
5   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.011969   
11  {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.004588   
14  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.010372   
6   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.004388   
13  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.007779   
12  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.007380   
3   {'hidden_layer_sizes': 50, 'learning_rate_init...       0.003989   

    mean_score_time  mean_test_score  
9          0.000000         0.436620  
8          0.000599         0.429577  
4          0.000200         0.422535  
5          0.000598         0.394366  
11         0

### Ensembles

#### Random Forest

In [38]:
rf = RandomForestClassifier()
parameters = {'max_depth': [1, 3, 5, 10],
              'min_samples_leaf': [1, 3, 5, 10, 20],
              'n_estimators': [5, 10, 20, 50]}
clf = GridSearchCV(rf, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))

                                               params  mean_fit_time  \
50  {'max_depth': 5, 'min_samples_leaf': 5, 'n_est...       0.020546   
61  {'max_depth': 10, 'min_samples_leaf': 1, 'n_es...       0.010769   
31  {'max_depth': 3, 'min_samples_leaf': 5, 'n_est...       0.051661   
22  {'max_depth': 3, 'min_samples_leaf': 1, 'n_est...       0.022740   
26  {'max_depth': 3, 'min_samples_leaf': 3, 'n_est...       0.021734   
63  {'max_depth': 10, 'min_samples_leaf': 1, 'n_es...       0.050888   
62  {'max_depth': 10, 'min_samples_leaf': 1, 'n_es...       0.020339   
42  {'max_depth': 5, 'min_samples_leaf': 1, 'n_est...       0.020745   
11  {'max_depth': 1, 'min_samples_leaf': 5, 'n_est...       0.048669   
15  {'max_depth': 1, 'min_samples_leaf': 10, 'n_es...       0.054457   

    mean_score_time  mean_test_score  
50         0.001595         0.971831  
61         0.000598         0.971831  
31         0.003200         0.964789  
22         0.001396         0.964789  
26         0

In [39]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'base_estimator__criterion':['gini', 'entropy'], 
              'base_estimator__min_samples_leaf': [1, 3, 5, 10, 20],
              'n_estimators': [5, 10, 20, 50]}
clf = GridSearchCV(ada, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))

                                               params  mean_fit_time  \
19  {'base_estimator__criterion': 'gini', 'base_es...       0.089950   
39  {'base_estimator__criterion': 'entropy', 'base...       0.070620   
7   {'base_estimator__criterion': 'gini', 'base_es...       0.072016   
14  {'base_estimator__criterion': 'gini', 'base_es...       0.031298   
34  {'base_estimator__criterion': 'entropy', 'base...       0.034716   
35  {'base_estimator__criterion': 'entropy', 'base...       0.081567   
11  {'base_estimator__criterion': 'gini', 'base_es...       0.071824   
30  {'base_estimator__criterion': 'entropy', 'base...       0.032721   
4   {'base_estimator__criterion': 'gini', 'base_es...       0.007579   
5   {'base_estimator__criterion': 'gini', 'base_es...       0.012168   

    mean_score_time  mean_test_score  
19         0.005993         0.978873  
39         0.003588         0.971831  
7          0.003195         0.971831  
14         0.002006         0.964789  
34         0

In [41]:
gbm = GradientBoostingClassifier()
parameters = {'min_samples_leaf': [1, 3, 5, 10, 20],
              'n_estimators': [5, 10, 20, 50, 100]}
clf = GridSearchCV(gbm, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))

                                           params  mean_fit_time  \
14   {'min_samples_leaf': 5, 'n_estimators': 100}       0.158775   
0      {'min_samples_leaf': 1, 'n_estimators': 5}       0.013176   
7     {'min_samples_leaf': 3, 'n_estimators': 20}       0.058444   
23   {'min_samples_leaf': 20, 'n_estimators': 50}       0.072406   
19  {'min_samples_leaf': 10, 'n_estimators': 100}       0.164162   
9    {'min_samples_leaf': 3, 'n_estimators': 100}       0.185703   
8     {'min_samples_leaf': 3, 'n_estimators': 50}       0.130053   
12    {'min_samples_leaf': 5, 'n_estimators': 20}       0.042885   
24  {'min_samples_leaf': 20, 'n_estimators': 100}       0.135820   
2     {'min_samples_leaf': 1, 'n_estimators': 20}       0.033710   

    mean_score_time  mean_test_score  
14         0.000399         0.971831  
0          0.000798         0.957746  
7          0.000599         0.957746  
23         0.000598         0.957746  
19         0.000997         0.957746  
9          0.0007