## Imports

In [26]:
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz

from sklearn.datasets import load_boston, load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

import seaborn as sns

## Read in data

In [3]:
data = load_wine()
print("Feature names:", data.feature_names)
print("Target names:", data.target_names)
print("Shape:", data.data.shape)

Feature names: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Target names: ['class_0' 'class_1' 'class_2']
Shape: (178, 13)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.20)

## Logistic Regression

In [28]:
dt = DecisionTreeClassifier()
parameters = {'criterion': ('gini', 'entropy'), 'min_samples_leaf':[1, 3, 5]}
clf = GridSearchCV(dt, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                            params  mean_fit_time  \
1     {'criterion': 'gini', 'min_samples_leaf': 3}       0.000704   
2     {'criterion': 'gini', 'min_samples_leaf': 5}       0.001102   
0     {'criterion': 'gini', 'min_samples_leaf': 1}       0.000964   
3  {'criterion': 'entropy', 'min_samples_leaf': 1}       0.001164   
4  {'criterion': 'entropy', 'min_samples_leaf': 3}       0.000997   
5  {'criterion': 'entropy', 'min_samples_leaf': 5}       0.001414   

   mean_score_time  mean_test_score  
1         0.000212         0.936620  
2         0.000210         0.936620  
0         0.000467         0.922535  
3         0.000248         0.908451  
4         0.000543         0.908451  
5         0.000435         0.901408  


## Decision Trees

In [29]:
lr = LogisticRegression()
parameters = {'fit_intercept': (True, False), 'penalty':('l1', 'l2')}
clf = GridSearchCV(lr, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                      params  mean_fit_time  mean_score_time  \
0   {'fit_intercept': True, 'penalty': 'l1'}       0.008740         0.000365   
2  {'fit_intercept': False, 'penalty': 'l1'}       0.008785         0.000200   
1   {'fit_intercept': True, 'penalty': 'l2'}       0.001618         0.000567   
3  {'fit_intercept': False, 'penalty': 'l2'}       0.002788         0.000200   

   mean_test_score  
0         0.943662  
2         0.943662  
1         0.936620  
3         0.936620  


## kNN

In [30]:
nbrs = KNeighborsClassifier()
parameters = {'n_neighbors':[1, 3, 5, 10], 'weights': ('uniform', 'distance')}
clf = GridSearchCV(nbrs, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))

                                       params  mean_fit_time  mean_score_time  \
0    {'n_neighbors': 1, 'weights': 'uniform'}       0.000992         0.000962   
1   {'n_neighbors': 1, 'weights': 'distance'}       0.000863         0.000928   
3   {'n_neighbors': 3, 'weights': 'distance'}       0.000613         0.000691   
5   {'n_neighbors': 5, 'weights': 'distance'}       0.000405         0.000545   
7  {'n_neighbors': 10, 'weights': 'distance'}       0.000618         0.000985   
4    {'n_neighbors': 5, 'weights': 'uniform'}       0.000321         0.001191   
6   {'n_neighbors': 10, 'weights': 'uniform'}       0.000426         0.001188   
2    {'n_neighbors': 3, 'weights': 'uniform'}       0.000586         0.000825   

   mean_test_score  
0         0.760563  
1         0.760563  
3         0.753521  
5         0.753521  
7         0.746479  
4         0.697183  
6         0.697183  
2         0.669014  


## Neural Nets

In [31]:
mlp = MLPClassifier(solver='sgd', learning_rate='constant')
parameters = {'hidden_layer_sizes':[(50), (100), (150), (100, 50)], 'learning_rate_init': [0.0005, 0.001, 0.005, 0.01]}
clf = GridSearchCV(mlp, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))



                                               params  mean_fit_time  \
8   {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.052262   
9   {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.013589   
6   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.034115   
14  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.012175   
5   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.032898   
0   {'hidden_layer_sizes': 50, 'learning_rate_init...       0.008187   
4   {'hidden_layer_sizes': 100, 'learning_rate_ini...       0.006209   
2   {'hidden_layer_sizes': 50, 'learning_rate_init...       0.018190   
3   {'hidden_layer_sizes': 50, 'learning_rate_init...       0.005799   
10  {'hidden_layer_sizes': 150, 'learning_rate_ini...       0.010582   
12  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.008174   
15  {'hidden_layer_sizes': (100, 50), 'learning_ra...       0.009559   
1   {'hidden_layer_sizes': 50, 'learning_rate_init...       0.03

In [32]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'base_estimator__criterion':['gini', 'entropy'], 
              'base_estimator__splitter': ['best', 'random'],
              'base_estimator__min_samples_leaf': [1, 3, 5, 10, 20],
              'n_estimators': [5, 10, 20, 50]}
clf = GridSearchCV(ada, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_fit_time', 'mean_score_time', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))

                                               params  mean_fit_time  \
15  {'base_estimator__criterion': 'gini', 'base_es...       0.131209   
71  {'base_estimator__criterion': 'entropy', 'base...       0.098376   
54  {'base_estimator__criterion': 'entropy', 'base...       0.043065   
63  {'base_estimator__criterion': 'entropy', 'base...       0.115075   
79  {'base_estimator__criterion': 'entropy', 'base...       0.098432   
75  {'base_estimator__criterion': 'entropy', 'base...       0.123875   
31  {'base_estimator__criterion': 'gini', 'base_es...       0.100545   
35  {'base_estimator__criterion': 'gini', 'base_es...       0.115530   
23  {'base_estimator__criterion': 'gini', 'base_es...       0.104104   
27  {'base_estimator__criterion': 'gini', 'base_es...       0.122681   

    mean_score_time  mean_test_score  
15         0.007412         0.992958  
71         0.006735         0.992958  
54         0.002815         0.985915  
63         0.007850         0.985915  
79         0

In [35]:
gbm = GradientBoostingClassifier()
parameters = {'min_samples_leaf': [1, 3, 5, 10, 20],
              'n_estimators': [5, 10, 20, 50, 100]}
clf = GridSearchCV(gbm, parameters, cv=5, return_train_score=True)
clf.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['params', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False).head(10))

                                           params  mean_test_score
14   {'min_samples_leaf': 5, 'n_estimators': 100}         0.978873
0      {'min_samples_leaf': 1, 'n_estimators': 5}         0.957746
7     {'min_samples_leaf': 3, 'n_estimators': 20}         0.957746
23   {'min_samples_leaf': 20, 'n_estimators': 50}         0.957746
19  {'min_samples_leaf': 10, 'n_estimators': 100}         0.957746
9    {'min_samples_leaf': 3, 'n_estimators': 100}         0.957746
8     {'min_samples_leaf': 3, 'n_estimators': 50}         0.957746
12    {'min_samples_leaf': 5, 'n_estimators': 20}         0.957746
24  {'min_samples_leaf': 20, 'n_estimators': 100}         0.957746
3     {'min_samples_leaf': 1, 'n_estimators': 50}         0.957746
4    {'min_samples_leaf': 1, 'n_estimators': 100}         0.957746
2     {'min_samples_leaf': 1, 'n_estimators': 20}         0.950704
1     {'min_samples_leaf': 1, 'n_estimators': 10}         0.950704
18   {'min_samples_leaf': 10, 'n_estimators': 50}         0.95