In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [5]:
models = [LogisticRegression(solver='newton-cg', max_iter=1000), DecisionTreeClassifier(), \
          RandomForestClassifier(n_estimators= 20), SVC(kernel='poly'), GaussianNB()]
skfold = StratifiedKFold(n_splits=3)
for model in models:
    for train_index, test_index in skfold.split(iris.data, iris.target):
        X_train, X_test, y_train, y_test = iris.data[train_index], iris.data[test_index], \
                                           iris.target[train_index], iris.target[test_index]
        model.fit(X_train, y_train)
        print('Model: {}, Score: {}'.format(model, model.score(X_test, y_test)))

Model: LogisticRegression(max_iter=1000, solver='newton-cg'), Score: 0.98
Model: LogisticRegression(max_iter=1000, solver='newton-cg'), Score: 0.96
Model: LogisticRegression(max_iter=1000, solver='newton-cg'), Score: 0.98
Model: DecisionTreeClassifier(), Score: 0.98
Model: DecisionTreeClassifier(), Score: 0.92
Model: DecisionTreeClassifier(), Score: 1.0
Model: RandomForestClassifier(n_estimators=20), Score: 0.98
Model: RandomForestClassifier(n_estimators=20), Score: 0.94
Model: RandomForestClassifier(n_estimators=20), Score: 0.94
Model: SVC(kernel='poly'), Score: 0.98
Model: SVC(kernel='poly'), Score: 0.94
Model: SVC(kernel='poly'), Score: 0.98
Model: GaussianNB(), Score: 0.92
Model: GaussianNB(), Score: 0.94
Model: GaussianNB(), Score: 0.96


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

In [7]:
model = GridSearchCV(SVC(),{
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'C':[1, 2,3, 4, 5, 10, 20, 50],
    'gamma':['scale','auto']
}, cv = 5, )
model.fit(iris.data, iris.target)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 2, 3, 4, 5, 10, 20, 50],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [8]:
print(model.best_params_)
print(model.best_score_)

{'C': 4, 'gamma': 'auto', 'kernel': 'rbf'}
0.9866666666666667


In [9]:
model = RandomizedSearchCV(SVC(),{
       'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
       'C':[1, 2,3, 4, 5, 10, 20, 50],
       'gamma':['scale','auto']     
}, cv = 5, n_iter = 20)
model.fit(iris.data, iris.target)

RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=20,
                   param_distributions={'C': [1, 2, 3, 4, 5, 10, 20, 50],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']})

In [10]:
print(model.best_params_)
print(model.best_score_)

{'kernel': 'rbf', 'gamma': 'scale', 'C': 5}
0.9866666666666667


#### Exercise

In [11]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [12]:
from sklearn.datasets import load_digits
digits = load_digits()

In [13]:
models = {
        'svc':{'model':SVC(), 
           'params':{
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
        'C':[1,  5, 10, 50],
        'gamma': ['scale', 'auto']
        }
    },
        'logit':{'model':LogisticRegression(max_iter=1000), 
           'params':{
        'solver':['liblinear', 'newton-cg'],
        'C':[1, 5, 10, 50],
        }
    },
    
        'tree':{'model':DecisionTreeClassifier(), 
           'params':{
        'criterion' : ["gini", "entropy"],
        }
    },
        'forest':{'model':RandomForestClassifier(), 
           'params':{
        'criterion' : ["gini", "entropy"],
        'n_estimators' :[10, 50, 100, 200],
        }
    },
    
        'multinb':{'model':MultinomialNB(), 
           'params':{
        'alpha':[1, 5, 10, 20, 50]
        }
    },
        'gaussnb':{'model':GaussianNB(), 
           'params':{
               
        }
    },
        'bernnb':{'model':BernoulliNB(), 
           'params':{
        'alpha':[1, 5, 10, 20, 50]
        }
    },    
}

In [14]:
scores = []
for model_name, model_param in models.items():
    clf = GridSearchCV(model_param['model'], model_param['params'], cv = 5)
    clf.fit(digits.data, digits.target)
    scores.append({
     'model':model_name,
     'best score': clf.best_score_,
     'best params': clf.best_params_
    })

In [15]:
pd.DataFrame(scores)

Unnamed: 0,model,best score,best params
0,svc,0.97385,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}"
1,logit,0.922114,"{'C': 1, 'solver': 'liblinear'}"
2,tree,0.8086,{'criterion': 'entropy'}
3,forest,0.941035,"{'criterion': 'gini', 'n_estimators': 100}"
4,multinb,0.877027,{'alpha': 50}
5,gaussnb,0.806928,{}
6,bernnb,0.824174,{'alpha': 1}
