In [1]:
import numpy as np
import pandas as pd

from scipy.stats import uniform
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV,cross_val_score,RandomizedSearchCV
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

12.1 : Selecting the Best Models Using Exhaustive Search

In [3]:
# Load the iris data: feature matrix and class labels.
iris = load_iris()
features = iris.data
target = iris.target

# The liblinear solver accepts both the 'l1' and 'l2' penalties searched below.
logistic = LogisticRegression(max_iter=500, solver='liblinear')

# Candidate regularization penalties.
penalty = ['l1', 'l2']

# Ten candidate values for C, log-spaced from 10**0 to 10**4.
C = np.logspace(0, 4, 10)

# Search grid: every (C, penalty) combination is evaluated.
hyperparameters = dict(C=C, penalty=penalty)

# Exhaustive search over the grid, scored with 5-fold cross-validation.
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# fit() returns the fitted search object, so best_model is the search itself.
best_model = gridsearch.fit(features, target)
print(best_model.best_estimator_)

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


Discussion

In [4]:
# The ten C candidates: log-spaced from 10**0 to 10**4.
np.logspace(start=0, stop=4, num=10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [5]:
# Read the winning estimator's parameters once, then report the two tuned ones.
grid_best_params = best_model.best_estimator_.get_params()
print("Best Penalty:", grid_best_params['penalty'])
print('Best C:', grid_best_params['C'])

Best Penalty: l1
Best C: 7.742636826811269


In [6]:
# Predict class labels with the fitted search object (here on the same data it was tuned on).
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

12.2 : Selecting the Best Models Using Randomized Search

In [7]:
# Iris features and labels.
iris = load_iris()
features, target = iris.data, iris.target

logistic = LogisticRegression(max_iter=500, solver='liblinear')

# Discrete choices for the penalty; a uniform distribution on [0, 4] for C.
penalty = ['l1', 'l2']
C = uniform(loc=0, scale=4)
hyperparameters = {'C': C, 'penalty': penalty}

# Sample 100 candidate settings, each scored with 5-fold CV; n_jobs=-1 uses all cores.
randomizedsearch = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=hyperparameters,
    random_state=1,
    n_iter=100,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

# fit() returns the fitted search object.
best_model = randomizedsearch.fit(features, target)
print(best_model.best_estimator_)

LogisticRegression(C=np.float64(1.668088018810296), max_iter=500, penalty='l1',
                   solver='liblinear')


Discussion

In [8]:
# Draw ten samples from the uniform distribution on [0, 4] used for C above.
uniform(loc=0, scale=4).rvs(size=10)

array([1.17778676, 2.6003579 , 1.82747617, 3.07350887, 1.04304574,
       3.14826742, 3.00778544, 3.21427802, 1.30794044, 3.6879455 ])

In [9]:
# Read the winning estimator's parameters once, then report the two tuned ones.
random_best_params = best_model.best_estimator_.get_params()
print("Best Penalty:", random_best_params['penalty'])
print('Best C:', random_best_params['C'])

Best Penalty: l1
Best C: 1.668088018810296


In [10]:
# Predict class labels with the fitted randomized search (on the data it was tuned on).
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

12.3 : Selecting the Best Models from Multiple Learning Algorithms

In [11]:
np.random.seed(0)

iris = load_iris()
features = iris.data
target = iris.target

# Single-step pipeline; each grid below supplies its own estimator for the 'classifier' step.
pipe = Pipeline(steps=[('classifier', RandomForestClassifier())])

# Candidate 1: logistic regression, tuned over penalty and C.
logistic_grid = {
    'classifier': [LogisticRegression(max_iter=500, solver='liblinear')],
    "classifier__penalty": ['l1', 'l2'],
    "classifier__C": np.logspace(0, 4, 10),
}
# Candidate 2: random forest, tuned over tree count and features per split.
forest_grid = {
    'classifier': [RandomForestClassifier()],
    "classifier__n_estimators": [10, 100, 1000],
    "classifier__max_features": [1, 2, 3],
}
search_space = [logistic_grid, forest_grid]

# One grid search over both algorithms and their hyperparameters, 5-fold CV.
gridsearch = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0)

best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)

Pipeline(steps=[('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=500, penalty='l1',
                                    solver='liblinear'))])


Discussion

In [12]:
# Show which estimator ended up in the pipeline's 'classifier' step.
winning_pipeline_params = best_model.best_estimator_.get_params()
print(winning_pipeline_params["classifier"])

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


In [13]:
# Predict class labels with the selected pipeline (on the data it was tuned on).
best_model.predict(X=features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

12.4 : Selecting the Best Models When Preprocessing

In [14]:
np.random.seed(0)

iris = load_iris()
features, target = iris.data, iris.target

# Combine standardized features with PCA components into one feature matrix.
preprocess = FeatureUnion(transformer_list=[("std", StandardScaler()), ("pca", PCA())])

# Preprocessing followed by the classifier, so both are tuned inside cross-validation.
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression(max_iter=1000, solver='liblinear')),
])

# Tune the number of PCA components together with the classifier's penalty and C.
search_space = [{
    "preprocess__pca__n_components": [1, 2, 3],
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(0, 4, 10),
}]

clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

best_model = clf.fit(features, target)
print(best_model.best_estimator_)

Pipeline(steps=[('preprocess',
                 FeatureUnion(transformer_list=[('std', StandardScaler()),
                                                ('pca', PCA(n_components=1))])),
                ('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=1000, penalty='l1',
                                    solver='liblinear'))])


Discussion

In [15]:
# Number of principal components selected by the search.
selected_params = best_model.best_estimator_.get_params()
selected_params['preprocess__pca__n_components']

1

12.5 : Speeding Up Model Selection with Parallelization

In [16]:
iris = load_iris()
# Bind the feature matrix for this recipe so the cell does not depend on
# `features` left over from an earlier cell.
features = iris.data
target = iris.target

logistic = LogisticRegression(max_iter=500, solver='liblinear')

# 1000 C values x 2 penalties = 2000 candidates: a grid large enough for
# parallel execution to make a difference.
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 1000)
hyperparameters = dict(C=C, penalty=penalty)

# n_jobs=-1 runs the cross-validation fits on all available cores;
# verbose=1 reports the total number of fits.
gridsearch = GridSearchCV(logistic,
                          hyperparameters,
                          cv=5,
                          n_jobs=-1,
                          verbose=1)
best_model = gridsearch.fit(features, target)
print(best_model.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=np.float64(5.926151812475554), max_iter=500, penalty='l1',
                   solver='liblinear')


Discussion

In [17]:
# Same estimator and hyperparameter grid as the previous cell, but n_jobs=1 (a single core).
clf = GridSearchCV(estimator=logistic, param_grid=hyperparameters, cv=5, n_jobs=1, verbose=1)

best_model = clf.fit(features, target)
print(best_model.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=np.float64(5.926151812475554), max_iter=500, penalty='l1',
                   solver='liblinear')


12.6 : Speeding Up Model Selection Using Algorithm-Specific Methods

In [18]:
iris = load_iris()
features, target = iris.data, iris.target

# LogisticRegressionCV cross-validates over C itself; Cs=100 requests 100 candidate values.
logit = LogisticRegressionCV(
    Cs=100,
    max_iter=500,
    solver="liblinear",
)

logit.fit(features, target)
print(logit)

LogisticRegressionCV(Cs=100, max_iter=500, solver='liblinear')


12.7 : Evaluating Performance After Model Selection

In [20]:
iris = load_iris()
features, target = iris.data, iris.target

logistic = LogisticRegression(max_iter=500, solver='liblinear')

# Inner loop: grid search over 20 log-spaced C values with 5-fold CV.
C = np.logspace(0, 4, 20)
hyperparameters = {'C': C}
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# Outer loop: cross-validate the whole search (nested CV) and average the fold scores.
nested_scores = cross_val_score(gridsearch, features, target)
nested_scores.mean()

np.float64(0.9733333333333334)

Discussion

In [21]:
# verbose=1 makes the search report how many fits it performs.
gridsearch = GridSearchCV(estimator=logistic, param_grid=hyperparameters, cv=5, verbose=1)
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [22]:
# Each outer fold re-runs the full inner grid search, so the fit message is printed once per fold.
score = cross_val_score(estimator=gridsearch, X=features, y=target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
