In [None]:
# 12.1 Selecting the Best Models Using Exhaustive Search

# Load libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
import warnings

# Silence FutureWarning messages so the cell output stays readable
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the iris data and split it into feature matrix and target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Base estimator whose hyperparameters are tuned
logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=500)

# Candidate penalty types
penalty = ['l1', 'l2']

# Candidate values for C (inverse regularization strength):
# 10 points log-spaced between 10**0 and 10**4
c = np.logspace(0, 4, num=10)

# Hyperparameter grid: 10 values of C x 2 penalties = 20 candidates
hyperparameters = {'C': c, 'penalty': penalty}

# Exhaustive search over the grid with 5-fold cross-validation
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    verbose=0,
)

# Fit grid search: each of the 20 candidates is evaluated on 5 folds
# (100 cross-validation fits in total)
best_model = gridsearch.fit(features, target)

# Show the best model
print(best_model.best_estimator_)

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


In [5]:
# Inspect the candidate C values used by the grid search:
# 10 points spaced evenly on a log scale from 10**0 to 10**4
np.logspace(0, 4, 10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [7]:
# View best hyperparameters: read them once from the winning estimator
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])

Best Penalty: l1
Best C: 7.742636826811269


In [8]:
# Predict target vector with the fitted search object.
# NOTE: `features` is the same data the search was fitted on, so these are
# in-sample predictions, not an estimate of generalization performance.
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# 12.2 Selecting the Best Models Using Randomized Search

# Load libraries
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

# Load the iris data and split it into feature matrix and target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Base estimator whose hyperparameters are tuned
logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=500)

# Candidate penalty types (sampled uniformly from this list)
penalty = ['l1', 'l2']

# Distribution for C: uniform over the interval from 0 to 4
c = uniform(loc=0, scale=4)

# Hyperparameter options: a distribution for C and a list for penalty
hyperparameters = {'C': c, 'penalty': penalty}

# Randomized search: 100 random hyperparameter combinations are drawn and
# each is scored with 5-fold cross-validation, using all available cores
randomizedsearch = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=hyperparameters,
    n_iter=100,
    cv=5,
    random_state=1,
    n_jobs=-1,
    verbose=0,
)

# Fit randomized search
best_model = randomizedsearch.fit(features, target)

# Print best model
print(best_model.best_estimator_)

LogisticRegression(C=np.float64(1.668088018810296), max_iter=500, penalty='l1',
                   solver='liblinear')


In [11]:
# Define a uniform distribution between 0 and 4, sample 10 values.
# NOTE: no random_state is passed to rvs(), so the sampled values change from
# run to run unless the global NumPy random state has been seeded beforehand.
uniform(loc = 0, scale = 4).rvs(10)


array([1.43893646, 3.24635686, 3.01179634, 0.69845277, 3.86401201,
       0.40687058, 1.23771923, 3.64231474, 2.76522408, 2.73122822])

In [12]:
# View best hyperparameters: read them once from the winning estimator
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])


Best Penalty: l1
Best C: 1.668088018810296


In [13]:
# Predict target vector with the fitted randomized-search object.
# NOTE: `features` is the same data the search was fitted on, so these are
# in-sample predictions.
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# 12.3 Selecting the Best Models from Multiple Learning Algorithms

# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Set random seed
np.random.seed(0)

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Single-step pipeline. The RandomForestClassifier() given here is only an
# initial placeholder: the search space below swaps in each candidate
# estimator for the 'classifier' step.
pipe = Pipeline(steps=[('classifier', RandomForestClassifier())])

# Candidate learning algorithms, each with its own hyperparameter grid
search_space = [
    {
        # Logistic regression: 2 penalties x 10 values of C
        'classifier': [LogisticRegression(max_iter=500, solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    },
    {
        'classifier': [RandomForestClassifier()],
        # Number of trees in the forest (10, 100 or 1000)
        'classifier__n_estimators': [10, 100, 1000],
        # Number of features considered when looking for the best split
        'classifier__max_features': [1, 2, 3],
    },
]

# Grid search over both algorithms with 5-fold cross-validation
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
)

# Fit grid search
best_model = gridsearch.fit(features, target)

# Print best model
print(best_model.best_estimator_)

Pipeline(steps=[('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=500, penalty='l1',
                                    solver='liblinear'))])


In [15]:
# View best model: look up the 'classifier' step in the parameters of the
# winning pipeline to see which learning algorithm (and settings) was selected
print(best_model.best_estimator_.get_params()['classifier'])

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


In [16]:
# Predict target vector with the fitted search object.
# NOTE: `features` is the same data the search was fitted on, so these are
# in-sample predictions.
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# 12.4 Selecting the Best Models When Preprocessing

# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Set random seed
np.random.seed(0)

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Preprocessing step: the standardized features and the PCA components are
# computed side by side and combined by the FeatureUnion
preprocess = FeatureUnion(
    transformer_list=[
        ('std', StandardScaler()),
        ('pca', PCA()),
    ]
)

# Pipeline: preprocessing followed by the classifier
pipe = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('classifier', LogisticRegression(max_iter=1000, solver='liblinear')),
    ]
)

# Space of candidate values: 3 x 2 x 10 = 60 combinations
search_space = [
    {
        # Number of principal components kept by the PCA step
        'preprocess__pca__n_components': [1, 2, 3],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    }
]

# Grid search with 5-fold cross-validation on all available cores
clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# Fit grid search
best_model = clf.fit(features, target)

# Print best model
print(best_model.best_estimator_)



Pipeline(steps=[('preprocess',
                 FeatureUnion(transformer_list=[('std', StandardScaler()),
                                                ('pca', PCA(n_components=1))])),
                ('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=1000, penalty='l1',
                                    solver='liblinear'))])


In [18]:
# View best n_components: the number of principal components chosen for the
# 'pca' transformer inside the 'preprocess' step of the winning pipeline
best_model.best_estimator_.get_params()['preprocess__pca__n_components']

1

In [21]:
# 12.5 Speeding Up Model Selection with Parallelization

# Load libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Base estimator whose hyperparameters are tuned
logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=500)

# Candidate penalty types
penalty = ['l1', 'l2']

# Candidate values for C: 1000 points log-spaced between 10**0 and 10**4
c = np.logspace(0, 4, num=1000)

# Hyperparameter options: 1000 values of C x 2 penalties = 2000 candidates
hyperparameters = {'C': c, 'penalty': penalty}

# Grid search using every available core (n_jobs=-1); verbose=1 reports the
# number of fits being run
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit grid search
best_model = gridsearch.fit(features, target)

# Print best model
print(best_model.best_estimator_)


Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=np.float64(5.926151812475554), max_iter=500, penalty='l1',
                   solver='liblinear')


In [24]:
# Same grid search restricted to a single core (n_jobs=1), for comparison
# with the parallel run
clf = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=1,
    verbose=1,
)

# Fit grid search
best_model = clf.fit(features, target)

# Print best model
print(best_model.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=np.float64(5.926151812475554), max_iter=500, penalty='l1',
                   solver='liblinear')


In [29]:
# 12.6 Speeding Up Model Selection Using Algorithm-Specific Methods

# Load libraries
from sklearn import linear_model, datasets

# Load data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Cross-validated logistic regression.
# Cs=100 asks the estimator to try 100 different candidate values of C.
logit = linear_model.LogisticRegressionCV(
    Cs=100,
    solver='liblinear',
    max_iter=5000,
)

# Train model
logit.fit(features, target)

# Print model
print(logit)

LogisticRegressionCV(Cs=100, max_iter=5000, solver='liblinear')


In [None]:
# 12.7 Evaluating Performance After Model Selection

# Load libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score

# Load data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Base estimator whose hyperparameters are tuned
logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=500)

# 20 candidate values for C, log-spaced between 10**0 and 10**4
c = np.logspace(0, 4, num=20)

# Hyperparameter options
hyperparameters = {'C': c}

# Inner loop: grid search with 5-fold cross-validation on all available cores
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# Outer loop: nested cross-validation. The whole grid search is treated as the
# estimator and scored by cross_val_score; the mean of the outer-fold scores
# is displayed.
cross_val_score(
    estimator=gridsearch,
    X=features,
    y=target,
).mean()

np.float64(0.9733333333333334)

In [32]:
# Rebuild the grid search with verbose=1 so that every search run announces
# how many fits it performs
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    verbose=1,
)

# A direct fit runs the search once (one "Fitting ..." message)
best_model = gridsearch.fit(features, target)

# Nested cross-validation reruns the search on each outer training split,
# so one additional "Fitting ..." message appears per outer fold
scores = cross_val_score(estimator=gridsearch, X=features, y=target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
