# Hyper Parameter Tunning

1.GridSearchCV
2. RandomizedSearchCV
3. Bayesian Optimization -Automate Hyperparameter Tuning (Hyperopt)
4. Sequential Model Based Optimization(Tuning a scikit-learn estimator with skopt)
4. Optuna- Automate Hyperparameter Tuning
5. Genetic Algorithms (TPOT Classifier)

###### References
- https://github.com/fmfn/BayesianOptimization
- https://github.com/hyperopt/hyperopt
- https://www.jeremyjordan.me/hyperparameter-tuning/
- https://optuna.org/
- https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d(By Pier Paolo Ippolito )
- https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html


In [2]:
import pandas as pd
import os


In [3]:
%pwd

'F:\\neel\\fingertips\\python\\jupyternotbook'

In [4]:
os.chdir("F:\\neel\\fingertips\\machine learning\\All-Hyperparamter-Optimization-master")

In [5]:
data = pd.read_csv("diabetes.csv")

In [6]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [7]:
import numpy as np


In [8]:
#replace gluscose where glucose is o beacause gloucose can't be 0
data['Glucose'] = np.where(data['Glucose']==0,data['Glucose'].median(),data['Glucose'])
data['BloodPressure']=np.where(data['BloodPressure']==0,data['BloodPressure'].median(),data['BloodPressure'])
data['Insulin']=np.where(data['Insulin']==0,data['Insulin'].median(),data['Insulin'])


In [9]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35,30.5,33.6,0.627,50,1
1,1,85.0,66.0,29,30.5,26.6,0.351,31,0
2,8,183.0,64.0,0,30.5,23.3,0.672,32,1
3,1,89.0,66.0,23,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27,30.5,36.8,0.340,27,0
765,5,121.0,72.0,23,112.0,26.2,0.245,30,0
766,1,126.0,60.0,0,30.5,30.1,0.349,47,1


In [10]:
#independent and dependent variable
x= data.drop('Outcome',axis='columns')

In [11]:
y= data['Outcome']

In [12]:
#split data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=10)

In [13]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)

In [14]:
model.fit(x_train,y_train)

RandomForestClassifier(n_estimators=10)

In [15]:
prediction = model.predict(x_test)

In [16]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [17]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[86  9]
 [31 28]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.74      0.91      0.81        95
           1       0.76      0.47      0.58        59

    accuracy                           0.74       154
   macro avg       0.75      0.69      0.70       154
weighted avg       0.74      0.74      0.72       154



# The main parameters used by a Random Forest Classifier are:

- criterion = the function used to evaluate the quality of a split.
- max_depth = maximum number of levels allowed in each tree.
- max_features = maximum number of features considered when splitting a node.
- min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
- min_samples_split = minimum number of samples necessary in a node to cause node splitting.
- n_estimators = number of trees in the ensamble.

In [18]:
#manual hyper parameter tunning
model=RandomForestClassifier(n_estimators=100,criterion='gini',max_features='auto',min_samples_leaf=1,min_samples_split=2)

In [19]:
model.fit(x_train,y_train)
predictions=model.predict(x_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

[[85 10]
 [33 26]]
0.7207792207792207
              precision    recall  f1-score   support

           0       0.72      0.89      0.80        95
           1       0.72      0.44      0.55        59

    accuracy                           0.72       154
   macro avg       0.72      0.67      0.67       154
weighted avg       0.72      0.72      0.70       154



In [20]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [21]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200,stop=1500,num=10)]
# Number of features to consider at every split
max_features = ['auto','sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start=10,stop=1000,num=10)]
# Minimum number of samples required to split a node[]
max_samples_split = [2,5,7,10]
# Minimum number of samples required at each leaf node
max_samples_leaf = [1,2,6,4,5,9]
random_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':max_samples_split,
              'min_samples_leaf':max_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)






{'n_estimators': [200, 344, 488, 633, 777, 922, 1066, 1211, 1355, 1500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 7, 10], 'min_samples_leaf': [1, 2, 6, 4, 5, 9], 'criterion': ['entropy', 'gini']}


In [22]:
model_rf = RandomForestClassifier()
model_rf_randomcv=RandomizedSearchCV(estimator=model_rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2)
model_rf_randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=gini, max_depth=560, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=633; total time=   3.3s
[CV] END criterion=gini, max_depth=560, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=633; total time=   2.9s
[CV] END criterion=gini, max_depth=560, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=633; total time=   3.0s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=1355; total time=   4.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=1355; total time=   3.0s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=1355; total time=   2.9s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=10

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 6, 4, 5, 9],
                                        'min_samples_split': [2, 5, 7, 10],
                                        'n_estimators': [200, 344, 488, 633,
                                                         777, 922, 1066, 1211,
                                                         1355, 1500]},
                   verbose=2)

In [24]:
model_rf_randomcv.best_params_

{'n_estimators': 344,
 'min_samples_split': 5,
 'min_samples_leaf': 9,
 'max_features': 'auto',
 'max_depth': 230,
 'criterion': 'entropy'}

In [25]:
model_rf_randomcv

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 6, 4, 5, 9],
                                        'min_samples_split': [2, 5, 7, 10],
                                        'n_estimators': [200, 344, 488, 633,
                                                         777, 922, 1066, 1211,
                                                         1355, 1500]},
                   verbose=2)

In [26]:
best_random_grid=model_rf_randomcv.best_estimator_

In [27]:
from sklearn.metrics import accuracy_score
y_pred = best_random_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[89  6]
 [31 28]]
Accuracy Score 0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.74      0.94      0.83        95
           1       0.82      0.47      0.60        59

    accuracy                           0.76       154
   macro avg       0.78      0.71      0.72       154
weighted avg       0.77      0.76      0.74       154



# #### GridSearch CV


In [28]:
model_rf_randomcv

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 6, 4, 5, 9],
                                        'min_samples_split': [2, 5, 7, 10],
                                        'n_estimators': [200, 344, 488, 633,
                                                         777, 922, 1066, 1211,
                                                         1355, 1500]},
                   verbose=2)

In [29]:
from sklearn.model_selection import GridSearchCV

In [32]:
param_grid={
    'criterion':[model_rf_randomcv.best_params_['criterion']],
    'max_depth':[model_rf_randomcv.best_params_['max_depth']],
    'max_features':[model_rf_randomcv.best_params_['max_features']],
    'min_samples_leaf':[model_rf_randomcv.best_params_['min_samples_leaf'],
                        model_rf_randomcv.best_params_['min_samples_leaf']+2,
                        model_rf_randomcv.best_params_['min_samples_leaf']+4],
    'min_samples_split':[model_rf_randomcv.best_params_['min_samples_split'],
                         model_rf_randomcv.best_params_['min_samples_split']-2,
                         model_rf_randomcv.best_params_['min_samples_split']-1,
                         model_rf_randomcv.best_params_['min_samples_split']+1,
                         model_rf_randomcv.best_params_['min_samples_split']+2],
    'n_estimators':[model_rf_randomcv.best_params_['n_estimators']-200,
                    model_rf_randomcv.best_params_['n_estimators']-100,
                    model_rf_randomcv.best_params_['n_estimators'],
                    model_rf_randomcv.best_params_['n_estimators']+100,
                    model_rf_randomcv.best_params_['n_estimators']+200]
}
print(param_grid)                    
    
                        
                        



{'criterion': ['entropy'], 'max_depth': [230], 'max_features': ['auto'], 'min_samples_leaf': [9, 11, 13], 'min_samples_split': [5, 3, 4, 6, 7], 'n_estimators': [144, 244, 344, 444, 544]}


In [33]:
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)


AttributeError: 'dict' object has no attribute 'best_params_'

In [36]:
rf=RandomForestClassifier()
grid_serach = GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1,verbose=2)
grid_serach.fit(x_train,y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [230],
                         'max_features': ['auto'],
                         'min_samples_leaf': [9, 11, 13],
                         'min_samples_split': [5, 3, 4, 6, 7],
                         'n_estimators': [144, 244, 344, 444, 544]},
             verbose=2)

In [38]:
grid_serach.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=230, min_samples_leaf=11,
                       min_samples_split=5, n_estimators=144)

In [39]:
best_grid_search = grid_serach.best_estimator_

In [40]:
y_pred = best_grid_search.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[88  7]
 [30 29]]
Accuracy Score 0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.75      0.93      0.83        95
           1       0.81      0.49      0.61        59

    accuracy                           0.76       154
   macro avg       0.78      0.71      0.72       154
weighted avg       0.77      0.76      0.74       154



In [None]:
11) Create a model 
