In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Reading the dataset
df = pd.read_csv('diabetes.csv')

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
df.shape

(768, 9)

In [5]:
# Checking the values of various features of the dataset
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [7]:
# Checking for null values
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
# Creating dependent and independent variables
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

In [9]:
X.shape

(768, 8)

In [10]:
Y.shape

(768,)

In [11]:
# Scaling the data
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# X = scaler.fit_transform(X)

In [12]:
# Splitting the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 123)

In [13]:
# Building a basic Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 50).fit(X_train, Y_train)
prediction = rf.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, prediction))
print(classification_report(Y_test, prediction))
print(accuracy_score(Y_test, prediction))

[[83 13]
 [18 40]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        96
           1       0.75      0.69      0.72        58

    accuracy                           0.80       154
   macro avg       0.79      0.78      0.78       154
weighted avg       0.80      0.80      0.80       154

0.7987012987012987


### Manual Hyper Tuning

In [15]:
%%time
model = RandomForestClassifier(n_estimators = 500, criterion = 'gini', max_features = 'sqrt', min_samples_leaf = 10, random_state = 123).fit(X_train, Y_train)
prediction = model.predict(X_test)

Wall time: 485 ms


In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, prediction))
print(classification_report(Y_test, prediction))
print(accuracy_score(Y_test, prediction))

[[87  9]
 [24 34]]
              precision    recall  f1-score   support

           0       0.78      0.91      0.84        96
           1       0.79      0.59      0.67        58

    accuracy                           0.79       154
   macro avg       0.79      0.75      0.76       154
weighted avg       0.79      0.79      0.78       154

0.7857142857142857


#### Randomized Search CV

RandomizedSearchCV implements a “fit” and a “score” method. It also implements “score_samples”, “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings.

In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter.

If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters.

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Mentioning the number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Features to consider at every split
max_features = ['auto', 'sqrt', 'log2']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]

# Minimum number of samples required to split the node
min_samples_split = [1, 2, 3, 4, 5, 7, 9]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]

# Creating the random grid
random_grid = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'criterion': ['gini', 'entropy']
}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 2, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['gini', 'entropy']}


In [18]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose = 2, n_jobs = -1, random_state = 123)

In [19]:
%%time
# Fit the model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


 0.74596684 0.74920293 0.75735693 0.75409692 0.75738881 0.74759286
 0.75247888 0.74597481 0.75575482 0.74922685 0.75573091 0.76224295
        nan 0.75410489 0.75247091 0.75247888 0.75734896 0.75247888
 0.75086083 0.75737287 0.75410489 0.75085286 0.75575482 0.75249482
 0.74108879 0.75247091 0.75247091 0.75247091 0.7459509  0.74594293
        nan        nan 0.75573091 0.74432488 0.75084489 0.75086083
 0.75247888 0.75086083 0.75248685 0.75899091 0.7476088         nan
 0.7492109  0.76549498 0.74757692 0.75409692 0.74762474 0.76386896
 0.75250279 0.75572294 0.74594293 0.75899091 0.75086083 0.76225092
 0.74923482 0.75899091 0.75409692 0.7492109         nan 0.75247091
 0.75249482 0.75412881 0.75574685 0.76061693 0.75899091        nan
 0.75735693 0.76551092        nan 0.75086083 0.75575482 0.75249482
 0.75412084 0.75248685 0.7476088  0.75085286 0.74269887 0.75082895
 0.75899888 0.74920293 0.75737287 0.76061693        nan 0.75412084
 0.75247888 0.75733301 0.75573888 0.75898294        nan 0.7492

Wall time: 47 s


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 2, 3, 4, 5, 7,
                                                              9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   

In [20]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 6,
 'max_features': 'auto',
 'max_depth': 560,
 'criterion': 'gini'}

In [21]:
rf_random.best_estimator_

RandomForestClassifier(max_depth=560, min_samples_leaf=6, n_estimators=1000)

In [22]:
Y_pred = rf_random.best_estimator_.predict(X_test)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[84 12]
 [20 38]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84        96
           1       0.76      0.66      0.70        58

    accuracy                           0.79       154
   macro avg       0.78      0.77      0.77       154
weighted avg       0.79      0.79      0.79       154

0.7922077922077922


#### Grid Search CV

In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_random.best_params_['criterion']],
    'max_depth': [rf_random.best_params_['max_depth']],
    'max_features': [rf_random.best_params_['max_features']],
    'min_samples_leaf': [rf_random.best_params_['min_samples_leaf'],
                         rf_random.best_params_['min_samples_leaf'] + 2,
                         rf_random.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_random.best_params_['min_samples_split'] - 2,
                          rf_random.best_params_['min_samples_split'] - 1,
                          rf_random.best_params_['min_samples_split'],
                          rf_random.best_params_['min_samples_split'] + 1,
                          rf_random.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_random.best_params_['n_estimators'] - 200,
                     rf_random.best_params_['n_estimators'] - 100,
                     rf_random.best_params_['n_estimators'],
                     rf_random.best_params_['n_estimators'] + 100,
                     rf_random.best_params_['n_estimators'] + 200, 
                     rf_random.best_params_['n_estimators'] - 600,] 
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [560], 'max_features': ['auto'], 'min_samples_leaf': [6, 8, 10], 'min_samples_split': [0, 1, 2, 3, 4], 'n_estimators': [800, 900, 1000, 1100, 1200, 400]}


In [25]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2)

In [26]:
%%time
grid_search.fit(X_train, Y_train)

Fitting 10 folds for each of 90 candidates, totalling 900 fits


        nan        nan        nan        nan        nan        nan
 0.76388154 0.76549445 0.77038604 0.76710735 0.76874669 0.76708091
 0.76710735 0.76874669 0.76874669 0.76716023 0.76708091 0.76872025
 0.76057641 0.76874669 0.76877314 0.76388154 0.7622422  0.75737705
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.75737705 0.7606293  0.76388154 0.76549445 0.76221576 0.75737705
 0.76549445 0.76382866 0.76221576 0.76549445 0.76221576 0.75576415
 0.76057641 0.76708091 0.7638551  0.75737705 0.76060286 0.76544157
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.75904283 0.7622422  0.76390799 0.75576415 0.75901639 0.76229508
 0.76229508 0.7606293  0.75898995 0.75571126 0.76708091 0.76060286
 0.7622422  0.76390799 0.76065574 0.7606293  0.76065574 0.7606293 ]


Wall time: 1min 19s


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [560],
                         'max_features': ['auto'],
                         'min_samples_leaf': [6, 8, 10],
                         'min_samples_split': [0, 1, 2, 3, 4],
                         'n_estimators': [800, 900, 1000, 1100, 1200, 400]},
             verbose=2)

In [27]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=560, min_samples_leaf=6, n_estimators=1000)

In [28]:
best_grid = grid_search.best_estimator_

In [29]:
Y_pred = best_grid.predict(X_test)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[83 13]
 [21 37]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83        96
           1       0.74      0.64      0.69        58

    accuracy                           0.78       154
   macro avg       0.77      0.75      0.76       154
weighted avg       0.78      0.78      0.78       154

0.7792207792207793


### Automatic Hyper Tuning

Automated Hyperparameter Tuning can be done by using techniques such as

- Bayesian Optimization
- Gradient Descent
- Evolutionary Algorithms

HyperOpt is an open-source Python library for Bayesian optimization developed by James Bergstra. It is designed for large-scale optimization for models with hundreds of parameters and allows the optimization procedure to be scaled across multiple cores and multiple machines.

In [31]:
!pip install hyperopt



In [32]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [33]:
## hp is used to define whether we are defining interger values, floating values, or choice function
space = {
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'max_depth': hp.quniform('max_depth', 10, 1200, 10),
    'max_featuers': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750, 1200, 1300, 1500])
}

In [34]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x278dd00d190>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x278dd00df40>,
 'max_featuers': <hyperopt.pyll.base.Apply at 0x278dd00d1f0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x278dd0047c0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x278dd004eb0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x278dd004df0>}

In [35]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                  max_features = space['max_featuers'], min_samples_leaf = space['min_samples_leaf'],
                                  n_estimators = space['n_estimators'])

    accuracy = cross_val_score(model, X_train, Y_train, cv = 5).mean()
    
    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK}

In [36]:
from sklearn.model_selection import cross_val_score
trials = Trials() # it is responsible for minimizing the function
best = fmin(fn = objective,           
           space = space, 
           algo = tpe.suggest, 
           max_evals = 80, 
           trials = trials)
best

100%|███████████████████████████████████████████████| 80/80 [05:22<00:00,  4.04s/trial, best loss: -0.7720378515260562]


{'criterion': 0,
 'max_depth': 650.0,
 'max_features': 0,
 'min_samples_leaf': 0.00040842969310787933,
 'min_samples_split': 0.00834048386925039,
 'n_estimators': 4}

In [37]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0:'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0:10, 1: 50, 2:300, 4: 1200, 5:1300, 6: 1500}

print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

entropy
auto
1200


In [38]:
%%time
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'],
                                      max_features = feat[best['max_features']], min_samples_leaf = best['min_samples_leaf'],
                                      min_samples_split = best['min_samples_split'], n_estimators = est[best['n_estimators']]).fit(X_train, Y_train)

Wall time: 1.36 s


In [39]:
predictionforest = trainedforest.predict(X_test)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, predictionforest))
print(classification_report(Y_test, predictionforest))
print(accuracy_score(Y_test, predictionforest))

[[83 13]
 [18 40]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        96
           1       0.75      0.69      0.72        58

    accuracy                           0.80       154
   macro avg       0.79      0.78      0.78       154
weighted avg       0.80      0.80      0.80       154

0.7987012987012987
