In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.svm import SVC
import plotly

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('diabetes.csv')

In [4]:
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [5]:
df.columns=['Pregnancies','Glucose','Blood Pressure','Skin Thickness','Insulin','BMI','Diabetes Pedigree Function','Age','Class']

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Pregnancies                 767 non-null    int64  
 1   Glucose                     767 non-null    int64  
 2   Blood Pressure              767 non-null    int64  
 3   Skin Thickness              767 non-null    int64  
 4   Insulin                     767 non-null    int64  
 5   BMI                         767 non-null    float64
 6   Diabetes Pedigree Function  767 non-null    float64
 7   Age                         767 non-null    int64  
 8   Class                       767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Class
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.842243,120.859192,69.101695,20.517601,79.90352,31.990482,0.471674,33.219035,0.34811
std,3.370877,31.978468,19.368155,15.954059,115.283105,7.889091,0.331497,11.752296,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.371,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
df[['Glucose','Blood Pressure','Skin Thickness','Insulin','BMI']]=df[['Glucose','Blood Pressure','Skin Thickness','Insulin','BMI']].replace(0,np.NAN)

In [10]:
df.isna().sum()

Pregnancies                     0
Glucose                         5
Blood Pressure                 35
Skin Thickness                227
Insulin                       373
BMI                            11
Diabetes Pedigree Function      0
Age                             0
Class                           0
dtype: int64

In [11]:
imputed_cols=['Glucose','Blood Pressure','Skin Thickness','Insulin','BMI']
imputer=SimpleImputer(missing_values=np.NAN,strategy='median')
transformer=ColumnTransformer([('imputer',imputer,imputed_cols)],remainder='passthrough')
df_transformed=transformer.fit_transform(df)


In [12]:
df=pd.DataFrame(df_transformed)

In [13]:
df.columns=['Pregnancies','Glucose','Blood Pressure','Skin Thickness','Insulin','BMI','Diabetes Pedigree Function','Age','Class']

In [14]:
df.head()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Class
0,85.0,66.0,29.0,125.0,26.6,1.0,0.351,31.0,0.0
1,183.0,64.0,29.0,125.0,23.3,8.0,0.672,32.0,1.0
2,89.0,66.0,23.0,94.0,28.1,1.0,0.167,21.0,0.0
3,137.0,40.0,35.0,168.0,43.1,0.0,2.288,33.0,1.0
4,116.0,74.0,29.0,125.0,25.6,5.0,0.201,30.0,0.0


In [15]:
df.isna().sum()

Pregnancies                   0
Glucose                       0
Blood Pressure                0
Skin Thickness                0
Insulin                       0
BMI                           0
Diabetes Pedigree Function    0
Age                           0
Class                         0
dtype: int64

In [16]:
X=df.drop('Class',axis=1)
y=df['Class']

In [17]:
print(X.shape)
print(y.shape)

(767, 8)
(767,)


In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
rf_classifer=RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
predictions=rf_classifer.predict(X_test)

In [20]:
predictions_series=pd.Series(predictions)

In [21]:
predictions_series.value_counts()

0.0    103
1.0     51
dtype: int64

In [22]:
y.value_counts()

0.0    500
1.0    267
Name: Class, dtype: int64

In [23]:
confusion_matrix(y_test,predictions)

array([[80, 17],
       [22, 35]], dtype=int64)

In [24]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.78      0.82      0.80        97
         1.0       0.67      0.61      0.64        57

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.74      0.75      0.74       154



In [25]:
RandomForestClassifier()

RandomForestClassifier()

In [26]:
n_estimators=np.linspace(10,1000,num=100).astype(np.int64)
max_depth=np.linspace(1,100,num=100)
min_samples_split=[1,5,10,20,50]
max_features=['auto','sqrt','log2']
criterion=['gini','entropy']
min_samples_leaf=[1,5,10,15,20]
param_grid={'n_estimators':n_estimators,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split,
            'max_features':max_features,
            'criterion':criterion,
            'min_samples_leaf':min_samples_leaf
           
           }



In [27]:
rf=RandomForestClassifier()
rf_cv_tuned=RandomizedSearchCV(estimator=rf,param_distributions=param_grid,n_iter=10,cv=3,random_state=100,n_jobs=-1)
rf_cv_tuned.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55....
        120,  130,  140,  150,  160,  170,  180,  190,  200,  210,  220,
        230,  240,  250,  260,  270,  280,  290,  300,  310,  320,  330,
        340,  350,  360,  370,  380,  390,  400,  410,  420,  430,  440,
        450,  460,  470,  480,  490,  500,  510,  520,  530,  540,  550,
        560,  570,  580,  590,  600,  610,  620,  630,  640,  650,  660,
        670,  680,  690,  700,  710,  720,  730,  740,  750,  760,  7

In [28]:
rf_cv_tuned.best_params_

{'n_estimators': 990,
 'min_samples_split': 50,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 85.0,
 'criterion': 'gini'}

In [29]:
y_preds=rf_cv_tuned.predict(X_test)

In [30]:
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

         0.0       0.79      0.85      0.82        97
         1.0       0.70      0.61      0.65        57

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



In [31]:
param_grid={'max_features':[rf_cv_tuned.best_params_['max_features']],
           'criterion':[rf_cv_tuned.best_params_['criterion']],
           'min_samples_split' :[rf_cv_tuned.best_params_['min_samples_split'],
                                rf_cv_tuned.best_params_['min_samples_split']-10,
                                rf_cv_tuned.best_params_['min_samples_split']-20,
                                rf_cv_tuned.best_params_['min_samples_split']+10,
                                rf_cv_tuned.best_params_['min_samples_split']+20],
            'min_samples_leaf' :[rf_cv_tuned.best_params_['min_samples_leaf'],
                                rf_cv_tuned.best_params_['min_samples_leaf']+4,
                                rf_cv_tuned.best_params_['min_samples_leaf']+9],
            'max_depth'        :[rf_cv_tuned.best_params_['max_depth'],
                                rf_cv_tuned.best_params_['max_depth']-8],
            'n_estimators'     :[rf_cv_tuned.best_params_['n_estimators'],
                                rf_cv_tuned.best_params_['n_estimators']-50,
                                rf_cv_tuned.best_params_['n_estimators']+50,
                                rf_cv_tuned.best_params_['n_estimators']-150,
                                rf_cv_tuned.best_params_['n_estimators']-250]
         
           
           }

In [32]:
rf=RandomForestClassifier()
rf_grid=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=True)
rf_grid.fit(X_train,y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  9.3min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [85.0, 77.0],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 5, 10],
                         'min_samples_split': [50, 40, 30, 60, 70],
                         'n_estimators': [990, 940, 1040, 840, 740]},
             verbose=True)

In [33]:
best_grid=rf_grid.best_estimator_

In [34]:
best_grid.fit(X_train,y_train)

RandomForestClassifier(max_depth=77.0, min_samples_leaf=5, min_samples_split=40,
                       n_estimators=940)

In [35]:
preds_y=best_grid.predict(X_test)

In [36]:
print(classification_report(y_test,preds_y))

              precision    recall  f1-score   support

         0.0       0.80      0.85      0.82        97
         1.0       0.71      0.63      0.67        57

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154



In [37]:
space={'criterion':hp.choice('criterion',['gini','entropy']),
       'n_estimators':hp.choice('n_estimators',[10,50,100,200,500,1000]),
       'max_depth':hp.quniform('max_depth',10,1500,15),
       'max_features':hp.choice('max_features',['auto','sqrt','log',None]),
       'min_samples_split':hp.uniform('min_samples_split',0,1),
       'min_samples_leaf':hp.uniform('min_samples_leaf',0,0.8)
      
      }

In [38]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x25b3b617610>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x25b3b583160>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x25b3b58ec10>,
 'max_features': <hyperopt.pyll.base.Apply at 0x25b3b5a2af0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x25b3b626790>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x25b3b626700>}

In [39]:
def objective(space):
    model=RandomForestClassifier(criterion=space['criterion'],n_estimators=space['n_estimators'],max_depth=space['max_depth'],max_features=space['max_features'],min_samples_split=space['min_samples_split'],min_samples_leaf=space['min_samples_leaf'])
    accuracy=cross_val_score(model,X_train,y_train,cv=5).mean()
    return {'loss': -accuracy,'status':STATUS_OK}
    
    

In [40]:
trials=Trials()
best=fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=80,trials=trials,verbose=True)


100%|███████████████████████████████████████████████| 80/80 [02:10<00:00,  1.63s/trial, best loss: -0.7504198320671731]


In [23]:
from tpot import TPOTClassifier
n_estimators=np.linspace(10,1000,num=100).astype(np.int64)
max_depth=np.linspace(1,100,num=100)
min_samples_split=[1,5,10,20,50]
max_features=['auto','sqrt','log2']
criterion=['gini','entropy']
min_samples_leaf=[1,5,10,15,20]
param_grid={'n_estimators':n_estimators,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split,
            'max_features':max_features,
            'criterion':criterion,
            'min_samples_leaf':min_samples_leaf
           
           }

In [24]:
clf=TPOTClassifier(generations=5,population_size=40,offspring_size=20,scoring='accuracy',cv=5,n_jobs=-1,config_dict={'sklearn.ensemble.RandomForestClassifier':param_grid},early_stop=12,verbosity=2)
clf.fit(X_train,y_train)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=140.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.758576569372251

Generation 2 - Current best internal CV score: 0.758576569372251

Generation 3 - Current best internal CV score: 0.758576569372251

Generation 4 - Current best internal CV score: 0.758576569372251

Generation 5 - Current best internal CV score: 0.758576569372251

Best pipeline: RandomForestClassifier(RandomForestClassifier(input_matrix, criterion=gini, max_depth=2.0, max_features=auto, min_samples_leaf=15, min_samples_split=5, n_estimators=460), criterion=entropy, max_depth=86.0, max_features=auto, min_samples_leaf=1, min_samples_split=50, n_estimators=970)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['gini',
                                                                                      'entropy'],
                                                                        'max_depth': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60...
        340,  350,  360,  370,  380,  390,  400,  410,  420,  430,  440,
        450,  460,  470,  480,  490,  500,  510,  520,  530,  540,  550,
        560,  570,  580,  590,  600,  610,  620,  630,  640,  650,  660,
        670,  680,  690,  700,  710,  720,  730,  740,  750,  760,  770,
        780,  790,  800,  81

In [28]:
import optuna

In [38]:
def objective(trial):
    classifier=trial.suggest_categorical('classifier',['RandomForest','SVC'])
    if classifier=='RandomForest':
        n_estimators=trial.suggest_int('n_estimators',10,1000,100)
        max_depth=trial.suggest_int('max_depth',1,100,log=True)
        criterion=trial.suggest_categorical('criterion',['gini','entropy'])
        clf=RandomForestClassifier(n_estimators=n_estimators,criterion=criterion,max_depth=max_depth)
        
    else:
        c=trial.suggest_float('svc_c',1e-10,1e10,log=True)
        clf=SVC(C=c,gamma='auto')
    return cross_val_score(clf,X_train,y_train,cv=5,n_jobs=-1).mean()
    

In [39]:
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=80)
trial=study.best_trial

[32m[I 2020-12-17 17:54:33,573][0m A new study created in memory with name: no-name-4898ff91-e2f4-417d-9d8d-bf043825745b[0m
[32m[I 2020-12-17 17:54:37,574][0m Trial 0 finished with value: 0.6574170331867253 and parameters: {'classifier': 'SVC', 'svc_c': 2.8696318269144784e-09}. Best is trial 0 with value: 0.6574170331867253.[0m
[32m[I 2020-12-17 17:54:37,676][0m Trial 1 finished with value: 0.6574170331867253 and parameters: {'classifier': 'SVC', 'svc_c': 6907738859.1498}. Best is trial 0 with value: 0.6574170331867253.[0m
[32m[I 2020-12-17 17:54:38,537][0m Trial 2 finished with value: 0.7422364387578302 and parameters: {'classifier': 'RandomForest', 'n_estimators': 110, 'max_depth': 3, 'criterion': 'gini'}. Best is trial 2 with value: 0.7422364387578302.[0m
[32m[I 2020-12-17 17:54:38,701][0m Trial 3 finished with value: 0.6574170331867253 and parameters: {'classifier': 'SVC', 'svc_c': 10944856.037614321}. Best is trial 2 with value: 0.7422364387578302.[0m
[32m[I 2020-1

[32m[I 2020-12-17 17:55:58,551][0m Trial 34 finished with value: 0.7487538318006131 and parameters: {'classifier': 'RandomForest', 'n_estimators': 310, 'max_depth': 4, 'criterion': 'gini'}. Best is trial 17 with value: 0.7536185525789684.[0m
[32m[I 2020-12-17 17:55:58,646][0m Trial 35 finished with value: 0.6574170331867253 and parameters: {'classifier': 'SVC', 'svc_c': 0.0033781862379100845}. Best is trial 17 with value: 0.7536185525789684.[0m
[32m[I 2020-12-17 17:55:59,430][0m Trial 36 finished with value: 0.7552978808476609 and parameters: {'classifier': 'RandomForest', 'n_estimators': 110, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 36 with value: 0.7552978808476609.[0m
[32m[I 2020-12-17 17:56:00,121][0m Trial 37 finished with value: 0.7323870451819273 and parameters: {'classifier': 'RandomForest', 'n_estimators': 110, 'max_depth': 2, 'criterion': 'gini'}. Best is trial 36 with value: 0.7552978808476609.[0m
[32m[I 2020-12-17 17:56:00,298][0m Trial 38 finished 

[32m[I 2020-12-17 17:57:15,187][0m Trial 68 finished with value: 0.7422097827535652 and parameters: {'classifier': 'RandomForest', 'n_estimators': 410, 'max_depth': 3, 'criterion': 'gini'}. Best is trial 61 with value: 0.7650539784086365.[0m
[32m[I 2020-12-17 17:57:18,715][0m Trial 69 finished with value: 0.7438624550179929 and parameters: {'classifier': 'RandomForest', 'n_estimators': 510, 'max_depth': 9, 'criterion': 'gini'}. Best is trial 61 with value: 0.7650539784086365.[0m
[32m[I 2020-12-17 17:57:20,158][0m Trial 70 finished with value: 0.7519658803145408 and parameters: {'classifier': 'RandomForest', 'n_estimators': 210, 'max_depth': 13, 'criterion': 'gini'}. Best is trial 61 with value: 0.7650539784086365.[0m
[32m[I 2020-12-17 17:57:20,270][0m Trial 71 finished with value: 0.7716246834599493 and parameters: {'classifier': 'RandomForest', 'n_estimators': 10, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 71 with value: 0.7716246834599493.[0m
[32m[I 2020-12-17 1

In [43]:
print('Accuracy:{}'.format(trial.value))
print('Best Parameters{}'.format(trial.params))

Accuracy:0.7716246834599493
Best Parameters{'classifier': 'RandomForest', 'n_estimators': 10, 'max_depth': 5, 'criterion': 'gini'}


In [44]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 10,
 'max_depth': 5,
 'criterion': 'gini'}

In [45]:
rf=RandomForestClassifier(n_estimators=10,max_depth=5,criterion='gini')

In [47]:
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=5, n_estimators=10)

In [48]:
y_preds=rf.predict(X_test)

In [50]:
print(accuracy_score(y_test,y_preds))
print(confusion_matrix(y_test,y_preds))
print(classification_report(y_test,y_preds))

0.7402597402597403
[[81 16]
 [24 33]]
              precision    recall  f1-score   support

         0.0       0.77      0.84      0.80        97
         1.0       0.67      0.58      0.62        57

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154



ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.