In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
x = df.iloc[:,0:-1]
y = df['Outcome']

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()

In [8]:
x = scaler.fit_transform(x)
x

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [30]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,
                                    test_size=0.25,
                                    random_state=12)


In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)

In [33]:
y_pred = clf.predict(x_test)                

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy_score(y_test,y_pred)

0.6614583333333334

## Hyperparameters tunning

### 1) Using Gridsearchcv

In [36]:
param_dist = {
    "criterion":['gini','entropy'],
    "max_depth":[1,2,3,4,5,6,7,None]
}

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
grid = GridSearchCV(clf,param_grid=param_dist,cv=10,n_jobs=-1)

In [39]:
grid.fit(x_train,y_train)

In [40]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [41]:
grid.best_estimator_

In [42]:
grid.best_score_

0.7621899576527525

In [54]:
grid.cv_results_

{'mean_fit_time': array([0.00166233, 0.00251682, 0.00403261, 0.00448062, 0.00402367,
        0.00450008, 0.00535917, 0.01069932, 0.00304987, 0.00451803,
        0.00647845, 0.00909827, 0.00576608, 0.01459332, 0.0086812 ,
        0.010273  ]),
 'std_fit_time': array([0.00225088, 0.00251697, 0.00201634, 0.00150095, 0.00201202,
        0.0015641 , 0.00183284, 0.01272298, 0.00441992, 0.00393549,
        0.00792624, 0.00480608, 0.00388944, 0.0115809 , 0.00287108,
        0.00150579]),
 'mean_score_time': array([0.00251036, 0.00251698, 0.        , 0.00100551, 0.00152361,
        0.00206623, 0.0013108 , 0.00354226, 0.00146635, 0.00130849,
        0.00203383, 0.        , 0.00359674, 0.00035534, 0.00121772,
        0.00146847]),
 'std_score_time': array([0.00336947, 0.00251704, 0.        , 0.00301652, 0.00232739,
        0.00208968, 0.00194951, 0.00900469, 0.00195655, 0.00165704,
        0.00322791, 0.        , 0.00538666, 0.00071939, 0.00099428,
        0.00318766]),
 'param_criterion': masked

## 2.Random search cv

In [43]:
param_dist = {
    "criterion":['gini','entropy'],
    "max_depth":[1,2,3,4,5,6,7,None]
}

In [44]:
from sklearn.model_selection import RandomizedSearchCV

In [45]:
random = RandomizedSearchCV(clf,param_distributions=param_dist,cv=10,n_jobs=-1)

In [46]:
random.fit(x_train,y_train)

In [47]:
random.best_score_

0.7587114337568057

In [48]:
random.best_params_

{'max_depth': 2, 'criterion': 'entropy'}

In [53]:
random.cv_results_

{'mean_fit_time': array([0.0055999 , 0.00239975, 0.00479891, 0.00159988, 0.00879898,
        0.00319958, 0.01199861, 0.00319932, 0.0047997 , 0.0055994 ]),
 'std_fit_time': array([0.00512186, 0.00366569, 0.0039183 , 0.00319977, 0.00240035,
        0.00391867, 0.00536589, 0.00391834, 0.00530624, 0.00366567]),
 'mean_score_time': array([0.00159986, 0.        , 0.00080011, 0.00079994, 0.        ,
        0.00159962, 0.00159957, 0.00239937, 0.0015996 , 0.00159974]),
 'std_score_time': array([0.00319972, 0.        , 0.00240033, 0.00239983, 0.        ,
        0.00319924, 0.00479872, 0.0036651 , 0.0031992 , 0.00319948]),
 'param_max_depth': masked_array(data=[4, 1, 7, 4, 7, 2, None, 1, 3, 6],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_criterion': masked_array(data=['entropy', 'gini', 'gini', 'gini', 'entropy',
                    'entropy', 'entropy', 'entropy', 'gin