In [1]:
# import pandas
import pandas as pd
import numpy as np

In [2]:
# list for column headers
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# load data
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)

* explore the dataset
    - head
    - shape

In [3]:
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


* create X and y (the goal is to predict column **class** based on other variables)

In [5]:
X, y = df.iloc[:,:-1],df.iloc[:,-1]

* split data set to train set and test set

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.75,random_state=123)

* import RandomForestClassifier from sklearn

In [7]:
from sklearn.ensemble import RandomForestClassifier

* create model

In [8]:
rf = RandomForestClassifier()

* fit training set with default parameters

In [9]:
rf.fit(X_train,y_train)

RandomForestClassifier()

* predict X_test

In [10]:
y_pred=rf.predict(X_test)

* import roc_auc_score and confusion_matrix from sklearn

In [11]:
from sklearn import metrics

* print confusion matrix

In [12]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[101,  18],
       [ 23,  50]], dtype=int64)

* print AUC

In [13]:
auc_score = metrics.roc_auc_score(y_test,y_pred)
auc_score

0.7668355013238173

* import GridSearchCV from sklearn

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

* create grid (optimize for number of trees and max depth in one tree)

In [17]:
param_grid = {
    'n_estimators':[50,100,150,200,250,300],
    'max_depth':[1,3,5,7,9,11]
}

* fit training data with grid search

In [18]:
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train,y_train)
best_model = grid_result.best_estimator_ 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 173 out of 180 | elapsed:   13.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   14.2s finished


In [19]:
grid_result.best_params_

{'max_depth': 7, 'n_estimators': 250}

In [23]:
param_grid = {
    'n_estimators':[200,225,250,275,300],
    'max_depth':[5,6,7,8,9]
}

In [24]:
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train,y_train)
best_model = grid_result.best_estimator_ 

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   12.5s finished


In [25]:
grid_result.best_params_

{'max_depth': 7, 'n_estimators': 200}

In [26]:
y_pred2=best_model.predict(X_test)

* print confusion matrix with the best model

In [27]:
cnf_matrix2 = metrics.confusion_matrix(y_test, y_pred2)
cnf_matrix2

array([[101,  18],
       [ 24,  49]], dtype=int64)

* print AUC with the best model

In [28]:
auc_score2 = metrics.roc_auc_score(y_test,y_pred2)
auc_score2

0.759986186255324

In [95]:
rf3 = RandomForestClassifier(n_estimators = 200, max_depth=7)

In [96]:
rf3.fit(X_train,y_train)

RandomForestClassifier(max_depth=7, n_estimators=200)

In [97]:
y_pred3=rf3.predict(X_test)

In [98]:
cnf_matrix3 = metrics.confusion_matrix(y_test, y_pred3)
cnf_matrix3

array([[103,  16],
       [ 23,  50]], dtype=int64)

In [99]:
auc_score3 = metrics.roc_auc_score(y_test,y_pred3)
auc_score3

0.7752388626683551

- is the model better than default?

In [None]:
yes