In [2]:
from pandas import read_csv, get_dummies
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report, confusion_matrix

df = read_csv("./data/diabetes.csv")
df.tail()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [3]:
X = df.drop("diabetes", axis=1)
y = df.diabetes

In [4]:
df.head()
#drop all the NA cols
#df.dropna()
#if you want to change NA values with median/ mode/ mean we can use imputer method available in SciKit Learn

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((614, 8), (614,), (154, 8), (154,))

In [6]:
pipeline = Pipeline([
    ("poly", PolynomialFeatures()),
    ("Scaler", StandardScaler()),
    ("RF", RandomForestClassifier())
])
#class weight based on the proportions of number of samples
params = {
    "poly__degree" : [1,2],
    "poly__interaction_only" : [True, False],
    "RF__n_estimators" : [5, 10, 20],
    "RF__max_features" : [0.5, None, "sqrt"],
    "RF__min_samples_leaf" : [1, 3],
    "RF__class_weight" : ["balanced"]
}

#scoring f1 weighted is for multiclass

#n_jobs = -1 use all processors, 2 = use 2 processors
RF = GridSearchCV(pipeline,param_grid=params,cv=3, scoring="f1_weighted", verbose=1, n_jobs=-1)
RF.fit(X_train,y_train)

RF.best_params_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 201 out of 216 | elapsed:    7.8s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:    8.0s finished


{'RF__class_weight': 'balanced',
 'RF__max_features': 0.5,
 'RF__min_samples_leaf': 3,
 'RF__n_estimators': 20,
 'poly__degree': 1,
 'poly__interaction_only': False}

In [7]:
RF.grid_scores_



[mean: 0.72188, std: 0.00997, params: {'RF__n_estimators': 5, 'RF__class_weight': 'balanced', 'poly__degree': 1, 'RF__min_samples_leaf': 1, 'RF__max_features': 0.5, 'poly__interaction_only': True},
 mean: 0.75256, std: 0.02507, params: {'RF__n_estimators': 5, 'RF__class_weight': 'balanced', 'poly__degree': 1, 'RF__min_samples_leaf': 1, 'RF__max_features': 0.5, 'poly__interaction_only': False},
 mean: 0.72804, std: 0.02073, params: {'RF__n_estimators': 5, 'RF__class_weight': 'balanced', 'poly__degree': 2, 'RF__min_samples_leaf': 1, 'RF__max_features': 0.5, 'poly__interaction_only': True},
 mean: 0.73037, std: 0.01174, params: {'RF__n_estimators': 5, 'RF__class_weight': 'balanced', 'poly__degree': 2, 'RF__min_samples_leaf': 1, 'RF__max_features': 0.5, 'poly__interaction_only': False},
 mean: 0.73758, std: 0.01830, params: {'RF__n_estimators': 10, 'RF__class_weight': 'balanced', 'poly__degree': 1, 'RF__min_samples_leaf': 1, 'RF__max_features': 0.5, 'poly__interaction_only': True},
 mean: 

In [8]:
RF.score(X_train, y_train), RF.score(X_test, y_test)

(0.9413680781758957, 0.7686268472906403)

In [9]:
classification_report_train = classification_report(y_train, RF.predict(X_train))
classification_report_train

'             precision    recall  f1-score   support\n\n          0       0.96      0.96      0.96       401\n          1       0.92      0.92      0.92       213\n\navg / total       0.94      0.94      0.94       614\n'

In [10]:
classification_report_test = classification_report(y_test, RF.predict(X_test))
classification_report_test

'             precision    recall  f1-score   support\n\n          0       0.84      0.79      0.81        99\n          1       0.66      0.73      0.69        55\n\navg / total       0.77      0.77      0.77       154\n'

In [11]:
#for multiclass classification the average should be None
f1_train = f1_score(y_train, RF.predict(X_train), average=None)
f1_train

array([0.95511222, 0.91549296])

In [12]:
#for multiclass classification the average should be None
f1_test = f1_score(y_test, RF.predict(X_test), average=None)
f1_test

array([0.8125    , 0.68965517])

In [13]:
confusion_matrix_train = confusion_matrix(y_train, RF.predict(X_train), labels=[0,1])
confusion_matrix_train

array([[383,  18],
       [ 18, 195]], dtype=int64)

In [14]:
confusion_matrix_test = confusion_matrix(y_test, RF.predict(X_test), labels=[0,1])
confusion_matrix_test

array([[78, 21],
       [15, 40]], dtype=int64)