In [1]:
import pandas as pd
data = pd.read_csv('../datasets/diabetes.csv',sep=',')
# Statistics

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Split data

In [4]:
# First way (use iloc)
y = data.iloc[:,-1]

In [5]:
x = data.iloc[:,:-1]
x[:5]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [13]:
# Second way
x = data.drop('Outcome', axis=1)
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [14]:
y = data['Outcome'] 
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

### Split for train & test

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_train.shape)

(614, 8)
(614, 8)


### Scaling

In [16]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [17]:
x_train

array([[-0.52639686, -1.15139792, -3.75268255, ..., -4.13525578,
        -0.49073479, -1.03594038],
       [ 1.58804586, -0.27664283,  0.68034485, ..., -0.48916881,
         2.41502991,  1.48710085],
       [-0.82846011,  0.56687102, -1.2658623 , ..., -0.42452187,
         0.54916055, -0.94893896],
       ...,
       [ 1.8901091 , -0.62029661,  0.89659009, ...,  1.76054443,
         1.981245  ,  0.44308379],
       [-1.13052335,  0.62935353, -3.75268255, ...,  1.34680407,
        -0.78487662, -0.33992901],
       [-1.13052335,  0.12949347,  1.43720319, ..., -1.22614383,
        -0.61552223, -1.03594038]])

### Using GridSearchCV to find the best parameters

In [19]:
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train,y_train)

In [20]:
y_pred = clf.predict(x_test)
print(y_pred.shape)
print(y_test.shape)

(154,)
(154,)


In [28]:
# Use grid search cv (get the best param option)
param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy', 'log_loss']
}
grid_search = GridSearchCV(clf,param_grid,cv=4, verbose=4, n_jobs=-1)
grid_search.fit(x_train, y_train)
y_pred = grid_search.predict(x_test)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        99
           1       0.67      0.67      0.67        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



In [26]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'entropy', 'n_estimators': 50}
0.793120278414396


In [30]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting click (from lazypredict)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting lightgbm (from lazypredict)
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting xgboost (from lazypredict)
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow>=2.0.0->lazypredict)
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 k



In [22]:
# # Visualize confusion matrix
# cm = confusion_matrix(y_test, y_pred)
# sns.heatmap(data=cm,annot=True,xticklabels=["Not diabetic", "Dibetic"],
#             yticklabels=["Not Diabetic", "Diabetic"])
# plt.savefig("diabetes.png")
# plt.show()

### Finding optimal parameters on many models

In [31]:
from lazypredict.Supervised import LazyClassifier

In [32]:
clf = LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
models

  0%|          | 0/32 [00:00<?, ?it/s]



Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.78,0.77,0.77,0.78,0.2
QuadraticDiscriminantAnalysis,0.78,0.76,0.76,0.78,0.02
GaussianNB,0.77,0.75,0.75,0.77,0.13
RidgeClassifierCV,0.77,0.75,0.75,0.77,0.03
DecisionTreeClassifier,0.75,0.74,0.74,0.75,0.02
RidgeClassifier,0.76,0.74,0.74,0.76,0.02
LinearDiscriminantAnalysis,0.76,0.74,0.74,0.76,0.03
LogisticRegression,0.75,0.74,0.74,0.75,0.02
LinearSVC,0.75,0.74,0.74,0.75,0.02
CalibratedClassifierCV,0.75,0.73,0.73,0.75,0.06
