In [1]:
import pandas as pd
import numpy as np
import os
import openpyxl

from data_preprocessing import encode_categorical_features
from sampling import create_stratified_kfolds
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, \
    matthews_corrcoef, mean_squared_error, r2_score, roc_auc_score, roc_curve, auc
from math import sqrt

from lazypredict.Supervised import LazyClassifier

In [2]:
# Notebook parameters: every value a reader may want to tune lives in this cell.
input_dataset_path = "data/heart_disease_health_indicators_BRFSS2015.csv"  # CSV read into heart_df
target_col = "HeartDiseaseorAttack"  # label column; every other column is treated as a feature
generate_new_folds = False  # True -> call create_stratified_kfolds before loading folds from folds/
n_splits = 5  # number of folds to generate and to load (fold_1 .. fold_<n_splits>)
k_best_features = 10  # k passed to SelectKBest inside each fold

### Loading dataset

In [3]:
# Read the raw CSV and cast the label column to integer in one chained step,
# then preview the first rows.
heart_df = pd.read_csv(input_dataset_path).astype({target_col: int})
heart_df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# General dataset descriptors: size, feature count, label values and duplicate rows.
n_rows, n_cols = heart_df.shape
print(f"Input dataset has {n_rows} rows and {n_cols} columns")
# Every column except the target is a feature, so no copy of the frame is needed to count them.
print(f"Input dataset consists of {n_cols - 1} features and 1 target column")

print(f"Target values are: {heart_df[target_col].unique()}")

# Build the duplicate mask once; rows not flagged as duplicates are the unique ones.
n_duplicated = int(heart_df.duplicated().sum())
print(f"Input dataset contains {n_duplicated} duplicated rows and {n_rows - n_duplicated} unique rows")

Input dataset has 253680 rows and 22 colums
Input dataset consists of 21 features and 1 target column
Target values are: [0 1]
Input dataset contains 23899 duplicated rows and 229781 unique rows


In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  int32  
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

### Data preprocessing

In [6]:
# Remove fully duplicated rows, rebinding the name instead of mutating in place
heart_df = heart_df.drop_duplicates()

In [7]:
# Encode categorical features with the project helper imported from data_preprocessing.
# NOTE(review): presumably label-encodes ordinal columns and one-hot-encodes nominal ones --
# confirm in data_preprocessing. The recorded run lists no ordinal or nominal categorical features.
heart_df = encode_categorical_features(heart_df, target_col)

Ordinal Categorical Features: []
Nominal Categorical Features: []


In [8]:
# Separate the label column (y) from the predictor columns (x)
y = heart_df[target_col]
x = heart_df.drop(columns=target_col)

### Stratified data sampling

In [9]:
# Optionally regenerate the stratified folds (per the project helper, saved as CSV files)
if generate_new_folds:
    create_stratified_kfolds(x_df=x, y_df=y, dataset=heart_df, n_splits=n_splits)

# Load every fold from disk as a (train, test) pair of DataFrames,
# reading the train file before the test file for each fold number
fold_pairs = [
    (
        pd.read_csv(f"folds/fold_{fold_idx}_train.csv"),
        pd.read_csv(f"folds/fold_{fold_idx}_test.csv"),
    )
    for fold_idx in range(1, n_splits + 1)
]

# Unzip the pairs into the two parallel lists used by the modelling cell
train_datasets = [train_part for train_part, _ in fold_pairs]
test_datasets = [test_part for _, test_part in fold_pairs]
print("Folds data were loaded successfully!")

Folds data were loaded successfully!


### Lazy Classifier

In [10]:
# Score table returned by LazyClassifier for each fold, keyed by 1-based fold number.
folds_results = {}

for fold_num, (train_data, test_data) in enumerate(zip(train_datasets, test_datasets), 1):
    # Split each fold into features and target once instead of re-dropping the column per call
    X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
    X_test, y_test = test_data.drop(columns=[target_col]), test_data[target_col]

    # Standardize features; the scaler is fitted on the training fold only and
    # then applied to the test fold
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Feature selection using SelectKBest with the ANOVA F-statistic,
    # likewise fitted on the training fold only
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)

    # Fit all models that LazyClassifier provides and score them on the test fold
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train_selected, X_test_selected, y_train, y_test)

    # Keep this fold's score table; `models` and `predictions` are rebound on every
    # iteration, so without this only the last fold's results would survive the loop.
    folds_results[fold_num] = models

 97%|█████████▋| 28/29 [4:39:34<44:44, 2684.84s/it]  

[LightGBM] [Info] Number of positive: 18973, number of negative: 164851
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 183824, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103213 -> initscore=-2.162025
[LightGBM] [Info] Start training from score -2.162025


100%|██████████| 29/29 [4:39:43<00:00, 578.73s/it] 
 97%|█████████▋| 28/29 [2:40:50<24:58, 1498.74s/it]  

[LightGBM] [Info] Number of positive: 18974, number of negative: 164851
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 183825, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103218 -> initscore=-2.161972
[LightGBM] [Info] Start training from score -2.161972


100%|██████████| 29/29 [2:40:52<00:00, 332.83s/it] 
 97%|█████████▋| 28/29 [1:19:14<12:28, 748.57s/it]   

[LightGBM] [Info] Number of positive: 18974, number of negative: 164851
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 183825, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103218 -> initscore=-2.161972
[LightGBM] [Info] Start training from score -2.161972


100%|██████████| 29/29 [1:19:15<00:00, 163.97s/it]
 97%|█████████▋| 28/29 [1:13:14<11:31, 691.67s/it]   

[LightGBM] [Info] Number of positive: 18974, number of negative: 164851
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 183825, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103218 -> initscore=-2.161972
[LightGBM] [Info] Start training from score -2.161972


100%|██████████| 29/29 [1:13:16<00:00, 151.61s/it]
 97%|█████████▋| 28/29 [1:17:37<12:12, 732.93s/it]   

[LightGBM] [Info] Number of positive: 18973, number of negative: 164852
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 183825, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103212 -> initscore=-2.162031
[LightGBM] [Info] Start training from score -2.162031


100%|██████████| 29/29 [1:17:38<00:00, 160.64s/it]


### Models and their results

In [11]:
# Display the LazyClassifier score table. `models` is reassigned on every fold iteration,
# so this shows the results of the last fold only.
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.76,0.72,0.72,0.8,0.16
PassiveAggressiveClassifier,0.72,0.68,0.68,0.77,0.32
GaussianNB,0.84,0.67,0.67,0.85,0.18
BernoulliNB,0.85,0.65,0.65,0.86,0.38
QuadraticDiscriminantAnalysis,0.87,0.63,0.63,0.87,0.22
LinearDiscriminantAnalysis,0.89,0.57,0.57,0.87,0.38
BaggingClassifier,0.88,0.56,0.56,0.86,5.85
KNeighborsClassifier,0.89,0.56,0.56,0.86,18.71
RandomForestClassifier,0.88,0.56,0.56,0.86,17.21
DecisionTreeClassifier,0.88,0.56,0.56,0.86,1.66


In [16]:
import openpyxl  # engine pandas uses to write .xlsx; importing here fails fast if it is missing

# Export the score table held in `models` to an Excel workbook.
results_path = 'model_results/lazy_classifier_models_results.xlsx'
# Create the output directory first: to_excel does not create missing parent folders,
# so a fresh checkout would otherwise fail after the long training run.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
models.to_excel(results_path, header=True)