## Data Preprocessing

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import ClusterCentroids

In [2]:
df = pd.read_csv("heart_disease.csv")  

In [3]:
df.head(10)

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
5,25.0,Male,152.0,257.0,Low,Yes,No,No,28.144681,No,...,No,Low,Medium,5.504876,Low,126.0,91.0,4.297575,10.815983,No
6,78.0,Female,121.0,175.0,High,Yes,Yes,Yes,18.042332,No,...,No,Medium,Medium,9.240911,Medium,107.0,85.0,11.582983,19.659461,No
7,38.0,Female,161.0,187.0,Low,Yes,Yes,Yes,34.736683,No,...,No,Low,Medium,7.841008,High,228.0,111.0,4.929381,17.146599,No
8,56.0,Female,135.0,291.0,Low,No,Yes,Yes,34.493112,Yes,...,Yes,High,Low,6.941403,High,317.0,103.0,5.119015,6.051129,No
9,75.0,Male,144.0,252.0,Low,Yes,Yes,No,30.142149,No,...,Yes,Low,Medium,4.002662,High,199.0,96.0,10.005698,7.604357,No


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Sleep Hours           9975 non-null   float64
 15  Sugar Consumption   

e) The notebook must contain minimim 5 or more classification algorithms without optimization, and 5 algorithms with CV optimization / grid search, and the results of all model performances in a table on the notebook. Analyze the best model: Top model without optimization, Top model with optimization.

## 1 Handle missing values 

In [5]:
df.isnull().sum().sort_values(ascending=False)

Alcohol Consumption     2586
Cholesterol Level         30
Sugar Consumption         30
Diabetes                  30
Age                       29
High LDL Cholesterol      26
CRP Level                 26
Triglyceride Level        26
High Blood Pressure       26
Sleep Hours               25
Low HDL Cholesterol       25
Smoking                   25
Exercise Habits           25
BMI                       22
Stress Level              22
Fasting Blood Sugar       22
Family Heart Disease      21
Homocysteine Level        20
Gender                    19
Blood Pressure            19
Heart Disease Status       0
dtype: int64

### handle missing values for categorical features (add indicator column + replace Nan Values with "missing")

In [6]:
X = df.drop("Heart Disease Status", axis=1)
y = df["Heart Disease Status"]

In [7]:
numeric_features = X.select_dtypes(include=[ "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

In [8]:
for col in categorical_features:
    df[col + "_missing"] = df[col].isna().astype(int)

In [9]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df[categorical_features] = df[categorical_features].fillna("missing")

In [11]:
df[df["Smoking_missing"] == 1]

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
281,20.0,Male,141.0,225.0,High,missing,Yes,No,38.378099,Yes,...,0,1,0,0,0,0,0,0,0,0
458,50.0,Female,143.0,220.0,Low,missing,Yes,No,19.659608,No,...,0,1,0,0,0,0,0,1,0,0
560,80.0,Female,155.0,254.0,High,missing,No,No,34.513431,No,...,0,1,0,0,0,0,0,0,0,0
593,57.0,Male,167.0,258.0,Medium,missing,No,Yes,38.042318,Yes,...,0,1,0,0,0,0,0,0,0,0
1153,74.0,Female,126.0,224.0,High,missing,No,No,27.830421,Yes,...,0,1,0,0,0,0,0,1,0,0
2195,26.0,Female,142.0,220.0,High,missing,Yes,Yes,36.511539,Yes,...,0,1,0,0,0,0,0,1,0,0
2833,59.0,Male,150.0,178.0,Low,missing,No,Yes,37.702723,Yes,...,0,1,0,0,0,0,0,0,0,0
2919,79.0,Female,150.0,244.0,Low,missing,Yes,Yes,37.462915,Yes,...,0,1,0,0,0,0,0,1,0,0
3409,34.0,Male,143.0,177.0,Low,missing,Yes,Yes,33.29612,Yes,...,0,1,0,0,0,0,0,0,0,0
3998,40.0,Female,132.0,268.0,High,missing,No,No,23.989152,Yes,...,0,1,0,0,0,0,0,0,0,0


In [12]:
df[df["Alcohol Consumption_missing"] == 1]

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
11,40.0,Female,134.0,296.0,High,No,Yes,Yes,31.739622,No,...,0,0,0,0,0,0,0,1,0,0
19,20.0,Female,133.0,187.0,High,Yes,No,Yes,37.832158,Yes,...,0,0,0,0,0,0,0,1,0,0
23,41.0,Female,170.0,300.0,Low,No,No,No,20.661064,No,...,0,0,0,0,0,0,0,1,0,0
31,29.0,Female,163.0,278.0,Low,Yes,Yes,Yes,30.175866,Yes,...,0,0,0,0,0,0,0,1,0,0
37,66.0,Female,129.0,253.0,Low,No,Yes,No,18.139130,No,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9982,27.0,Female,138.0,212.0,Low,Yes,Yes,No,38.623610,No,...,0,0,0,0,0,0,0,1,0,0
9991,41.0,Female,160.0,241.0,Medium,No,Yes,Yes,39.346342,Yes,...,0,0,0,0,0,0,0,1,0,0
9993,27.0,Female,153.0,188.0,Medium,No,No,Yes,28.173059,Yes,...,0,0,0,0,0,0,0,1,0,0
9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0


### Handle missing values for numerical features (Impute with the mean)

In [13]:
imputer = SimpleImputer(strategy="mean")

In [14]:
df[numeric_features] = imputer.fit_transform(df[numeric_features])

In [15]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


### Encode categorical variables 

In [16]:
df["Gender"] = np.where(df["Gender"] == "missing", -1, df["Gender"])

In [17]:
df[df["Gender_missing"] == 1].head()

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
424,50.0,-1,147.0,264.0,Low,Yes,No,No,20.738247,No,...,0,0,0,0,0,0,0,0,0,0
1550,43.0,-1,152.0,187.0,Low,No,Yes,No,33.284306,Yes,...,0,0,0,0,0,0,0,0,0,0
1770,24.0,-1,150.0,289.0,Medium,No,No,Yes,37.468353,Yes,...,0,0,0,0,0,0,0,0,0,0
1956,54.0,-1,176.0,171.0,Low,Yes,Yes,No,34.130846,No,...,0,0,0,0,0,0,0,1,0,0
2056,62.0,-1,134.0,195.0,Medium,Yes,Yes,No,20.288229,No,...,0,0,0,0,0,0,0,1,0,0


In [18]:
encoder = OrdinalEncoder(categories=[["Female", "Male"]])

In [19]:
mask = df["Gender"] != -1

In [20]:
df.loc[mask, "Gender"] = encoder.fit_transform(df.loc[mask, ["Gender"]])

In [21]:
df["Gender"] = df["Gender"].astype(int)

In [22]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,Low,No,No,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,Medium,No,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,High,Yes,No,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [23]:
df["Exercise Habits"].unique()

array(['High', 'Low', 'Medium', 'missing'], dtype=object)

In [24]:
df["Exercise Habits"] = np.where(df["Exercise Habits"] == "missing", -1, df["Exercise Habits"])
encoder = OrdinalEncoder(categories=[["Low", "Medium","High"]])
mask = df["Exercise Habits"] != -1
df.loc[mask, "Exercise Habits"] = encoder.fit_transform(df.loc[mask, ["Exercise Habits"]])
df["Exercise Habits"] = df["Exercise Habits"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,Yes,Yes,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,No,Yes,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,No,No,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,Yes,Yes,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,Yes,Yes,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,Yes,No,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,No,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,Yes,No,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,Yes,No,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df["Smoking"] = np.where(df["Smoking"] == "missing", -1, df["Smoking"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["Smoking"] != -1
df.loc[mask, "Smoking"] = encoder.fit_transform(df.loc[mask, ["Smoking"]])
df["Smoking"] = df["Smoking"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,Yes,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,Yes,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,No,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,Yes,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,Yes,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,1,No,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,0,No,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,1,No,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,1,No,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df["Family Heart Disease"].unique()

array(['Yes', 'No', 'missing'], dtype=object)

In [27]:
df["Family Heart Disease"] = np.where(df["Family Heart Disease"] == "missing", -1, df["Family Heart Disease"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["Family Heart Disease"] != -1
df.loc[mask, "Family Heart Disease"] = encoder.fit_transform(df.loc[mask, ["Family Heart Disease"]])
df["Family Heart Disease"] = df["Family Heart Disease"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,1,No,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,1,Yes,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,0,No,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,1,No,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,1,Yes,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,1,0,No,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,0,0,No,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,1,0,Yes,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,1,0,Yes,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df["Diabetes"].unique()

array(['No', 'Yes', 'missing'], dtype=object)

In [29]:
df["Diabetes"] = np.where(df["Diabetes"] == "missing", -1, df["Diabetes"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["Diabetes"] != -1
df.loc[mask, "Diabetes"] = encoder.fit_transform(df.loc[mask, ["Diabetes"]])
df["Diabetes"] = df["Diabetes"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,1,0,24.991591,Yes,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,1,1,25.221799,No,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,0,0,29.855447,No,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,1,0,24.130477,Yes,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,1,1,20.486289,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,1,0,0,18.788791,Yes,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,0,0,0,31.856801,Yes,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,1,0,1,26.899911,No,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,1,0,1,34.964026,Yes,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df["High Blood Pressure"].unique()

array(['Yes', 'No', 'missing'], dtype=object)

In [31]:
df["High Blood Pressure"] = np.where(df["High Blood Pressure"] == "missing", -1, df["High Blood Pressure"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["High Blood Pressure"] != -1
df.loc[mask, "High Blood Pressure"] = encoder.fit_transform(df.loc[mask, ["High Blood Pressure"]])
df["High Blood Pressure"] = df["High Blood Pressure"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,1,0,24.991591,1,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,1,1,25.221799,0,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,0,0,29.855447,0,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,1,0,24.130477,1,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,1,1,20.486289,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,1,0,0,18.788791,1,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,0,0,0,31.856801,1,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,1,0,1,26.899911,0,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,1,0,1,34.964026,1,...,0,0,0,0,0,0,0,0,0,0


In [32]:
df["Low HDL Cholesterol"].unique()

array(['Yes', 'No', 'missing'], dtype=object)

In [33]:
df["Low HDL Cholesterol"] = np.where(df["Low HDL Cholesterol"] == "missing", -1, df["Low HDL Cholesterol"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["Low HDL Cholesterol"] != -1
df.loc[mask, "Low HDL Cholesterol"] = encoder.fit_transform(df.loc[mask, ["Low HDL Cholesterol"]])
df["Low HDL Cholesterol"] = df["Low HDL Cholesterol"].astype(int)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,1,0,24.991591,1,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,1,1,25.221799,0,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,0,0,29.855447,0,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,1,0,24.130477,1,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,1,1,20.486289,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,0,136.0,243.0,1,1,0,0,18.788791,1,...,0,0,0,0,0,0,0,0,0,0
9996,38.0,1,172.0,154.0,1,0,0,0,31.856801,1,...,0,0,0,0,0,0,0,1,0,0
9997,73.0,1,152.0,201.0,2,1,0,1,26.899911,0,...,0,0,0,0,0,0,0,1,0,0
9998,23.0,1,142.0,299.0,0,1,0,1,34.964026,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df["High LDL Cholesterol"].unique()

array(['No', 'Yes', 'missing'], dtype=object)

In [35]:
df["High LDL Cholesterol"] = np.where(df["High LDL Cholesterol"] == "missing", -1, df["High LDL Cholesterol"])
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
mask = df["High LDL Cholesterol"] != -1
df.loc[mask, "High LDL Cholesterol"] = encoder.fit_transform(df.loc[mask, ["High LDL Cholesterol"]])
df["High LDL Cholesterol"] = df["High LDL Cholesterol"].astype(int)

In [36]:
df["Alcohol Consumption"].unique()

array(['High', 'Medium', 'Low', 'missing'], dtype=object)

In [37]:
df["Alcohol Consumption"] = np.where(df["Alcohol Consumption"] == "missing", -1, df["Alcohol Consumption"])
encoder = OrdinalEncoder(categories=[["Low", "Medium","High"]])
mask = df["Alcohol Consumption"] != -1
df.loc[mask, "Alcohol Consumption"] = encoder.fit_transform(df.loc[mask, ["Alcohol Consumption"]])
df["Alcohol Consumption"] = df["Alcohol Consumption"].astype(int)

In [38]:
df["Alcohol Consumption"]

0       2
1       1
2       0
3       0
4       0
       ..
9995    1
9996   -1
9997   -1
9998    1
9999    2
Name: Alcohol Consumption, Length: 10000, dtype: int64

In [39]:
df["Stress Level"].unique()

array(['Medium', 'High', 'Low', 'missing'], dtype=object)

In [40]:
df["Stress Level"] = np.where(df["Stress Level"] == "missing", -1, df["Stress Level"])
encoder = OrdinalEncoder(categories=[["Low", "Medium","High"]])
mask = df["Stress Level"] != -1
df.loc[mask, "Stress Level"] = encoder.fit_transform(df.loc[mask, ["Stress Level"]])
df["Stress Level"] = df["Stress Level"].astype(int)

In [41]:
df["Sugar Consumption"].unique()

array(['Medium', 'Low', 'High', 'missing'], dtype=object)

In [42]:
df["Sugar Consumption"] = np.where(df["Sugar Consumption"] == "missing", -1, df["Sugar Consumption"])
encoder = OrdinalEncoder(categories=[["Low", "Medium","High"]])
mask = df["Sugar Consumption"] != -1
df.loc[mask, "Sugar Consumption"] = encoder.fit_transform(df.loc[mask, ["Sugar Consumption"]])
df["Sugar Consumption"] = df["Sugar Consumption"].astype(int)

In [43]:
df["Heart Disease Status"]

0        No
1        No
2        No
3        No
4        No
       ... 
9995    Yes
9996    Yes
9997    Yes
9998    Yes
9999    Yes
Name: Heart Disease Status, Length: 10000, dtype: object

In [44]:
df["Heart Disease Status"].unique()

array(['No', 'Yes'], dtype=object)

In [45]:
encoder = OrdinalEncoder(categories=[["No", "Yes"]])
df["Heart Disease Status"] = encoder.fit_transform(df[["Heart Disease Status"]])

In [46]:
df["Heart Disease Status"].unique()

array([0., 1.])

In [47]:
df.head()

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,56.0,1,153.0,155.0,2,1,1,0,24.991591,1,...,0,0,0,0,0,0,0,0,0,0
1,69.0,0,146.0,286.0,2,0,1,1,25.221799,0,...,0,0,0,0,0,0,0,0,0,0
2,46.0,1,126.0,216.0,0,0,0,0,29.855447,0,...,0,0,0,0,0,0,0,0,0,0
3,32.0,0,122.0,293.0,2,1,1,0,24.130477,1,...,0,0,0,0,0,0,0,0,0,0
4,60.0,1,166.0,242.0,0,1,1,1,20.486289,1,...,0,0,0,0,0,0,0,0,0,0


In [48]:
scaler = MinMaxScaler()
scaler.fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [49]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,Exercise Habits_missing,Smoking_missing,Family Heart Disease_missing,Diabetes_missing,High Blood Pressure_missing,Low HDL Cholesterol_missing,High LDL Cholesterol_missing,Alcohol Consumption_missing,Stress Level_missing,Sugar Consumption_missing
0,0.612903,1,0.550000,0.033333,2,1,1,0,0.317756,1,...,0,0,0,0,0,0,0,0,0,0
1,0.822581,0,0.433333,0.906667,2,0,1,1,0.328222,0,...,0,0,0,0,0,0,0,0,0,0
2,0.451613,1,0.100000,0.440000,0,0,0,0,0.538899,0,...,0,0,0,0,0,0,0,0,0,0
3,0.225806,0,0.033333,0.953333,2,1,1,0,0.278604,1,...,0,0,0,0,0,0,0,0,0,0
4,0.677419,1,0.766667,0.613333,0,1,1,1,0.112914,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.112903,0,0.266667,0.620000,1,1,0,0,0.035735,1,...,0,0,0,0,0,0,0,0,0,0
9996,0.322581,1,0.866667,0.026667,1,0,0,0,0.629894,1,...,0,0,0,0,0,0,0,1,0,0
9997,0.887097,1,0.533333,0.340000,2,1,0,1,0.404521,0,...,0,0,0,0,0,0,0,1,0,0
9998,0.080645,1,0.366667,0.993333,0,1,0,1,0.771169,1,...,0,0,0,0,0,0,0,0,0,0


In [50]:
X = df.drop("Heart Disease Status", axis=1)
y = df["Heart Disease Status"]

## First Experimentations (Try 5 classifiers)

## data splitiing 

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y        
)

## Logisitic Regression

In [52]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1600
         1.0       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random Forest

In [53]:
model = RandomForestClassifier(
    n_estimators=100,         max_depth=None,      
    random_state=42
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1600
         1.0       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## KNN

In [54]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


model = KNeighborsClassifier(
    n_neighbors=5,       
    metric='minkowski',  
    p=2                  
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.756

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      0.93      0.86      1600
         1.0       0.19      0.07      0.10       400

    accuracy                           0.76      2000
   macro avg       0.49      0.50      0.48      2000
weighted avg       0.68      0.76      0.71      2000



In [55]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y        
)

## XGBoost 

In [56]:
model = xgb.XGBClassifier(
    n_estimators=200,      
    learning_rate=0.1,    
    max_depth=6,           
    subsample=0.8,         
    colsample_bytree=0.8,  
    eval_metric="mlogloss",
    random_state=42
)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.793

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      0.99      0.88      1600
         1.0       0.15      0.01      0.01       400

    accuracy                           0.79      2000
   macro avg       0.47      0.50      0.45      2000
weighted avg       0.67      0.79      0.71      2000



## SVM

In [57]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


model = SVC(
    kernel='rbf',        
    C=1.0,             
    gamma='scale',       
    probability=False,  
    random_state=42
)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1600
         1.0       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## MLP (neural network)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y        
)

In [60]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


model = MLPClassifier(
    hidden_layer_sizes=(100,), 
    activation='relu',          
    solver='adam',             
    max_iter=1000,            
    random_state=42
)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7315

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      0.88      0.84      1600
         1.0       0.21      0.13      0.16       400

    accuracy                           0.73      2000
   macro avg       0.51      0.51      0.50      2000
weighted avg       0.68      0.73      0.70      2000



## Second Experimentations

In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



In [63]:
rf = RandomForestClassifier(random_state=42)


param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)


grid_search.fit(X_train, y_train)


print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV Accuracy: 0.8

Test Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1600
         1.0       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


mlp = MLPClassifier(max_iter=500, random_state=42)


param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50)],  # 1 or 2 hidden layers
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.001, 0.01, 0.1]
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)


grid_search.fit(X_train, y_train)


print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.001, 'solver': 'sgd'}
Best CV Accuracy: 0.8

Test Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1600
         1.0       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV] END activation=relu, hidden_layer_sizes=(50,), learning_rate_init=0.001, solver=adam; total time=  10.5s
[CV] END activation=relu, hidden_layer_sizes=(100,), learning_rate_init=0.01, solver=adam; total time=   5.2s
[CV] END activation=relu, hidden_layer_sizes=(100,), learning_rate_init=0.1, solver=adam; total time=  17.5s
[CV] END activation=relu, hidden_layer_sizes=(50, 50), learning_rate_init=0.001, solver=sgd; total time=  32.3s
[CV] END activation=tanh, hidden_layer_sizes=(50,), learning_rate_init=0.01, solver=sgd; total time=  28.3s
[CV] END activation=tanh, hidden_layer_sizes=(100,), learning_rate_init=0.01, solver=sgd; total time=  47.4s
[CV] END activation=relu, hidden_layer_sizes=(50,), learning_rate_init=0.01, solver=adam; total time=   2.6s
[CV] END activation=relu, hidden_layer_sizes=(50,), learning_rate_init=0.1, solver=adam; total time=   1.6s
[CV] END activation=relu, hidden_layer_sizes=(50,), learning_rate_init=0.1, solver=sgd; total time=   2.8s
[CV] END activatio

## Downsampling

In [68]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Original training class distribution:", dict(zip(*np.unique(y_train, return_counts=True))))

# . ClusterCentroids undersampling
cc = ClusterCentroids(random_state=42)
X_res, y_res = cc.fit_resample(X_train, y_train)

print("After ClusterCentroids:", dict(zip(*np.unique(y_res, return_counts=True))))


scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)
X_test_scaled = scaler.transform(X_test)


model = RandomForestClassifier(random_state=42)
model.fit(X_res_scaled, y_res)


y_pred = model.predict(X_test_scaled)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Original training class distribution: {0.0: 6400, 1.0: 1600}
After ClusterCentroids: {0.0: 1600, 1.0: 1600}

Test Accuracy: 0.2655

Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.11      0.19      1600
         1.0       0.20      0.90      0.33       400

    accuracy                           0.27      2000
   macro avg       0.51      0.50      0.26      2000
weighted avg       0.69      0.27      0.22      2000



In [71]:
y_res

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3195    1.0
3196    1.0
3197    1.0
3198    1.0
3199    1.0
Name: Heart Disease Status, Length: 3200, dtype: float64