In [21]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from imblearn.over_sampling import SMOTE

## read data

In [22]:
df = pd.read_csv("heart_disease_ubdate.csv")

In [23]:
df

Unnamed: 0.1,Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,...,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status,Age_Group
0,0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,...,High,Medium,7.633228,Medium,342.0,130.666667,12.969246,12.387250,0,50-59
1,1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,...,Medium,High,8.744034,Medium,133.0,157.000000,9.355389,19.298875,0,60-80
2,2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,...,Low,Low,4.440440,Low,393.0,92.000000,12.709873,11.230926,0,40-49
3,3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,...,Low,High,5.249405,High,293.0,94.000000,12.509046,5.961958,0,30-39
4,4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,...,Low,High,7.030971,High,263.0,154.000000,10.381259,8.153887,0,60-80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9976,9995,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,...,Medium,High,6.834954,Medium,343.0,133.000000,3.588814,19.132004,1,18-29
9977,9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,...,Medium,High,8.247784,Low,377.0,83.000000,2.658267,9.715709,1,30-39
9978,9997,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,...,Low,Low,4.436762,Low,248.0,88.000000,4.408867,9.492429,1,60-80
9979,9998,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,...,Medium,High,8.526329,Medium,113.0,153.000000,7.215634,11.873486,1,18-29


## split Data to train  and Test

In [24]:
x=df.drop("Heart Disease Status",axis=1)
y=df["Heart Disease Status"]

In [25]:
x_train,x_test,y_train,y_test = train_test_split(x , y , test_size=0.25)

## convert columns by OneHotEncoder that have (High,Medium,Low)

In [26]:
Encoder = OneHotEncoder(sparse_output=False , drop='first')

In [27]:
Transformed_data = Encoder.fit_transform(x_train[['Exercise Habits',"Alcohol Consumption","Stress Level","Sugar Consumption"]])

In [28]:
Transformed_data_df =pd.DataFrame(Transformed_data , columns= Encoder.get_feature_names_out())

In [29]:
x_train = pd.concat([x_train.reset_index(drop=True),Transformed_data_df] ,axis = 1 )
x_train

Unnamed: 0.1,Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,...,Homocysteine Level,Age_Group,Exercise Habits_Low,Exercise Habits_Medium,Alcohol Consumption_Low,Alcohol Consumption_Medium,Stress Level_Low,Stress Level_Medium,Sugar Consumption_Low,Sugar Consumption_Medium
0,999,22.0,Male,121.0,255.0,Medium,No,Yes,Yes,22.858916,...,7.486413,18-29,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1984,35.0,Female,151.0,184.0,Medium,Yes,No,No,22.660651,...,7.277182,30-39,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,6312,58.0,Male,157.0,154.0,Low,No,No,No,27.971576,...,14.623325,50-59,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,7545,71.0,Male,161.0,181.0,High,Yes,Yes,No,35.222135,...,14.975287,60-80,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,7908,69.0,Female,160.0,259.0,High,No,Yes,Yes,34.691700,...,12.204122,60-80,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7480,1196,43.0,Male,122.0,272.0,Medium,Yes,No,Yes,19.979772,...,9.562651,40-49,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
7481,1433,52.0,Male,160.0,234.0,High,No,No,Yes,22.477391,...,9.200856,50-59,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
7482,374,18.0,Male,124.0,152.0,Medium,Yes,No,No,19.708918,...,17.298724,18-29,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
7483,4763,71.0,Male,167.0,294.0,Low,Yes,Yes,Yes,31.824313,...,6.453844,60-80,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [30]:
Transformed_data_test = Encoder.transform(x_test[['Exercise Habits',"Alcohol Consumption","Stress Level","Sugar Consumption"]])

In [31]:
Transformed_data_df_test =pd.DataFrame(Transformed_data_test , columns= Encoder.get_feature_names_out())

In [32]:
x_test = pd.concat([x_test.reset_index(drop=True),Transformed_data_df_test] ,axis = 1 )
x_test

Unnamed: 0.1,Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,...,Homocysteine Level,Age_Group,Exercise Habits_Low,Exercise Habits_Medium,Alcohol Consumption_Low,Alcohol Consumption_Medium,Stress Level_Low,Stress Level_Medium,Sugar Consumption_Low,Sugar Consumption_Medium
0,4814,68.0,Male,139.0,259.0,Medium,No,Yes,Yes,29.735451,...,8.674226,60-80,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,5400,76.0,Male,131.0,230.0,Low,Yes,Yes,Yes,34.895571,...,15.849593,60-80,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6642,39.0,Male,174.0,296.0,Low,No,No,Yes,25.244592,...,16.831481,30-39,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2381,51.0,Male,133.0,242.0,High,No,No,No,34.647181,...,18.601148,50-59,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,4244,38.0,Female,160.0,257.0,Medium,Yes,Yes,No,20.196991,...,16.654895,30-39,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491,7271,52.0,Female,155.0,298.0,Medium,No,No,Yes,26.123656,...,8.064138,50-59,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2492,6896,26.0,Female,168.0,218.0,High,Yes,Yes,Yes,33.850609,...,16.355608,18-29,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2493,9447,42.0,Female,138.0,211.0,Low,No,Yes,Yes,38.762883,...,7.114929,40-49,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2494,644,43.0,Male,132.0,239.0,Medium,Yes,Yes,No,21.501772,...,14.903768,40-49,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


## convert columns by BinaryEncoder that have (yes,No)

In [33]:
Bencoder = BinaryEncoder()

In [34]:
Transformed__Df = Bencoder.fit_transform(x_train.select_dtypes("O"))

In [35]:
Transformed__Df_test = Bencoder.transform(x_test.select_dtypes("O"))


In [36]:
x_train.drop(['Exercise Habits',"Alcohol Consumption","Stress Level","Sugar Consumption","Age_Group"] , axis = 1 , inplace =True)

In [37]:
x_test.drop(['Exercise Habits',"Alcohol Consumption","Stress Level","Sugar Consumption","Age_Group"] , axis = 1 , inplace =True)

In [38]:
x_train = pd.concat([x_train,Transformed__Df] ,axis = 1 )
x_train

Unnamed: 0.1,Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol_1,Alcohol Consumption_0,Alcohol Consumption_1,Stress Level_0,Stress Level_1,Sugar Consumption_0,Sugar Consumption_1,Age_Group_0,Age_Group_1,Age_Group_2
0,999,22.0,Male,121.0,255.0,No,Yes,Yes,22.858916,Yes,...,1,0,1,0,1,0,1,0,0,1
1,1984,35.0,Female,151.0,184.0,Yes,No,No,22.660651,Yes,...,0,1,0,1,0,1,0,0,1,0
2,6312,58.0,Male,157.0,154.0,No,No,No,27.971576,Yes,...,0,0,1,1,1,1,1,0,1,1
3,7545,71.0,Male,161.0,181.0,Yes,Yes,No,35.222135,Yes,...,1,1,0,1,1,0,1,1,0,0
4,7908,69.0,Female,160.0,259.0,No,Yes,Yes,34.691700,No,...,1,1,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7480,1196,43.0,Male,122.0,272.0,Yes,No,Yes,19.979772,Yes,...,1,0,1,1,0,1,1,1,0,1
7481,1433,52.0,Male,160.0,234.0,No,No,Yes,22.477391,No,...,0,1,1,1,0,0,1,0,1,1
7482,374,18.0,Male,124.0,152.0,Yes,No,No,19.708918,No,...,0,0,1,1,0,1,1,0,0,1
7483,4763,71.0,Male,167.0,294.0,Yes,Yes,Yes,31.824313,Yes,...,0,0,1,1,1,1,1,1,0,0


In [39]:
x_test = pd.concat([x_test,Transformed__Df_test] ,axis = 1 )

In [40]:
x_train.drop(x_train.select_dtypes("O"),inplace=True,axis=1)
x_train

Unnamed: 0.1,Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,...,High LDL Cholesterol_1,Alcohol Consumption_0,Alcohol Consumption_1,Stress Level_0,Stress Level_1,Sugar Consumption_0,Sugar Consumption_1,Age_Group_0,Age_Group_1,Age_Group_2
0,999,22.0,121.0,255.0,22.858916,4.263584,108.0,99.0,5.106093,7.486413,...,1,0,1,0,1,0,1,0,0,1
1,1984,35.0,151.0,184.0,22.660651,4.926982,251.0,138.0,11.470303,7.277182,...,0,1,0,1,0,1,0,0,1,0
2,6312,58.0,157.0,154.0,27.971576,4.377342,345.0,111.0,13.241633,14.623325,...,0,0,1,1,1,1,1,0,1,1
3,7545,71.0,161.0,181.0,35.222135,5.341289,303.0,151.0,7.329599,14.975287,...,1,1,0,1,1,0,1,1,0,0
4,7908,69.0,160.0,259.0,34.691700,7.740001,115.0,142.0,6.044299,12.204122,...,1,1,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7480,1196,43.0,122.0,272.0,19.979772,8.001600,309.0,115.0,5.296096,9.562651,...,1,0,1,1,0,1,1,1,0,1
7481,1433,52.0,160.0,234.0,22.477391,4.316074,245.0,82.0,1.505821,9.200856,...,0,1,1,1,0,0,1,0,1,1
7482,374,18.0,124.0,152.0,19.708918,4.457039,158.0,147.0,11.852771,17.298724,...,0,0,1,1,0,1,1,0,0,1
7483,4763,71.0,167.0,294.0,31.824313,5.825596,363.0,125.0,4.381725,6.453844,...,0,0,1,1,1,1,1,1,0,0


In [41]:
x_test.drop(x_test.select_dtypes("O"),inplace=True,axis=1)

## scaler to numrical columns by StandardScaler 

In [42]:
scaler = StandardScaler()

In [58]:
y_train.value_counts()

Heart Disease Status
0    5972
1    5972
Name: count, dtype: int64

## use SMOTE to make the value of 1 equal 0

In [48]:
sm = SMOTE()
x_train,y_train = sm.fit_resample(x_train,y_train)

In [50]:
x_train[x_train.select_dtypes("number").columns] = scaler.fit_transform(x_train.select_dtypes("number"))

In [51]:
x_test[x_test.select_dtypes("number").columns] = scaler.transform(x_test.select_dtypes("number"))

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
model = LogisticRegression()

model.fit(x_train , y_train)

In [55]:
model.score(x_train , y_train)

0.9979906229068989

In [56]:
model.score(x_test , y_test)

0.9955929487179487

In [57]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, x_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.94139807 0.99748849 0.99330264 0.99748849 0.99748744]
Mean CV score: 0.9854330253191449
