In [215]:
# impoeting needed Libraries 
import numpy as np
import pandas as pd
import plotly.express as px

In [216]:
## Display Max Columns in Pandas
pd.set_option('display.max_columns', None)  


In [217]:
# loading the dataset
df=pd.read_csv('heart_disease.csv')
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,Yes,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.387250,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes
9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes
9997,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes
9998,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes


## Understanding Dataset
#Dataset Description
| Column Name             | Description | Brief Explanation |
|-------------------------|-------------|-------------------|
| Age | The individual's age. | Age affects overall heart disease risk. |
| Gender | The individual's gender (Male or Female). | Risk can vary between males and females. |
| Blood Pressure | The individual's blood pressure (systolic). | Higher systolic pressure increases risk. |
| Cholesterol Level | The individual's total cholesterol level. | High cholesterol contributes to plaque buildup. |
| Exercise Habits | The individual's exercise habits (Low, Medium, High). | More exercise generally reduces risk. |
| Smoking | Whether the individual smokes or not (Yes or No). | Smoking is a major heart disease risk factor. |
| Family Heart Disease | Family history of heart disease (Yes or No). | Genetics can increase risk. |
| Diabetes | Whether the individual has diabetes (Yes or No). | Diabetes significantly raises heart risk. |
| BMI | The individual's body mass index. | Higher BMI may indicate obesity-related risk. |
| High Blood Pressure | Whether the individual has high blood pressure (Yes or No). | Hypertension strains the heart. |
| Low HDL Cholesterol | Whether HDL is low (Yes or No). | Low “good cholesterol” raises risk. |
| High LDL Cholesterol | Whether LDL is high (Yes or No). | High “bad cholesterol” leads to plaque buildup. |
| Alcohol Consumption | Level of alcohol intake (None, Low, Medium, High). | Excessive alcohol increases risk. |
| Stress Level | The individual's stress level (Low, Medium, High). | High stress may negatively affect the heart. |
| Sleep Hours | Number of hours the individual sleeps. | Too little sleep is linked to higher risk. |
| Sugar Consumption | Sugar intake level (Low, Medium, High). | High sugar intake harms heart health. |
| Triglyceride Level | The individual's triglyceride level. | High triglycerides raise heart disease risk. |
| Fasting Blood Sugar | Blood sugar after fasting. | Elevated levels indicate diabetes or prediabetes. |
| CRP Level | C-reactive protein level, a marker of inflammation. | High CRP signals inflammation linked to heart issues. |
| Homocysteine Level | Homocysteine level, affecting blood vessel health. | High levels may damage blood vessels. |
| Heart Disease Status (Target) | Whether the individual has heart disease (Yes or No). | Target variable indicating heart disease presence. |


## Data Exploration

In [218]:
#Chec kData Types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Sleep Hours           9975 non-null   float64
 15  Sugar Consumption   

In [219]:
#Check duplicates
df.duplicated().sum()

np.int64(0)

In [220]:
#Check Missing Values
df.isna().mean().round(4).sort_values(ascending=False) *100

Alcohol Consumption     25.86
Diabetes                 0.30
Sugar Consumption        0.30
Cholesterol Level        0.30
Age                      0.29
Triglyceride Level       0.26
CRP Level                0.26
High LDL Cholesterol     0.26
High Blood Pressure      0.26
Low HDL Cholesterol      0.25
Sleep Hours              0.25
Exercise Habits          0.25
Smoking                  0.25
Fasting Blood Sugar      0.22
BMI                      0.22
Stress Level             0.22
Family Heart Disease     0.21
Homocysteine Level       0.20
Blood Pressure           0.19
Gender                   0.19
Heart Disease Status     0.00
dtype: float64

In [221]:
# Get missing value percentages
missing_values = df.isna().mean().round(4).sort_values(ascending=False) *100

# Get columns with >0% and <5% missing
missing_value_columns = missing_values[(missing_values > 0) & (missing_values <= 5)].index.tolist()
missing_value_columns

['Diabetes',
 'Sugar Consumption',
 'Cholesterol Level',
 'Age',
 'Triglyceride Level',
 'CRP Level',
 'High LDL Cholesterol',
 'High Blood Pressure',
 'Low HDL Cholesterol',
 'Sleep Hours',
 'Exercise Habits',
 'Smoking',
 'Fasting Blood Sugar',
 'BMI',
 'Stress Level',
 'Family Heart Disease',
 'Homocysteine Level',
 'Blood Pressure',
 'Gender']

In [222]:
#Check Numeric Columns 
df.describe().round()

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
count,9971.0,9981.0,9970.0,9978.0,9975.0,9974.0,9978.0,9974.0,9980.0
mean,49.0,150.0,225.0,29.0,7.0,251.0,120.0,7.0,12.0
std,18.0,18.0,44.0,6.0,2.0,87.0,24.0,4.0,4.0
min,18.0,120.0,150.0,18.0,4.0,100.0,80.0,0.0,5.0
25%,34.0,134.0,187.0,24.0,5.0,176.0,99.0,4.0,9.0
50%,49.0,150.0,226.0,29.0,7.0,250.0,120.0,7.0,12.0
75%,65.0,165.0,263.0,35.0,9.0,326.0,141.0,11.0,16.0
max,80.0,180.0,300.0,40.0,10.0,400.0,160.0,15.0,20.0


In [223]:
#Check Object Columns
df.describe(include='object')

Unnamed: 0,Gender,Exercise Habits,Smoking,Family Heart Disease,Diabetes,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sugar Consumption,Heart Disease Status
count,9981,9975,9975,9979,9970,9974,9975,9974,7414,9978,9970,10000
unique,2,3,2,2,2,2,2,2,3,3,3,2
top,Male,High,Yes,No,No,Yes,Yes,No,Medium,Medium,Low,No
freq,5003,3372,5123,5004,5018,5022,5000,5036,2500,3387,3390,8000


In [224]:
#In depth Checking form numirical Columns
num_cols=df.select_dtypes(include='number').columns
num_cols
for col in num_cols:
    px.histogram(data_frame=df,x=col,title=col).show()
    px.box(data_frame=df,x=col).show()

In [225]:
### In depth check for categorical columns
cat_cols = df.select_dtypes(include= 'object').columns

for col in cat_cols:

    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('*' * 100)


Gender
2
['Male' 'Female' nan]
****************************************************************************************************
Exercise Habits
3
['High' 'Low' 'Medium' nan]
****************************************************************************************************
Smoking
2
['Yes' 'No' nan]
****************************************************************************************************
Family Heart Disease
2
['Yes' 'No' nan]
****************************************************************************************************
Diabetes
2
['No' 'Yes' nan]
****************************************************************************************************
High Blood Pressure
2
['Yes' 'No' nan]
****************************************************************************************************
Low HDL Cholesterol
2
['Yes' 'No' nan]
****************************************************************************************************
High LDL Cholesterol
2
['No' 'Yes' nan]
***

In [226]:
#Check Data Percentage after Dropping All Missing Values except Alcohol Consumption
(df.drop(columns='Alcohol Consumption').dropna().shape[0] / df.drop(columns='Alcohol Consumption').shape[0])*100

95.32000000000001

In [227]:
#Dropping Missing Values
df=df.dropna(subset=missing_value_columns,ignore_index=True)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
1,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No
2,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
3,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
4,25.0,Male,152.0,257.0,Low,Yes,No,No,28.144681,No,No,No,Low,Medium,5.504876,Low,126.0,91.0,4.297575,10.815983,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9527,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes
9528,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes
9529,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes
9530,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes


In [228]:
df.isna().sum()

Age                        0
Gender                     0
Blood Pressure             0
Cholesterol Level          0
Exercise Habits            0
Smoking                    0
Family Heart Disease       0
Diabetes                   0
BMI                        0
High Blood Pressure        0
Low HDL Cholesterol        0
High LDL Cholesterol       0
Alcohol Consumption     2465
Stress Level               0
Sleep Hours                0
Sugar Consumption          0
Triglyceride Level         0
Fasting Blood Sugar        0
CRP Level                  0
Homocysteine Level         0
Heart Disease Status       0
dtype: int64

### Data Preprocessing For Machine Learning


In [229]:
## 1 - Split Datafram into input & output
x=df.drop('Heart Disease Status',axis=1) # drop target column
y=df['Heart Disease Status'] #==> target Column

In [230]:
## 2 - Split Datafram into Train & Test
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)



In [231]:
x['Alcohol Consumption'].unique()

array(['Medium', 'Low', nan, 'High'], dtype=object)

# 3) Handleing Categorical

In [232]:
# A) Imputing Missing Data For Categorical Column 'Alcohol Consumption'
x['Alcohol Consumption'].value_counts(normalize= True) * 100

Alcohol Consumption
Medium    33.833310
Low       33.550304
High      32.616386
Name: proportion, dtype: float64

In [233]:
from sklearn.impute import SimpleImputer

simp_imputer = SimpleImputer(strategy= 'constant', fill_value= 'Most Likely Never')

x_train[['Alcohol Consumption']] = simp_imputer.fit_transform(x_train[['Alcohol Consumption']])
x_test[['Alcohol Consumption']] = simp_imputer.transform(x_test[['Alcohol Consumption']])



In [234]:
x_train['Alcohol Consumption'].value_counts(normalize= True) * 100

Alcohol Consumption
Most Likely Never    25.554556
Medium               25.119904
Low                  24.955036
High                 24.370504
Name: proportion, dtype: float64

In [235]:
# b) Encoding Ordinal & Nominal Columns
x_train.select_dtypes(include='object')

Unnamed: 0,Gender,Exercise Habits,Smoking,Family Heart Disease,Diabetes,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sugar Consumption
8708,Female,Medium,No,No,Yes,Yes,Yes,No,Most Likely Never,Low,Low
9465,Female,Low,No,Yes,Yes,Yes,No,Yes,Medium,High,Low
3973,Female,Low,No,Yes,No,No,No,No,Medium,Medium,Low
471,Female,Low,No,No,Yes,No,No,No,Most Likely Never,High,Low
1905,Male,High,Yes,Yes,Yes,Yes,No,No,High,Low,High
...,...,...,...,...,...,...,...,...,...,...,...
5734,Female,High,Yes,Yes,Yes,Yes,No,No,Medium,High,High
5191,Female,Medium,Yes,No,No,Yes,No,No,Most Likely Never,Low,High
5390,Male,High,No,No,Yes,No,Yes,Yes,High,High,Low
860,Female,Medium,Yes,No,No,No,Yes,Yes,High,High,High


In [236]:
# b-1) Encoding Ordinal Columns
Ordinal_col=['Exercise Habits','Stress Level','Sugar Consumption']
Alcohol_Consumption_rank=['Most Likely Never','Low','Medium','High']
other_rank=['Low','Medium','High']


In [237]:
from sklearn.preprocessing import OrdinalEncoder

ord_encoder = OrdinalEncoder(categories= [ ['Low','Medium','High'] ])
x_train[['Sugar Consumption']] = ord_encoder.fit_transform(x_train[['Sugar Consumption']])
x_test[['Sugar Consumption']] = ord_encoder.transform(x_test[['Sugar Consumption']])

In [238]:
ord_encoder = OrdinalEncoder(categories= [ ['Low','Medium','High'] ])
x_train[['Exercise Habits']] = ord_encoder.fit_transform(x_train[['Exercise Habits']])
x_test[['Exercise Habits']] = ord_encoder.transform(x_test[['Exercise Habits']])

In [239]:
ord_encoder = OrdinalEncoder(categories= [ ['Low','Medium','High'] ])
x_train[['Stress Level']] = ord_encoder.fit_transform(x_train[['Stress Level']])
x_test[['Stress Level']] = ord_encoder.transform(x_test[['Stress Level']])

In [240]:
Alcohol_ord_encoder = OrdinalEncoder(categories= [ ['Most Likely Never','Low','Medium','High'] ])
x_train[['Alcohol Consumption']] = Alcohol_ord_encoder.fit_transform(x_train[['Alcohol Consumption']])
x_test[['Alcohol Consumption']] = Alcohol_ord_encoder.transform(x_test[['Alcohol Consumption']])


In [241]:
x_train.select_dtypes(include='object')

Unnamed: 0,Gender,Smoking,Family Heart Disease,Diabetes,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol
8708,Female,No,No,Yes,Yes,Yes,No
9465,Female,No,Yes,Yes,Yes,No,Yes
3973,Female,No,Yes,No,No,No,No
471,Female,No,No,Yes,No,No,No
1905,Male,Yes,Yes,Yes,Yes,No,No
...,...,...,...,...,...,...,...
5734,Female,Yes,Yes,Yes,Yes,No,No
5191,Female,Yes,No,No,Yes,No,No
5390,Male,No,No,Yes,No,Yes,Yes
860,Female,Yes,No,No,No,Yes,Yes


In [242]:
nominal_col=x_train.select_dtypes(include='object').columns.unique()

In [243]:
for col in nominal_col:
    print(col)
    print(x_train[col].nunique())
    print('xxx' *100)

Gender
2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Smoking
2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Family Heart Disease
2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Diabetes
2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [244]:
# b-2) Encoding Nominal Columns (OneHotEncoder))

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop= 'first', sparse_output= False)

ohe_arr_train = ohe.fit_transform(x_train[nominal_col])
ohe_arr_test = ohe.transform(x_test[nominal_col])




In [245]:
# Create new Data Frame With The New Encoded Columns
ohe_df_train = pd.DataFrame(ohe_arr_train, columns =ohe.get_feature_names_out())
ohe_df_test = pd.DataFrame(ohe_arr_test, columns =ohe.get_feature_names_out())

In [246]:
ohe_df_train

Unnamed: 0,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
6667,0.0,1.0,1.0,1.0,1.0,0.0,0.0
6668,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6669,1.0,0.0,0.0,1.0,0.0,1.0,1.0
6670,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [247]:
ohe_df_test

Unnamed: 0,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,0.0,1.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...
2855,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2856,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2857,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2858,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [248]:
# Reset index before concatenation
x_train.reset_index(drop= True, inplace= True)
x_test.reset_index(drop= True, inplace= True)

y_train.reset_index(drop= True, inplace= True)
y_test.reset_index(drop= True, inplace= True)

In [249]:
ohe_df_train

Unnamed: 0,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
6667,0.0,1.0,1.0,1.0,1.0,0.0,0.0
6668,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6669,1.0,0.0,0.0,1.0,0.0,1.0,1.0
6670,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [250]:
# Concatinate Datafram with the new Genetaed Encoded Datafram and Drop the Original uncoded Columns
x_train = pd.concat([x_train, ohe_df_train], axis= 1).drop(columns= nominal_col, axis= 1)
x_test = pd.concat([x_test, ohe_df_test], axis= 1).drop(columns= nominal_col, axis= 1)

In [251]:
x_train

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,Exercise Habits,BMI,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,43.0,158.0,245.0,1.0,31.765836,0.0,0.0,4.262421,0.0,247.0,139.0,4.878277,10.848756,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,51.0,171.0,254.0,0.0,38.561680,2.0,2.0,8.926926,0.0,310.0,81.0,4.674548,9.993021,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,62.0,125.0,290.0,0.0,29.026949,2.0,1.0,7.910694,0.0,304.0,157.0,9.737307,11.524870,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,41.0,149.0,173.0,0.0,25.356615,0.0,2.0,6.807908,0.0,346.0,136.0,3.476395,10.529145,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,67.0,122.0,228.0,2.0,30.945695,3.0,0.0,4.023089,2.0,185.0,155.0,11.817875,15.564262,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6667,72.0,138.0,265.0,2.0,37.811559,2.0,2.0,5.366425,2.0,296.0,132.0,0.031310,17.168505,0.0,1.0,1.0,1.0,1.0,0.0,0.0
6668,55.0,157.0,280.0,1.0,38.844989,0.0,0.0,7.968552,2.0,344.0,89.0,11.421681,11.572423,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6669,59.0,173.0,178.0,2.0,30.201835,3.0,2.0,7.625772,0.0,324.0,132.0,7.694307,13.499100,1.0,0.0,0.0,1.0,0.0,1.0,1.0
6670,62.0,151.0,160.0,1.0,31.999049,3.0,2.0,6.700174,2.0,229.0,103.0,6.672989,10.689182,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [252]:
x_test

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,Exercise Habits,BMI,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,48.0,158.0,240.0,2.0,20.660680,0.0,1.0,7.658376,0.0,167.0,135.0,10.864235,14.586437,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,73.0,176.0,193.0,2.0,31.755391,2.0,1.0,9.712228,2.0,193.0,153.0,10.797849,6.290807,0.0,1.0,1.0,1.0,1.0,0.0,1.0
2,28.0,166.0,182.0,0.0,23.213594,3.0,2.0,4.084762,0.0,289.0,104.0,13.130790,14.275993,1.0,1.0,1.0,1.0,1.0,0.0,0.0
3,68.0,149.0,254.0,1.0,30.851702,2.0,0.0,8.402978,2.0,256.0,104.0,8.791903,19.194641,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,72.0,139.0,150.0,1.0,18.601993,2.0,1.0,7.115290,0.0,152.0,110.0,0.937045,5.645286,0.0,1.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2855,70.0,166.0,237.0,1.0,18.961785,2.0,2.0,5.173738,0.0,151.0,118.0,6.575023,9.898541,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2856,69.0,158.0,177.0,2.0,27.725521,0.0,2.0,4.831849,2.0,232.0,93.0,7.736347,10.480979,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2857,30.0,155.0,178.0,0.0,25.598871,1.0,1.0,5.045128,2.0,386.0,102.0,13.747258,12.141334,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2858,65.0,131.0,294.0,1.0,22.220795,1.0,2.0,8.679869,0.0,155.0,87.0,7.890579,17.630414,1.0,0.0,0.0,1.0,0.0,1.0,1.0


# 4) Handleing Numerical Columns

In [253]:
# a) Feature Scaling After Imupting Missing Values if Any

num_cols = x.select_dtypes(include= 'number').columns
num_cols

Index(['Age', 'Blood Pressure', 'Cholesterol Level', 'BMI', 'Sleep Hours',
       'Triglyceride Level', 'Fasting Blood Sugar', 'CRP Level',
       'Homocysteine Level'],
      dtype='object')

In [None]:
# Data is Normally Destributed So No need To Use Robut Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
x_train[num_cols] = min_max_scaler.fit_transform(x_train[num_cols])
x_test[num_cols]   = min_max_scaler.transform(x_test[num_cols])

In [255]:
x_train

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,Exercise Habits,BMI,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,0.403226,0.633333,0.633333,1.0,0.625686,0.0,0.0,0.043646,0.0,0.490000,0.7375,0.325118,0.389932,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0.532258,0.850000,0.693333,0.0,0.934730,2.0,2.0,0.821252,0.0,0.700000,0.0125,0.311530,0.332879,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,0.709677,0.083333,0.933333,0.0,0.501134,2.0,1.0,0.651839,0.0,0.680000,0.9625,0.649195,0.435010,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.370968,0.483333,0.153333,0.0,0.334224,0.0,2.0,0.467997,0.0,0.820000,0.7000,0.231618,0.368623,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.790323,0.033333,0.520000,2.0,0.588390,3.0,0.0,0.003748,2.0,0.283333,0.9375,0.787960,0.704325,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6667,0.870968,0.300000,0.766667,2.0,0.900618,2.0,2.0,0.227692,2.0,0.653333,0.6500,0.001845,0.811283,0.0,1.0,1.0,1.0,1.0,0.0,0.0
6668,0.596774,0.616667,0.866667,1.0,0.947614,0.0,0.0,0.661485,2.0,0.813333,0.1125,0.761535,0.438181,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6669,0.661290,0.883333,0.186667,2.0,0.554563,3.0,2.0,0.604341,0.0,0.746667,0.6500,0.512935,0.566636,1.0,0.0,0.0,1.0,0.0,1.0,1.0
6670,0.709677,0.516667,0.066667,1.0,0.636292,3.0,2.0,0.450037,2.0,0.430000,0.2875,0.444817,0.379293,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [256]:
x_test

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,Exercise Habits,BMI,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gender_Male,Smoking_Yes,Family Heart Disease_Yes,Diabetes_Yes,High Blood Pressure_Yes,Low HDL Cholesterol_Yes,High LDL Cholesterol_Yes
0,0.483871,0.633333,0.600000,2.0,0.120675,0.0,1.0,0.609776,0.0,0.223333,0.6875,0.724356,0.639131,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.887097,0.933333,0.286667,2.0,0.625211,2.0,1.0,0.952168,2.0,0.310000,0.9125,0.719928,0.086045,0.0,1.0,1.0,1.0,1.0,0.0,1.0
2,0.161290,0.766667,0.213333,0.0,0.236769,3.0,2.0,0.014030,0.0,0.630000,0.3000,0.875526,0.618433,1.0,1.0,1.0,1.0,1.0,0.0,0.0
3,0.806452,0.483333,0.693333,1.0,0.584116,2.0,0.0,0.733907,2.0,0.520000,0.3000,0.586140,0.946369,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,0.870968,0.316667,0.000000,1.0,0.027055,2.0,1.0,0.519240,0.0,0.173333,0.3750,0.062254,0.043007,0.0,1.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2855,0.838710,0.766667,0.580000,1.0,0.043417,2.0,2.0,0.195570,0.0,0.170000,0.4750,0.438283,0.326580,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2856,0.822581,0.633333,0.180000,2.0,0.441951,0.0,2.0,0.138574,2.0,0.440000,0.1625,0.515739,0.365412,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2857,0.193548,0.583333,0.186667,0.0,0.345241,1.0,1.0,0.174129,2.0,0.953333,0.2750,0.916642,0.476111,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2858,0.758065,0.183333,0.960000,1.0,0.191621,1.0,2.0,0.780066,0.0,0.183333,0.0875,0.526026,0.842079,1.0,0.0,0.0,1.0,0.0,1.0,1.0


## 5) Handle Imbalance For Target Column 'Heart Disease Status'

In [260]:
y.value_counts(normalize= True) * 100

Heart Disease Status
No     79.951742
Yes    20.048258
Name: proportion, dtype: float64

In [261]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)


In [263]:
#When we check y after sample we will find that Yes & No are the same
y_train_smote.value_counts()

Heart Disease Status
Yes    5338
No     5338
Name: count, dtype: int64