In [1]:
# importing needed Libraries 
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
## Display Max Columns in Pandas
pd.set_option('display.max_columns', None)  


In [3]:
# loading the dataset
df=pd.read_csv('heart_disease.csv')
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,Yes,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.387250,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes
9996,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes
9997,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes
9998,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes


## Understanding Dataset
#Dataset Description
| Column Name             | Description | Brief Explanation |
|-------------------------|-------------|-------------------|
| Age | The individual's age. | Age affects overall heart disease risk. |
| Gender | The individual's gender (Male or Female). | Risk can vary between males and females. |
| Blood Pressure | The individual's blood pressure (systolic). | Higher systolic pressure increases risk. |
| Cholesterol Level | The individual's total cholesterol level. | High cholesterol contributes to plaque buildup. |
| Exercise Habits | The individual's exercise habits (Low, Medium, High). | More exercise generally reduces risk. |
| Smoking | Whether the individual smokes or not (Yes or No). | Smoking is a major heart disease risk factor. |
| Family Heart Disease | Family history of heart disease (Yes or No). | Genetics can increase risk. |
| Diabetes | Whether the individual has diabetes (Yes or No). | Diabetes significantly raises heart risk. |
| BMI | The individual's body mass index. | Higher BMI may indicate obesity-related risk. |
| High Blood Pressure | Whether the individual has high blood pressure (Yes or No). | Hypertension strains the heart. |
| Low HDL Cholesterol | Whether HDL is low (Yes or No). | Low “good cholesterol” raises risk. |
| High LDL Cholesterol | Whether LDL is high (Yes or No). | High “bad cholesterol” leads to plaque buildup. |
| Alcohol Consumption | Level of alcohol intake (None, Low, Medium, High). | Excessive alcohol increases risk. |
| Stress Level | The individual's stress level (Low, Medium, High). | High stress may negatively affect the heart. |
| Sleep Hours | Number of hours the individual sleeps. | Too little sleep is linked to higher risk. |
| Sugar Consumption | Sugar intake level (Low, Medium, High). | High sugar intake harms heart health. |
| Triglyceride Level | The individual's triglyceride level. | High triglycerides (Fats in Blood) raise heart disease risk. |
| Fasting Blood Sugar | Blood sugar after fasting. | Elevated levels indicate diabetes or prediabetes. |
| CRP Level | C-reactive protein level (protein the liver produces), a marker of inflammation. | High CRP signals inflammation linked to heart issues. |
| Homocysteine Level | Homocysteine (amino acid) level, affecting blood vessel health. | High levels may damage blood vessels. |
| Heart Disease Status (Target) | Whether the individual has heart disease (Yes or No). | Target variable indicating heart disease presence. |


## Data Exploration

In [4]:
#Chec kData Types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Sleep Hours           9975 non-null   float64
 15  Sugar Consumption   

In [5]:
#Check duplicates
df.duplicated().sum()

np.int64(0)

In [6]:
#Check Missing Values
df.isna().mean().round(4).sort_values(ascending=False) *100

Alcohol Consumption     25.86
Diabetes                 0.30
Sugar Consumption        0.30
Cholesterol Level        0.30
Age                      0.29
Triglyceride Level       0.26
CRP Level                0.26
High LDL Cholesterol     0.26
High Blood Pressure      0.26
Low HDL Cholesterol      0.25
Sleep Hours              0.25
Exercise Habits          0.25
Smoking                  0.25
Fasting Blood Sugar      0.22
BMI                      0.22
Stress Level             0.22
Family Heart Disease     0.21
Homocysteine Level       0.20
Blood Pressure           0.19
Gender                   0.19
Heart Disease Status     0.00
dtype: float64

In [7]:
# Get missing value percentages
missing_values = df.isna().mean().round(4).sort_values(ascending=False) *100

# Get columns with >0% and <5% missing
missing_value_columns = missing_values[(missing_values > 0) & (missing_values <= 5)].index.tolist()
missing_value_columns

['Diabetes',
 'Sugar Consumption',
 'Cholesterol Level',
 'Age',
 'Triglyceride Level',
 'CRP Level',
 'High LDL Cholesterol',
 'High Blood Pressure',
 'Low HDL Cholesterol',
 'Sleep Hours',
 'Exercise Habits',
 'Smoking',
 'Fasting Blood Sugar',
 'BMI',
 'Stress Level',
 'Family Heart Disease',
 'Homocysteine Level',
 'Blood Pressure',
 'Gender']

In [8]:
#Check Numeric Columns 
df.describe().round()

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
count,9971.0,9981.0,9970.0,9978.0,9975.0,9974.0,9978.0,9974.0,9980.0
mean,49.0,150.0,225.0,29.0,7.0,251.0,120.0,7.0,12.0
std,18.0,18.0,44.0,6.0,2.0,87.0,24.0,4.0,4.0
min,18.0,120.0,150.0,18.0,4.0,100.0,80.0,0.0,5.0
25%,34.0,134.0,187.0,24.0,5.0,176.0,99.0,4.0,9.0
50%,49.0,150.0,226.0,29.0,7.0,250.0,120.0,7.0,12.0
75%,65.0,165.0,263.0,35.0,9.0,326.0,141.0,11.0,16.0
max,80.0,180.0,300.0,40.0,10.0,400.0,160.0,15.0,20.0


In [9]:
#Check Object Columns
df.describe(include='object')

Unnamed: 0,Gender,Exercise Habits,Smoking,Family Heart Disease,Diabetes,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sugar Consumption,Heart Disease Status
count,9981,9975,9975,9979,9970,9974,9975,9974,7414,9978,9970,10000
unique,2,3,2,2,2,2,2,2,3,3,3,2
top,Male,High,Yes,No,No,Yes,Yes,No,Medium,Medium,Low,No
freq,5003,3372,5123,5004,5018,5022,5000,5036,2500,3387,3390,8000


In [10]:
#In depth Checking form numirical Columns --> Check Destribution and Outliers
num_cols=df.select_dtypes(include='number').columns
num_cols
for col in num_cols:
    px.histogram(data_frame=df,x=col,title=col).show()
    px.box(data_frame=df,x=col).show()

In [11]:
### In depth check for categorical columns --> Check unique values and missing values
cat_cols = df.select_dtypes(include= 'object').columns

for col in cat_cols:

    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('*' * 100)


Gender
2
['Male' 'Female' nan]
****************************************************************************************************
Exercise Habits
3
['High' 'Low' 'Medium' nan]
****************************************************************************************************
Smoking
2
['Yes' 'No' nan]
****************************************************************************************************
Family Heart Disease
2
['Yes' 'No' nan]
****************************************************************************************************
Diabetes
2
['No' 'Yes' nan]
****************************************************************************************************
High Blood Pressure
2
['Yes' 'No' nan]
****************************************************************************************************
Low HDL Cholesterol
2
['Yes' 'No' nan]
****************************************************************************************************
High LDL Cholesterol
2
['No' 'Yes' nan]
***

In [12]:
#Check Data Percentage after Dropping All Missing Values except Alcohol Consumption
(df.drop(columns='Alcohol Consumption').dropna().shape[0] / df.drop(columns='Alcohol Consumption').shape[0])*100

95.32000000000001

In [13]:
#Dropping Missing Values
df=df.dropna(subset=missing_value_columns,ignore_index=True)
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
1,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No
2,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
3,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
4,25.0,Male,152.0,257.0,Low,Yes,No,No,28.144681,No,No,No,Low,Medium,5.504876,Low,126.0,91.0,4.297575,10.815983,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9527,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes
9528,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes
9529,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes
9530,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes


In [14]:
#Check unique values befor fill nan
df['Alcohol Consumption'].unique()

array(['Medium', 'Low', nan, 'High'], dtype=object)

In [15]:
#fill nan with 'Most likly Never' For Column "Alcohol Consumption"
df['Alcohol Consumption']=df['Alcohol Consumption'].fillna('Most likly Never')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
1,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No
2,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
3,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No
4,25.0,Male,152.0,257.0,Low,Yes,No,No,28.144681,No,No,No,Low,Medium,5.504876,Low,126.0,91.0,4.297575,10.815983,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9527,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes
9528,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,Most likly Never,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes
9529,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,Most likly Never,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes
9530,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes


## Feature Engineering

In [17]:
def Age_Segment(x):
    if int(x) <20:
        return 'Teenager'
    elif int(x) < 35:
        return 'Young Adult'
    elif int(x) < 50:
        return 'Adult'
    elif int(x) < 66:
        return 'Middle-Aged'
    else:
        return 'Senior'
df['Age_Segment']=df['Age'].apply(Age_Segment)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
px.histogram(data_frame=df,x=df['Age_Segment'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [19]:
def Blood_Pressure_Ranges(x):
    if int(x) < 80:
        return 'Low'
    elif int(x) <= 120:
        return 'Normal  [80-120]'
    elif int(x) < 130:
        return 'Elevated  [120-129]'
    elif int(x) < 140:
        return 'Stage 1 Hypertension  [130-139]'
    else:
        return 'Stage 2 Hypertension [140 & above]'
    
df['Blood_Pressure_Ranges']=df['Blood Pressure'].apply(Blood_Pressure_Ranges)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [20]:
px.histogram(data_frame=df,x=df['Blood_Pressure_Ranges'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [21]:
def sleep_ranges(x):
    if 2< int(x) < 5.9999:
        return 'Light'
    elif 6 < int(x) <= 8:
        return 'Normal'
    else:
        return 'Deep'
    
df['Sleep_Type']=df['Sleep Hours'].apply(sleep_ranges)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
px.histogram(data_frame=df,x=df['Sleep_Type'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [23]:
def BMI(x):
    if int(x) < 18.5:
        return 'Underweight'
    elif int(x) <= 24.9:
        return 'Normal weight'
    elif int(x) <= 29.9:
        return 'Overweight'
    else:
        return 'Obesity'
    
df['BMI categories']=df['BMI'].apply(BMI)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
px.histogram(data_frame=df,x=df['BMI categories'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [25]:
def triglycerides(x):
    if int(x) < 150:
        return 'Normal'
    elif int(x) <= 199:
        return 'Borderline'
    elif int(x) <= 499:
        return 'High'
    else:
        return 'Very high'
    
df['trigly_level']=df['Triglyceride Level'].apply(triglycerides)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [26]:
px.histogram(data_frame=df,x=df['trigly_level'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [27]:
def CRP_level(x):
    if int(x) < 1:
        return 'Normal/Low'
    elif int(x) <= 10:
        return 'Moderate Elevation'
    elif int(x) <= 50:
        return 'Marked Elevation'
    else:
        return 'Severe Elevation'
    
df['CRP_Group']=df['CRP Level'].apply(CRP_level)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
px.histogram(data_frame=df,x=df['CRP_Group'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [29]:
def Homocysteine_Level(x):
    if int(x) < 15:
        return 'Normal'
    elif int(x) <= 30:
        return 'Moderate Elevation'
    else:
        return 'Severe Elevation'
    
df['Homocysteine_Category']=df['Homocysteine Level'].apply(CRP_level)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
px.histogram(data_frame=df,x=df['Homocysteine_Category'],text_auto=True).update_xaxes(categoryorder = 'max descending')

In [31]:
df

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status,Age_Segment,Blood_Pressure_Ranges,Sleep_Type,BMI categories,trigly_level,CRP_Group,Homocysteine_Category
0,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,Yes,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No,Senior,Stage 2 Hypertension [140 & above],Normal,Overweight,Normal,Moderate Elevation,Marked Elevation
1,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,Yes,Yes,Low,Low,4.440440,Low,393.0,92.0,12.709873,11.230926,No,Adult,Elevated [120-129],Light,Overweight,High,Marked Elevation,Marked Elevation
2,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,No,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No,Young Adult,Elevated [120-129],Light,Normal weight,High,Marked Elevation,Moderate Elevation
3,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,No,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No,Middle-Aged,Stage 2 Hypertension [140 & above],Normal,Normal weight,High,Moderate Elevation,Moderate Elevation
4,25.0,Male,152.0,257.0,Low,Yes,No,No,28.144681,No,No,No,Low,Medium,5.504876,Low,126.0,91.0,4.297575,10.815983,No,Young Adult,Stage 2 Hypertension [140 & above],Light,Overweight,Normal,Moderate Elevation,Moderate Elevation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9527,25.0,Female,136.0,243.0,Medium,Yes,No,No,18.788791,Yes,No,Yes,Medium,High,6.834954,Medium,343.0,133.0,3.588814,19.132004,Yes,Young Adult,Stage 1 Hypertension [130-139],Deep,Underweight,High,Moderate Elevation,Marked Elevation
9528,38.0,Male,172.0,154.0,Medium,No,No,No,31.856801,Yes,No,Yes,Most likly Never,High,8.247784,Low,377.0,83.0,2.658267,9.715709,Yes,Adult,Stage 2 Hypertension [140 & above],Normal,Obesity,High,Moderate Elevation,Moderate Elevation
9529,73.0,Male,152.0,201.0,High,Yes,No,Yes,26.899911,No,Yes,Yes,Most likly Never,Low,4.436762,Low,248.0,88.0,4.408867,9.492429,Yes,Senior,Stage 2 Hypertension [140 & above],Light,Overweight,High,Moderate Elevation,Moderate Elevation
9530,23.0,Male,142.0,299.0,Low,Yes,No,Yes,34.964026,Yes,No,Yes,Medium,High,8.526329,Medium,113.0,153.0,7.215634,11.873486,Yes,Young Adult,Stage 2 Hypertension [140 & above],Normal,Obesity,Normal,Moderate Elevation,Marked Elevation


In [32]:
cat_cols_FE = df.select_dtypes(include= 'object').columns

for col in cat_cols_FE:

    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('*' * 100)


Gender
2
['Female' 'Male']
****************************************************************************************************
Exercise Habits
3
['High' 'Low' 'Medium']
****************************************************************************************************
Smoking
2
['No' 'Yes']
****************************************************************************************************
Family Heart Disease
2
['Yes' 'No']
****************************************************************************************************
Diabetes
2
['Yes' 'No']
****************************************************************************************************
High Blood Pressure
2
['No' 'Yes']
****************************************************************************************************
Low HDL Cholesterol
2
['Yes' 'No']
****************************************************************************************************
High LDL Cholesterol
2
['No' 'Yes']
***********************************

In [33]:
df.columns.to_list()

['Age',
 'Gender',
 'Blood Pressure',
 'Cholesterol Level',
 'Exercise Habits',
 'Smoking',
 'Family Heart Disease',
 'Diabetes',
 'BMI',
 'High Blood Pressure',
 'Low HDL Cholesterol',
 'High LDL Cholesterol',
 'Alcohol Consumption',
 'Stress Level',
 'Sleep Hours',
 'Sugar Consumption',
 'Triglyceride Level',
 'Fasting Blood Sugar',
 'CRP Level',
 'Homocysteine Level',
 'Heart Disease Status',
 'Age_Segment',
 'Blood_Pressure_Ranges',
 'Sleep_Type',
 'BMI categories',
 'trigly_level',
 'CRP_Group',
 'Homocysteine_Category']

In [34]:
new_order = ['Gender','Age','Age_Segment','Blood Pressure','Blood_Pressure_Ranges','High Blood Pressure','Stress Level','Cholesterol Level','Low HDL Cholesterol','High LDL Cholesterol',
             'Exercise Habits','Smoking','Diabetes','Sugar Consumption','Fasting Blood Sugar','BMI','BMI categories','Alcohol Consumption','Sleep Hours','Sleep_Type',
             'Triglyceride Level','trigly_level','CRP Level','CRP_Group','Homocysteine Level','Homocysteine_Category','Family Heart Disease','Heart Disease Status']
df=df[new_order]
df

Unnamed: 0,Gender,Age,Age_Segment,Blood Pressure,Blood_Pressure_Ranges,High Blood Pressure,Stress Level,Cholesterol Level,Low HDL Cholesterol,High LDL Cholesterol,Exercise Habits,Smoking,Diabetes,Sugar Consumption,Fasting Blood Sugar,BMI,BMI categories,Alcohol Consumption,Sleep Hours,Sleep_Type,Triglyceride Level,trigly_level,CRP Level,CRP_Group,Homocysteine Level,Homocysteine_Category,Family Heart Disease,Heart Disease Status
0,Female,69.0,Senior,146.0,Stage 2 Hypertension [140 & above],No,High,286.0,Yes,No,High,No,Yes,Medium,157.0,25.221799,Overweight,Medium,8.744034,Normal,133.0,Normal,9.355389,Moderate Elevation,19.298875,Marked Elevation,Yes,No
1,Male,46.0,Adult,126.0,Elevated [120-129],No,Low,216.0,Yes,Yes,Low,No,No,Low,92.0,29.855447,Overweight,Low,4.440440,Light,393.0,High,12.709873,Marked Elevation,11.230926,Marked Elevation,No,No
2,Female,32.0,Young Adult,122.0,Elevated [120-129],Yes,High,293.0,No,Yes,High,Yes,No,High,94.0,24.130477,Normal weight,Low,5.249405,Light,293.0,High,12.509046,Marked Elevation,5.961958,Moderate Elevation,Yes,No
3,Male,60.0,Middle-Aged,166.0,Stage 2 Hypertension [140 & above],Yes,High,242.0,No,No,Low,Yes,Yes,High,154.0,20.486289,Normal weight,Low,7.030971,Normal,263.0,High,10.381259,Moderate Elevation,8.153887,Moderate Elevation,Yes,No
4,Male,25.0,Young Adult,152.0,Stage 2 Hypertension [140 & above],No,Medium,257.0,No,No,Low,Yes,No,Low,91.0,28.144681,Overweight,Low,5.504876,Light,126.0,Normal,4.297575,Moderate Elevation,10.815983,Moderate Elevation,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9527,Female,25.0,Young Adult,136.0,Stage 1 Hypertension [130-139],Yes,High,243.0,No,Yes,Medium,Yes,No,Medium,133.0,18.788791,Underweight,Medium,6.834954,Deep,343.0,High,3.588814,Moderate Elevation,19.132004,Marked Elevation,No,Yes
9528,Male,38.0,Adult,172.0,Stage 2 Hypertension [140 & above],Yes,High,154.0,No,Yes,Medium,No,No,Low,83.0,31.856801,Obesity,Most likly Never,8.247784,Normal,377.0,High,2.658267,Moderate Elevation,9.715709,Moderate Elevation,No,Yes
9529,Male,73.0,Senior,152.0,Stage 2 Hypertension [140 & above],No,Low,201.0,Yes,Yes,High,Yes,Yes,Low,88.0,26.899911,Overweight,Most likly Never,4.436762,Light,248.0,High,4.408867,Moderate Elevation,9.492429,Moderate Elevation,No,Yes
9530,Male,23.0,Young Adult,142.0,Stage 2 Hypertension [140 & above],Yes,High,299.0,No,Yes,Low,Yes,Yes,Medium,153.0,34.964026,Obesity,Medium,8.526329,Normal,113.0,Normal,7.215634,Moderate Elevation,11.873486,Marked Elevation,No,Yes


In [35]:
df.to_csv('HD_NEW.csv')

## Data Analysis

## Univariate Analysis

In [36]:
#What is the distribution of Age in the dataset?
px.histogram(data_frame=df,x='Age',title='Age')

In [37]:
#what is percentage of Age segment in the dataset?
px.pie(data_frame=df,names='Age_Segment',title='Age Segments')


In [38]:
#What are the most common Exercise Habit levels (Low/Medium/High)?
px.histogram(data_frame=df,x='Exercise Habits',text_auto=True).update_xaxes(categoryorder='max descending')

In [39]:
#What is the distribution of Cholesterol Levels across individuals?
px.histogram(data_frame=df,x='Cholesterol Level')

In [40]:
#What is the frequency distribution of Gender?
px.histogram(data_frame=df,x='Gender',text_auto=True).update_xaxes(categoryorder='max descending')

In [41]:
#What is the Percentage of Gender?
px.pie(data_frame=df,names='Gender')

In [42]:
#How are individuals distributed across BMI categories?
px.histogram(data_frame=df,x='BMI')

In [43]:
#Compare Betwen BMI categories
px.histogram(data_frame=df,x='BMI categories',text_auto=True).update_xaxes(categoryorder='max descending')

In [44]:
#Compare Stree levels
px.pie(data_frame=df,names='Stress Level')

In [45]:
#How many individuals fall under each level of Alcohol Consumption?
px.histogram(data_frame=df,x='Alcohol Consumption',text_auto=True).update_xaxes(categoryorder='max descending')

In [46]:
# How many individuals fall into each Stress Level category?
px.histogram(data_frame=df,x='Stress Level',text_auto=True).update_xaxes(categoryorder='max descending')

## Bivariate Analysis

In [47]:
# Is there a relationship between Age and Blood Pressure?
df_AGE_Blood_Pressure=df.groupby('Age')['Blood Pressure'].mean().round().reset_index()
df_AGE_Blood_Pressure[['Age','Blood Pressure']].corr()

Unnamed: 0,Age,Blood Pressure
Age,1.0,-0.254313
Blood Pressure,-0.254313,1.0


In [48]:
px.scatter_matrix(data_frame= df_AGE_Blood_Pressure[['Age','Blood Pressure']])

In [49]:
# How does Gender influence Cholesterol Level?
G_C_df = df.groupby('Gender')['Cholesterol Level'].mean().round(2).reset_index().sort_values(by= 'Cholesterol Level', ascending= False)
G_C_df


Unnamed: 0,Gender,Cholesterol Level
0,Female,225.79
1,Male,224.79


In [50]:
px.bar(data_frame=G_C_df,x='Gender',y='Cholesterol Level',text_auto=True)

In [51]:
#	Is Smoking associated with Heart Disease Status?
px.histogram(data_frame= df, x= 'Heart Disease Status',  barmode= 'group',facet_col='Smoking',text_auto=True).update_xaxes(categoryorder='max descending')

In [52]:
#	What is the relationship between Sleep Hours and Stress Level?
S_S_df = df.groupby('Stress Level')['Sleep Hours'].mean().round(2).reset_index()
S_S_df
px.bar(data_frame=S_S_df,x='Stress Level',y='Sleep Hours',text_auto=True)

In [53]:
# How do Exercise Habits relate to BMI? How does BMI vary across different Exercise Habit levels?
E_B_df = df.groupby('Exercise Habits')['BMI'].mean().round(2).reset_index()
E_B_df
px.bar(data_frame=E_B_df,x='Exercise Habits',y='BMI',text_auto=True).show()
px.histogram(data_frame= df, x= 'Exercise Habits', color= 'BMI categories',barmode= 'group',text_auto= True).update_xaxes(categoryorder='min descending')

In [54]:
# Is High Blood Pressure associated with higher Triglyceride Levels?  
px.histogram(data_frame= df, x= 'Blood_Pressure_Ranges', color= 'trigly_level',barmode= 'group',text_auto= True).update_xaxes(categoryorder='max descending')


In [55]:
# Is High LDL Cholesterol associated with Heart Disease Status?
px.histogram(data_frame= df, x= 'High LDL Cholesterol', color= 'Heart Disease Status',barmode= 'group',text_auto= True).update_xaxes(categoryorder='min descending')
	

In [56]:
# How does Age differ between individuals with and without Heart Disease?
px.histogram(data_frame= df, x= 'Age', color= 'Heart Disease Status',barmode= 'group',text_auto= True).update_xaxes(categoryorder='max descending').show()
px.histogram(data_frame= df, x= 'Age_Segment', color= 'Heart Disease Status',barmode= 'group',text_auto= True).update_xaxes(categoryorder='max descending')


In [57]:
# Is there a relationship between Smoking and Heart Disease Status?
px.histogram(data_frame= df, x= 'Smoking', color= 'Heart Disease Status',barmode= 'group',text_auto= True).update_xaxes(categoryorder='max descending')


In [58]:
# Does Family Heart Disease	lead to Heart Disease?
px.histogram(data_frame= df, x= 'Family Heart Disease', color= 'Heart Disease Status',barmode= 'group',text_auto= True).update_xaxes(categoryorder='min descending')

# Multivariate Analysis


In [59]:
num_cols=df.select_dtypes(include='number').columns.tolist()
num_cols

['Age',
 'Blood Pressure',
 'Cholesterol Level',
 'Fasting Blood Sugar',
 'BMI',
 'Sleep Hours',
 'Triglyceride Level',
 'CRP Level',
 'Homocysteine Level']

In [60]:
df['Heart Disease Binary']=df['Heart Disease Status'].map({'No':0,'Yes':1})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [61]:
# How do Age, Blood Pressure, and Cholesterol Level together influence Heart Disease Status?
px.scatter_matrix(data_frame= df[['Heart Disease Binary','Age','Blood Pressure','Cholesterol Level']],labels={'Cholesterol Level':'Col Level','Heart Disease Binary':'H D','Blood Pressure':'Pressure'}).show()
df[['Heart Disease Binary','Age','Blood Pressure','Cholesterol Level']].corr()

Unnamed: 0,Heart Disease Binary,Age,Blood Pressure,Cholesterol Level
Heart Disease Binary,1.0,-0.005652,-0.014643,-0.000481
Age,-0.005652,1.0,-0.019385,0.013016
Blood Pressure,-0.014643,-0.019385,1.0,-0.011426
Cholesterol Level,-0.000481,0.013016,-0.011426,1.0


In [62]:
# Does a combination of BMI, Triglyceride, and Homocysteine increase the likelihood of Heart Disease?
#['Age','Blood Pressure','Cholesterol Level','Fasting Blood Sugar','BMI','Sleep Hours','Triglyceride Level','CRP Level','Homocysteine Level']
px.scatter_matrix(data_frame= df[['Heart Disease Binary','BMI','Triglyceride Level','Homocysteine Level']],labels={'Triglyceride Level':'Trigly','Heart Disease Binary':'H D','Homocysteine Level':'Homocysteine'}).show()
df[['Heart Disease Binary','BMI','Triglyceride Level','Homocysteine Level']].corr()

Unnamed: 0,Heart Disease Binary,BMI,Triglyceride Level,Homocysteine Level
Heart Disease Binary,1.0,0.021139,0.002314,0.008731
BMI,0.021139,1.0,0.003111,0.006089
Triglyceride Level,0.002314,0.003111,1.0,-0.004191
Homocysteine Level,0.008731,0.006089,-0.004191,1.0


In [63]:
# Can CRP Level, Homocysteine Level, and Fasting Blood Sugar predict Heart Disease Status?
#['Age','Blood Pressure','Cholesterol Level','Fasting Blood Sugar','BMI','Sleep Hours','Triglyceride Level','CRP Level','Homocysteine Level']
px.scatter_matrix(data_frame= df[['Heart Disease Binary','CRP Level','Fasting Blood Sugar','Sleep Hours']],labels={'Fasting Blood Sugar':'Fasting Suger','CRP Level':'CRP','Heart Disease Binary':'H D'}).show()
df[['Heart Disease Binary','CRP Level','Fasting Blood Sugar','Sleep Hours']].corr()


Unnamed: 0,Heart Disease Binary,CRP Level,Fasting Blood Sugar,Sleep Hours
Heart Disease Binary,1.0,-0.006405,-0.005172,0.000302
CRP Level,-0.006405,1.0,0.012314,-0.001124
Fasting Blood Sugar,-0.005172,0.012314,1.0,0.012678
Sleep Hours,0.000302,-0.001124,0.012678,1.0


In [64]:
# Get Category Columns
df.select_dtypes('object').columns.tolist()

['Gender',
 'Age_Segment',
 'Blood_Pressure_Ranges',
 'High Blood Pressure',
 'Stress Level',
 'Low HDL Cholesterol',
 'High LDL Cholesterol',
 'Exercise Habits',
 'Smoking',
 'Diabetes',
 'Sugar Consumption',
 'BMI categories',
 'Alcohol Consumption',
 'Sleep_Type',
 'trigly_level',
 'CRP_Group',
 'Homocysteine_Category',
 'Family Heart Disease',
 'Heart Disease Status']

In [65]:
cat_col=['Heart Disease Status','Gender','Age_Segment','Blood_Pressure_Ranges','High Blood Pressure','Stress Level','Low HDL Cholesterol','High LDL Cholesterol','Exercise Habits','Smoking','Diabetes','Sugar Consumption','BMI categories','Alcohol Consumption','Sleep_Type','trigly_level','CRP_Group','Homocysteine_Category','Family Heart Disease']
cat_col


['Heart Disease Status',
 'Gender',
 'Age_Segment',
 'Blood_Pressure_Ranges',
 'High Blood Pressure',
 'Stress Level',
 'Low HDL Cholesterol',
 'High LDL Cholesterol',
 'Exercise Habits',
 'Smoking',
 'Diabetes',
 'Sugar Consumption',
 'BMI categories',
 'Alcohol Consumption',
 'Sleep_Type',
 'trigly_level',
 'CRP_Group',
 'Homocysteine_Category',
 'Family Heart Disease']

In [66]:
# Does a combination of Smoking, Alcohol Consumption, and Stress Level increase the likelihood of Heart Disease?
#group_df = df.groupby(cat_col)['Age'].count().round(2).reset_index() #---> If I Pass Any Category Column So i can get the Heart Disease Prediction for that category
group_df = df.groupby(['Heart Disease Status','Smoking','Family Heart Disease','Age_Segment','Gender'])['Age'].count().round(2).reset_index()

group_df_1=group_df.rename(columns={"Age": "Per"})
group_df_1
px.scatter_matrix(group_df_1.drop('Per',axis=1))

In [67]:
group_df_1=group_df_1.reset_index(drop=1)
group_df_1['Per']=((group_df_1['Per']/group_df_1['Per'].sum())*100)

In [68]:
group_df_1[group_df_1['Heart Disease Status']=='Yes'].sort_values(by='Per',ascending=False)

Unnamed: 0,Heart Disease Status,Smoking,Family Heart Disease,Age_Segment,Gender,Per
65,Yes,Yes,No,Senior,Male,0.744859
42,Yes,No,No,Middle-Aged,Female,0.713386
64,Yes,Yes,No,Senior,Female,0.702896
41,Yes,No,No,Adult,Male,0.692405
43,Yes,No,No,Middle-Aged,Male,0.692405
70,Yes,Yes,Yes,Adult,Female,0.681914
72,Yes,Yes,Yes,Middle-Aged,Female,0.660932
40,Yes,No,No,Adult,Female,0.660932
78,Yes,Yes,Yes,Young Adult,Female,0.650441
60,Yes,Yes,No,Adult,Female,0.63995


In [69]:
# Analysis for all Category Columns/Heart Disease Affecting Percentage 
#cat_col=['Heart Disease Status','Gender','Age_Segment','Blood_Pressure_Ranges','High Blood Pressure','Stress Level','Low HDL Cholesterol','High LDL Cholesterol','Exercise Habits','Smoking','Diabetes','Sugar Consumption','BMI categories','Alcohol Consumption','Sleep_Type','trigly_level','CRP_Group','Homocysteine_Category','Family Heart Disease']
for col in cat_col:
    print(group_df_1[group_df_1['Heart Disease Status']=='Yes'].groupby(col)['Per'].sum().round(2).reset_index().sort_values(by='Per',ascending=False))
    print('*' * 100) 

  Heart Disease Status    Per
0                  Yes  20.05
****************************************************************************************************
   Gender    Per
0  Female  10.38
1    Male   9.67
****************************************************************************************************
   Age_Segment   Per
1  Middle-Aged  4.98
0        Adult  4.86
2       Senior  4.85
4  Young Adult  4.76
3     Teenager  0.60
****************************************************************************************************


KeyError: 'Blood_Pressure_Ranges'

In [None]:
group_df_1.to_csv('cleaned_df.csv')

import streamlit as st

In [85]:
%%writefile Heart_Disease_deployment.py

import streamlit as st
import pandas as pd
import plotly.express as px

html_title = """<h1 style="color:white;text-align:center;"> <span style="color:red">Heart Disease </span> Risk Factor Exploratory Data Analysis </h1>"""
st.markdown(html_title, unsafe_allow_html=True)
    # Set Title 
st.set_page_config(layout='wide', page_title= 'Heart Disease Risk Factor EDA',page_icon='💔')

page = st.sidebar.radio('Page', ['Home','Dash Board', 'Statistics', 'Dynamic Reports'])

df = pd.read_csv('cleaned_df.csv', index_col= 0)

if page == 'Home':



    # Insert Image
    col1, col2, col3 = st.columns([1,2,1])

    
    col2.image(width=1000,image='https://www.riversidehealthcare.org/sites/default/files/healthcurrents/GettyImages-1344030014.jpg')


    st.header('Dataset Overview')
    col1, col2 = st.columns([2,1])

    col1.dataframe(df,height=1100)

        # Create table of column descriptions
    data = {
        "Column Name": [
            "Gender", "Age", "Age_Segment", "Blood Pressure",
            "Blood_Pressure_Ranges", "High Blood Pressure", "Stress Level",
            "Cholesterol Level", "Low HDL Cholesterol", "High LDL Cholesterol","Exercise Habits","Smoking","Diabetes","Sugar Consumption",
            "Fasting Blood Sugar", "BMI", "BMI categories", "Alcohol Consumption",
            "Sleep Hours", "Sleep_Type", "Triglyceride Level", "trigly_level",
            "CRP Level", "CRP_Group", "Homocysteine Level",
            "Homocysteine_Category", "Family Heart Disease",
            "Heart Disease Status", "Heart Disease Binary"
        ],
        "Description": [
            "Biological sex of the individual",
            "Age in years",
            "Categorized age segment",
            "Systolic blood pressure (mm/Hg)",
            "Blood pressure classification",
            "Whether the person has high BP",
            "Self-reported stress level",
            "Total cholesterol (mg/dL)",
            "Indicates low HDL cholesterol",
            "Indicates High HDL cholesterol",
            "Exercise habits (Low, Medium, High)",
            "Smoker or not (Yes or No)",
            "Diabetes or not (Yes or No)",
            "Daily sugar intake level",
            "Fasting blood glucose level",
            "Body Mass Index",
            "BMI classification",
            "Alcohol consumption level",
            "Average daily sleep duration",
            "Sleep quality category",
            "Triglyceride level (mg/dL)",
            "Triglyceride range classification",
            "C-Reactive Protein level",
            "CRP classification",
            "Homocysteine level",
            "Homocysteine category",
            "Family history of heart disease",
            "Heart disease type/status",
            "Binary heart disease indicator"
        ]
    }

    desc_df = pd.DataFrame(data)

    # Display table
    col2.subheader("📝 Column Descriptions")
    col2.table(desc_df)

elif page == 'Statistics':

    #col1, col2 = st.columns([1,1])

    #with col1:
    cat_col = [
        'Heart Disease Status','Gender','Age_Segment','Blood_Pressure_Ranges',
        'High Blood Pressure','Stress Level','Low HDL Cholesterol','High LDL Cholesterol',
        'Exercise Habits','Smoking','Diabetes','Sugar Consumption','BMI categories',
        'Alcohol Consumption','Sleep_Type','trigly_level','CRP_Group',
        'Homocysteine_Category','Family Heart Disease'
    ]

    st.title("Categorical Analysis for Heart Disease")

    for col in cat_col:
        result = df[df['Heart Disease Status'] == 'Yes'].groupby(col)['Per'].sum().round(2).reset_index().sort_values('Per', ascending=False)
        
        with st.expander(f"📊 {col}"):
            col1, col2 = st.columns([1,1])

            #st.dataframe(result)
            col1.dataframe(result)
            Chart_Type=col1.radio('Chart Type :',options=['Histrogram','Pie'],key=col)
            if Chart_Type=='Histrogram':

                col2.plotly_chart(px.histogram(result, x=col, y='Per',text_auto= True, title=f"{col}"))
            else :
                col2.plotly_chart(px.pie(result, names=col, values='Per', title=f"{col}"))


elif page == 'Dash Board':

    cat_col = [
        'Heart Disease Status','Gender','Age_Segment','Blood_Pressure_Ranges',
        'High Blood Pressure','Stress Level','Low HDL Cholesterol','High LDL Cholesterol',
        'Exercise Habits','Smoking','Diabetes','Sugar Consumption','BMI categories',
        'Alcohol Consumption','Sleep_Type','trigly_level','CRP_Group',
        'Homocysteine_Category','Family Heart Disease'
    ]

    st.title("Categorical Analysis for Heart Disease")

    for col in cat_col:
        result = df[df['Heart Disease Status'] == 'Yes'].groupby(col)['Per'].sum().round(2).reset_index().sort_values('Per', ascending=False)
        
        col1, col2 = st.columns([1,1])
        
        col2.dataframe(result)
        Chart_Type=col2.radio('Chart Type :',options=['Histrogram','Pie'],key=col)

        if Chart_Type=='Histrogram':
            col1.plotly_chart(px.histogram(result, x=col, y='Per',text_auto= True, title=f"{col}"))
        else :
            col1.plotly_chart(px.pie(result, names=col, values='Per', title=f"{col}"))

        
        co1,col2,col3 =st.columns([1,2,1])

        col2.write(f"### 💗 ▂▃▅▇ {col.upper()} ANALYSIS ▇▅▃▂ 💗")
        st.write("-----")


elif page =='Dynamic Reports' :

    col1, col2 = st.columns([3,1])

    All_gender =  ['Choose'] + df.Gender.unique().tolist() 
    Gender = st.sidebar.selectbox('Gender', All_gender)

    age_group=st.sidebar.selectbox('Age Group', ['All Age Groups','Teenager', 'Young Adult','Adult', 'Middle-Aged','Senior'])

    Boold_pressure_ranges =  ['All Ranges'] + ['Normal  [80-120]','Elevated  [120-129]','Stage 1 Hypertension  [130-139]','Stage 2 Hypertension [140 & above]'] 
    
    Boold_pressure_ranges = st.sidebar.selectbox('Blood Pressure Ranges', Boold_pressure_ranges)

    Alcohol_Consumption =  ['All'] +  ['Most likly Never','Low','Medium','High']
    
    Alcohol_Consumption = st.sidebar.selectbox('Alcohol Consumption', Alcohol_Consumption)

    BMI_categories =  ['All Categories'] + ['Underweight','Normal weight','Overweight','Obesity'] 
    
    BMI_categories = st.sidebar.selectbox('BMI categories', BMI_categories)

    CRP_Group =  ['All Groups'] + ['Normal/Low', 'Marked Elevation','Moderate Elevation']
    
    CRP_Group = st.sidebar.selectbox('CRP Group', CRP_Group)    
    
    Homocysteine_Category =  ['All Categories'] + ['Marked Elevation', 'Moderate Elevation','Severe Elevation'] 
    
    Homocysteine_Category = st.sidebar.selectbox('Homocysteine categories', Homocysteine_Category)


    Stress=st.sidebar.radio('Stess Level', options=['All'] + ['Low','Medium','High'],horizontal=True )

    Exercise_Habits=st.sidebar.radio('Exercise Habits', options=['All'] +  ['Low','Medium','High'],horizontal=True )

    Smoker=st.sidebar.radio('Smoker',options=['All','Yes','No'],horizontal=True)

    Diabetes=st.sidebar.radio('Diabetes',options=['All','Yes','No'],horizontal=True)

    Sleep=st.sidebar.radio('Sleep Type',options=['Any','Normal', 'Light', 'Deep'],horizontal=True)

    trigly_level = st.sidebar.radio('Triglyceride Level', ['All'] + ['Normal','High','Borderline'],horizontal=True)


    Family_Heart_Disease=st.sidebar.radio('Family Heart History',options=['All','Yes','No'],horizontal=True)


    dF_select=col2.multiselect('Data Frame',df.drop(columns=['Per','Heart Disease Status'],axis=1).columns,max_selections=26,default=df.drop(columns=['Per','Heart Disease Status'],axis=1).columns)

    if dF_select != '':

        df_filtered_col = ['Heart Disease Status'] + dF_select

        df_filtered=df[df['Heart Disease Status']=='Yes'].groupby(df_filtered_col)['Per'].sum()
        df_filtered=df_filtered.reset_index().sort_values(by='Per',ascending=False)

    
    if Gender != 'Choose' and 'Gender' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Gender'] == Gender]

    if Smoker != 'All' and 'Smoking' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Smoking'] == Smoker]

    if age_group != 'All Age Groups' and 'Age_Segment' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Age_Segment'] == age_group]

    if Boold_pressure_ranges != 'All Ranges' and 'Blood_Pressure_Ranges' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Blood_Pressure_Ranges'] == Boold_pressure_ranges]


    if Alcohol_Consumption != 'All' and 'Alcohol Consumption' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Alcohol Consumption'] == Alcohol_Consumption]       


    if BMI_categories != 'All Categories' and 'BMI categories' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['BMI categories'] == BMI_categories]  

    if CRP_Group != 'All Groups' and 'CRP_Group' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['CRP_Group'] == CRP_Group]  

    if Homocysteine_Category != 'All Categories' and 'Homocysteine_Category' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Homocysteine_Category'] == Homocysteine_Category]  

    if Stress != 'All' and 'Stress Level' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Stress Level'] == Stress]  
 
    if Exercise_Habits != 'All' and 'Exercise Habits' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Exercise Habits'] == Exercise_Habits]     

    if Diabetes != 'All' and 'Diabetes' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Diabetes'] == Diabetes]  

    if Sleep != 'Any' and 'Sleep_Type' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Sleep_Type'] == Sleep]  

    if trigly_level != 'All' and 'trigly_level' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['trigly_level'] == trigly_level] 


    if Family_Heart_Disease != 'All' and 'Family Heart Disease' in df_filtered_col:

        df_filtered = df_filtered[df_filtered['Family Heart Disease'] == Family_Heart_Disease] 


    if dF_select != '':
        col1.dataframe(df_filtered)
        st.plotly_chart(px.histogram(df_filtered, x=dF_select, y='Per',barmode='overlay',height=700,text_auto= True))
        #for col in dF_select:
         #   Chart_Type=st.radio('Chart Type :',options=['Histrogram','Pie'],key=col)

            #if Chart_Type=='Histrogram':
             #   st.plotly_chart(px.histogram(df_filtered, x=['Age_Segment'], y='Per',color='Smoking',barmode='group',facet_col='Gender',text_auto= True, title=f"{col}"))
            #else :
            #    st.plotly_chart(px.pie(df_filtered, names=col, values='Per', title=f"{col}"))
            
    else :
        col1.dataframe(df)





Overwriting Heart_Disease_deployment.py


In [70]:
! streamlit run Heart_Disease_deployment.py

^C


In [None]:
import plotly.io as pio
# Reset to Default theme
pio.templates.default='plotly'

