# Heart Disease Risk Analysis Data

- Download the CDC data from this location

  https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease/download?datasetVersionNumber=6

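- Unzip the file into a `data/` folder next to this notebook (the code below reads `./data/2020/heart_2020_cleaned.csv`)
- Use the `2020/heart_2020_cleaned.csv` file, because it contains the `HeartDisease` Yes/No column used as the target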
  

### Prepare the dataset

**Preparation:**

- Load the 2020/heart_2020_cleaned.csv
- Fill missing values with zeros.
- Set all the column names to lowercase
- Do a train/validation/test split with a 60%/20%/20% distribution.
- Use the train_test_split function and set the random_state parameter to 1.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split



In [34]:
# Load the cleaned 2020 survey extract, normalise the header to lowercase
# and replace every missing value with 0 -- all in a single chain so the
# cell yields the same `df` no matter how often it is re-run.
df = (
    pd.read_csv('./data/2020/heart_2020_cleaned.csv', sep=',', quotechar='"')
    .rename(columns=str.lower)  # lowercase every column label
    .fillna(0)                  # missing values -> 0
)

# Show each column with its dtype (object columns are the categorical ones)
print(df.dtypes)

# Preview the first rows
df.head()

heartdisease         object
bmi                 float64
smoking              object
alcoholdrinking      object
stroke               object
physicalhealth      float64
mentalhealth        float64
diffwalking          object
sex                  object
agecategory          object
race                 object
diabetic             object
physicalactivity     object
genhealth            object
sleeptime           float64
asthma               object
kidneydisease        object
skincancer           object
dtype: object


Unnamed: 0,heartdisease,bmi,smoking,alcoholdrinking,stroke,physicalhealth,mentalhealth,diffwalking,sex,agecategory,race,diabetic,physicalactivity,genhealth,sleeptime,asthma,kidneydisease,skincancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [38]:
# Name of the label column; it must never appear in either feature list.
target = 'heartdisease'

# Numeric features, taken from the column dtypes. The target is filtered out
# explicitly: once it has been encoded to 1/0 below it is numeric itself, and
# a re-run of this cell would otherwise list it as a feature.
features_numeric = [
    col for col in df.select_dtypes(include=[np.number]).columns if col != target
]

# Categorical (object) features, again without the target. Filtering instead
# of list.remove() keeps the cell re-runnable: remove() raises ValueError when
# the target is no longer an object column.
features_category = [
    col for col in df.select_dtypes(include=['object']).columns if col != target
]

print('Categorical features',features_category)
print('Numerical features',features_numeric)

# Convert the Yes/No target to 1/0 (case-insensitive). Only do this while the
# column is still non-numeric, so running the cell a second time is a no-op
# instead of failing on the .str accessor.
if not pd.api.types.is_numeric_dtype(df[target]):
    df[target] = df[target].str.lower().map({'yes': 1, 'no': 0})

# Column order handed to the split step: target first, then categorical, then numeric
all_features = [target] + features_category + features_numeric


Categorical features ['smoking', 'alcoholdrinking', 'stroke', 'diffwalking', 'sex', 'agecategory', 'race', 'diabetic', 'physicalactivity', 'genhealth', 'asthma', 'kidneydisease', 'skincancer']
Numerical features ['bmi', 'physicalhealth', 'mentalhealth', 'sleeptime']


In [40]:
# Project-local helper class that wraps the logistic-regression workflow
from heart_disease_logistic_regression import HeartDiseaseLogisticRegression

# Create the helper from the numeric / categorical feature names and the target name
hd_logistic_reg_model = HeartDiseaseLogisticRegression(
    features_numeric,
    features_category,
    target,
)

# Partition the frame into train / validation / test sets.
# The intended split is 60%/20%/20% with seed 1; the ratios and the seed are
# applied inside split_data(), not in this cell.
X_train, X_val, X_test = hd_logistic_reg_model.split_data(df, all_features)

# Report the (rows, columns) of every partition
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
):
    print(f"{split_name} shape: ", split_frame.shape)

# Preview the training partition
X_train.head()

X_train shape:  (191877, 18)
X_val shape:  (63959, 18)
X_test shape:  (63959, 18)


Unnamed: 0,heartdisease,smoking,alcoholdrinking,stroke,diffwalking,sex,agecategory,race,diabetic,physicalactivity,genhealth,asthma,kidneydisease,skincancer,bmi,physicalhealth,mentalhealth,sleeptime
0,0,Yes,No,No,Yes,Female,75-79,Black,No,Yes,Good,No,No,No,23.34,0.0,0.0,7.0
1,1,Yes,No,Yes,Yes,Male,65-69,White,Yes,No,Fair,No,No,No,31.87,0.0,0.0,4.0
2,0,Yes,No,No,Yes,Male,65-69,White,Yes,No,Poor,Yes,No,No,27.37,30.0,0.0,6.0
3,0,No,No,No,No,Female,60-64,White,No,Yes,Excellent,No,No,No,27.44,0.0,0.0,8.0
4,0,No,No,No,No,Female,50-54,White,No,Yes,Excellent,No,No,No,27.41,0.0,0.0,6.0


In [41]:
# Train the model on the training partition.
# X_train still contains the 'heartdisease' target column (it is part of
# all_features), so train() presumably separates features from target
# internally -- TODO confirm in heart_disease_logistic_regression.py.
hd_logistic_reg_model.train(X_train)

# Evaluate the model on the validation partition and print whatever evaluate() returns.
# NOTE(review): the recorded run of this cell ends with
#   ValueError: Classification metrics can't handle a mix of binary and
#   continuous-multioutput targets
# The traceback is not shown, so the failing call is not certain; the message
# suggests a label-based metric is being given a 2-D, non-label prediction
# array (e.g. class probabilities) -- verify inside evaluate() and fix it there.
metrics = hd_logistic_reg_model.evaluate(X_val)
print(f'model metrics: {metrics}')




ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets