# Heart Disease Risk Analysis Data - Data Processing

### Process the data

- Load the data from `./data/2020/heart_2020_processed.csv`
- Process the values
  - Convert Yes/No features to binary (1/0)
  - Cast all the numeric values to int to avoid float problems
- Process the features
  - Set the categorical features names
  - Set the numeric features names  
  - Set the target variable
- Feature importance analysis
  - Use statistical analysis to quantify the importance of each feature


- Do a train/validation/test split with a 60%/20%/20% distribution.
- Use the `train_test_split` function and set the `random_state` parameter to 1.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [15]:
# Read the heart-disease CSV into a pandas DataFrame so the data can be inspected
df_source = pd.read_csv(
    './data/2020/heart_2020_processed.csv',
    sep=',',
    quotechar='"',
)

# Keep df_source untouched as the original data; later cells work on this copy
df = df_source.copy()

# Display the first rows as the cell output
df.head()

Unnamed: 0,heartdisease,bmi,smoking,alcoholdrinking,stroke,physicalhealth,mentalhealth,diffwalking,sex,agecategory,race,diabetic,physicalactivity,genhealth,sleeptime,asthma,kidneydisease,skincancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [38]:
# Process the features: build the feature-name lists used by the later cells

# set the target feature
target = 'heartdisease'

# Numeric feature names, excluding the target feature
num_features = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != target
]

# Categorical (object-dtype) feature names. The target is excluded here as
# well: while it is still stored as Yes/No strings it has object dtype, and
# leaving it in this list would leak the label into all_features.
cat_features = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != target
]

print('Categorical features', cat_features)
print('Numerical features', num_features)
print('Target feature', target)

# create a list of all features (categorical first, then numeric)
all_features = cat_features + num_features


Categorical features ['sex', 'agecategory', 'race', 'diabetic', 'genhealth']
Numerical features ['bmi', 'smoking', 'alcoholdrinking', 'stroke', 'physicalhealth', 'mentalhealth', 'diffwalking', 'physicalactivity', 'sleeptime', 'asthma', 'kidneydisease', 'skincancer']
Target feature heartdisease


In [None]:
# Import the project's logistic-regression wrapper class
from heart_disease_logistic_regression import HeartDiseaseLogisticRegression

# Initialize the HeartDiseaseLogisticRegression class with the feature lists
# built in the feature-processing cell. The names must be num_features and
# cat_features: features_numeric / features_category are never defined in
# this notebook and raised a NameError.
hd_logistic_reg_model = HeartDiseaseLogisticRegression(num_features, cat_features, [target])

# split the data in train/val/test sets, with 60%/20%/20% distribution with seed 1
# NOTE(review): the proportions and the seed are not passed here, so they are
# presumably fixed inside split_data — confirm against the class.
X_train, X_val, X_test = hd_logistic_reg_model.split_data(df, all_features)

# print the shape of the train/val/test sets
print("X_train shape: ", X_train.shape)
print("X_val shape: ", X_val.shape)
print("X_test shape: ", X_test.shape)

# Display the first rows of the training split as the cell output
X_train.head()

In [None]:
# Fit the model using the training split
hd_logistic_reg_model.train(X_train)

# Evaluate on the validation split. evaluate() yields two values: `data`
# (indexed by 'y_val' and 'y_pred' in the next cell) and `metrics`.
evaluation = hd_logistic_reg_model.evaluate(X_val)
data, metrics = evaluation
print(f'model metrics: {metrics}')


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Validation targets and model predictions taken from the evaluate() output.
# NOTE(review): y_pred is presumably a probability/score per row, since it is
# thresholded below — confirm against HeartDiseaseLogisticRegression.evaluate.
y_val = data['y_val']
y_pred = data['y_pred']

# Sweep the decision threshold over 0.0, 0.1, ..., 0.9: binarize the
# predictions at each threshold and report the metrics on the validation set.
thresholds = np.arange(0, 1, 0.1)
for threshold in thresholds:
    y_pred_binary = np.where(y_pred > threshold, 1, 0)
    accuracy = accuracy_score(y_val, y_pred_binary)
    # zero_division=0 keeps the value scikit-learn already falls back to (0.0)
    # when a threshold yields no positive predictions, without emitting an
    # UndefinedMetricWarning on every such iteration.
    precision = precision_score(y_val, y_pred_binary, zero_division=0)
    recall = recall_score(y_val, y_pred_binary)
    f1 = f1_score(y_val, y_pred_binary, zero_division=0)

    # np.arange accumulates floating-point error (e.g. 0.30000000000000004),
    # so the threshold is printed with one decimal place.
    print(f'threshold: {threshold:.1f} accuracy: {accuracy:.3f} precision: {precision:.3f} recall: {recall:.3f} f1: {f1:.3f}')

# - accuracy: 0.915 precision: 0.373 recall: 0.374 f1: 0.374