In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read Data from AWS

In [3]:
# Import module from sqlalchemy 
from sqlalchemy import create_engine


#define local server password
#from config import local_db_password
from config import db_password

# create database engine
#db_string = f"postgresql://postgres:{local_db_password}@127.0.0.1:5432/postgres"

db_string = f"postgresql://postgres:{db_password}@cardiovasculardb.ctyxqkz5om6e.us-east-1.rds.amazonaws.com:5432/"

engine = create_engine(db_string)



In [4]:
cardio_df = pd.read_sql("SELECT a.patient_id, b.gender, a.age_group, a.bmi_range, c.ap_hi,\
        c.ap_lo, c.cholesterol, c.gluc, \
        d.smoke, d.alco, d.active, b.cardio \
FROM calculated_table AS a \
INNER JOIN patient_table AS b \
         ON a.patient_id = b.patient_id  \
INNER JOIN health_factors_table AS c \
         ON a.patient_id = c.patient_id \
INNER JOIN lifestyle_table AS d    \
         ON a.patient_id = d.patient_id", con=engine)
cardio_df.head()

Unnamed: 0,patient_id,gender,age_group,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,3.0,2.0,40-49,Overweight (2),150.0,100.0,1.0,1.0,0.0,0.0,1.0,1.0
1,4.0,1.0,40-49,Healthy (1),100.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0
2,12.0,2.0,60-89,Obesity (3),130.0,90.0,3.0,3.0,0.0,0.0,1.0,1.0
3,14.0,1.0,50-59,Overweight (2),110.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0
4,21.0,1.0,50-59,Obesity (3),110.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0


In [5]:
# Remove outliers in ap_hi
q_hi  = cardio_df["ap_hi"].quantile(0.99)

df_filtered = cardio_df[(cardio_df["ap_hi"] < q_hi)]

In [6]:
# Remove outliers in ap_lo
q_hi  = cardio_df["ap_lo"].quantile(0.99)

cardio_filtered = df_filtered[(df_filtered["ap_lo"] < q_hi)]

In [7]:


# encode age groups
cardio_filtered['age_group_encoded'] = cardio_df['age_group'].map( {"<40":0, "40-49":1, "50-59":2, "60-89":3})

In [8]:

# encode the bmi status
cardio_filtered['bmi_range_encoded'] = cardio_df['bmi_range'].map( {"Underweight (0)":0, "Healthy (1)":1, "Overweight (2)":2, "Obesity (3)":3})

In [9]:
# Ranges based on https://www.healthline.com/health/high-blood-pressure-hypertension/blood-pressure-reading-explained
# Create ap_hi ranges
size_bins = [0,119,129,139,179]
group_names = ["normal","elevated","high_blood_pressure_1","high_blood_pressure_2"]
cardio_filtered["ap_hi_status"] = pd.cut(cardio_filtered["ap_hi"], size_bins, labels = group_names)

# encode the ap_hi_status
cardio_filtered['ap_hi_status_encoded'] = cardio_filtered['ap_hi_status'].map( {"normal":0, "elevated":1, "high_blood_pressure_1":2, "high_blood_pressure_2":3})

# Create ap_lo ranges
size_bins = [0,79,89,119,910]
group_names = ["normal","high_blood_pressure_1","high_blood_pressure_2","hypertensive_crisis"]
cardio_filtered["ap_lo_status"] = pd.cut(cardio_filtered["ap_lo"], size_bins, labels = group_names)

# encode the ap_lo_status
cardio_filtered['ap_lo_status_encoded'] = cardio_filtered['ap_lo_status'].map( {"normal":0,"high_blood_pressure_1":1, "high_blood_pressure_2":2,"hypertensive_crisis":3})

In [10]:
cardio_filtered.head(10)

Unnamed: 0,patient_id,gender,age_group,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_group_encoded,bmi_range_encoded,ap_hi_status,ap_hi_status_encoded,ap_lo_status,ap_lo_status_encoded
0,3.0,2.0,40-49,Overweight (2),150.0,100.0,1.0,1.0,0.0,0.0,1.0,1.0,1,2,high_blood_pressure_2,3,high_blood_pressure_2,2
1,4.0,1.0,40-49,Healthy (1),100.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0,1,1,normal,0,normal,0
2,12.0,2.0,60-89,Obesity (3),130.0,90.0,3.0,3.0,0.0,0.0,1.0,1.0,3,3,high_blood_pressure_1,2,high_blood_pressure_2,2
3,14.0,1.0,50-59,Overweight (2),110.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0,2,2,normal,0,normal,0
4,21.0,1.0,50-59,Obesity (3),110.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,2,3,normal,0,normal,0
5,23.0,2.0,<40,Overweight (2),130.0,90.0,1.0,1.0,1.0,1.0,1.0,0.0,0,2,high_blood_pressure_1,2,high_blood_pressure_2,2
6,25.0,1.0,50-59,Overweight (2),130.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0,2,2,high_blood_pressure_1,2,normal,0
7,27.0,1.0,40-49,Healthy (1),110.0,70.0,1.0,3.0,0.0,0.0,1.0,0.0,1,1,normal,0,normal,0
8,28.0,1.0,40-49,Overweight (2),100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0,1,2,normal,0,normal,0
9,35.0,1.0,40-49,Healthy (1),150.0,90.0,3.0,1.0,0.0,0.0,1.0,1.0,1,1,high_blood_pressure_2,3,high_blood_pressure_2,2


# Split the Data into Training and Testing

In [13]:
# drop the irrelevent columns from the dataframe
cardio_train_df = cardio_filtered.drop(columns=['patient_id','age_group','bmi_range','ap_hi','ap_hi_status','ap_lo','ap_lo_status'])
cardio_train_df.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group_encoded,bmi_range_encoded,ap_hi_status_encoded,ap_lo_status_encoded
0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,1,2,3,2
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1,1,0,0
2,2.0,3.0,3.0,0.0,0.0,1.0,1.0,3,3,2,2
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2,2,0,0
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,2,3,0,0


In [17]:
# Create our features
X = pd.get_dummies(cardio_train_df, columns=['age_group_encoded','gender', 'bmi_range_encoded', 'ap_hi_status_encoded', 
                'ap_lo_status_encoded', 'cholesterol', 'gluc', 'smoke', 
                'alco','active']).drop('cardio', axis=1)

# Create our target
y = cardio_train_df['cardio']

In [18]:
# Check the balance of our target values
y.value_counts()

0.0    34378
1.0    32783
Name: cardio, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
# Starting split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.7, test_size=0.3, stratify=y)
# Optimization try 1.1 Best results for SO, no change in BRF
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.8, test_size=0.2, stratify=y) 
# Optimization try 1.2
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Balanced Random Forest Classifier

In [20]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# Starting Classifier
brf_model = BalancedRandomForestClassifier(n_estimators=150, random_state=1)
# Optimization try 2.1
# brf_model = BalancedRandomForestClassifier(n_estimators=175, random_state=10)
# Optimization try 2.2
# brf_model = BalancedRandomForestClassifier(n_estimators=200, random_state=1)

# Fitting the model
brf_model.fit(X_train, y_train, sample_weight=None)
Counter(y_train)

Counter({1.0: 26226, 0.0: 27502})

### Evaluate the Model

In [21]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test)
print(f" Balanced Random Forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Balanced Random Forest predictive accuracy: 0.709


In [22]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,5166,1710
Actual Cardio Absence,2195,4362


In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.70      0.75      0.67      0.73      0.71      0.50      6876
        1.0       0.72      0.67      0.75      0.69      0.71      0.50      6557

avg / total       0.71      0.71      0.71      0.71      0.71      0.50     13433



In [24]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.18760013521087313, 'ap_hi_status_encoded_3'),
 (0.11182780111545154, 'ap_lo_status_encoded_2'),
 (0.08779444762801705, 'ap_hi_status_encoded_0'),
 (0.08090896588146118, 'ap_hi_status_encoded_1'),
 (0.050512441025660625, 'cholesterol_3.0'),
 (0.04854834568632894, 'age_group_encoded_3'),
 (0.04454660558965553, 'ap_lo_status_encoded_0'),
 (0.03659160463632902, 'cholesterol_1.0'),
 (0.031779866619977365, 'ap_hi_status_encoded_2'),
 (0.02809336951412339, 'ap_lo_status_encoded_1'),
 (0.02361179849518651, 'bmi_range_encoded_3'),
 (0.023549336722448647, 'age_group_encoded_1'),
 (0.020513874580301527, 'age_group_encoded_2'),
 (0.01850000310902713, 'bmi_range_encoded_1'),
 (0.01742513496075163, 'gluc_1.0'),
 (0.0172657260032486, 'age_group_encoded_0'),
 (0.016810379057864488, 'bmi_range_encoded_2'),
 (0.01628905714097519, 'gender_2.0'),
 (0.016212269864655664, 'gender_1.0'),
 (0.015660756470644462, 'active_0.0'),
 (0.015272346367550942, 'active_1.0'),
 (0.014859337398307774, 'cholesterol_2.0

# SMOTE Oversampling

In [25]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
# Starting Resample
smote = SMOTE(random_state=1) # Best accuracy score
# Optimization try 3.1
# smote = SMOTE(random_state=42)
# Optimization try 3.2
# smote = SMOTE(random_state=20)

# Fitting the model
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1.0: 27502, 0.0: 27502})

In [26]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_resampled, y_resampled, sample_weight=None)

LogisticRegression(random_state=1)

### Evaluate the Model

In [27]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTE Oversampling predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTE Oversampling predictive accuracy: 0.719


In [28]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,5484,1392
Actual Cardio Absence,2387,4170


In [29]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.70      0.80      0.64      0.74      0.71      0.52      6876
        1.0       0.75      0.64      0.80      0.69      0.71      0.50      6557

avg / total       0.72      0.72      0.71      0.72      0.71      0.51     13433

