In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the Cardio CSV

In [3]:
# Load the data
file_path = Path('Resources/cardio_train.csv')
cardio_df = pd.read_csv(file_path, sep=';')

cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
cardio_df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [5]:
# convert age to years
cardio_df['age_year'] = (cardio_df['age']/365).round(0)

In [6]:
# Remove outliers in ap_hi
q_hi  = cardio_df["ap_hi"].quantile(0.99)

df_filtered = cardio_df[(cardio_df["ap_hi"] < q_hi)]

In [7]:
# Remove outliers in ap_lo
q_hi  = cardio_df["ap_lo"].quantile(0.99)

cardio_filtered = df_filtered[(df_filtered["ap_lo"] < q_hi)]

In [8]:
# Create age groups
size_bins = [0,39,49,59,90]
group_names = ["<40","40-49", "50-59", "60-89"]
cardio_filtered["age_group"] = pd.cut(cardio_filtered["age_year"], size_bins, labels = group_names)

# encode age groups
cardio_filtered['age_group_encoded'] = cardio_filtered['age_group'].map( {"<40":0, "40-49":1, "50-59":2, "60-89":3})

In [9]:
#converting height into meters
height_meters = cardio_filtered["height"]/100

# calculate the bmi
cardio_filtered["bmi"] = round(cardio_filtered["weight"]/(height_meters**2),2)

# Create bmi health status
size_bins = [0,18.4,24.9,29.9,300]
group_names = ["underweight","healthy","overweight","obese"]
cardio_filtered["bmi_status"] = pd.cut(cardio_filtered["bmi"], size_bins, labels = group_names)

# encode the bmi status
cardio_filtered['bmi_status_encoded'] = cardio_filtered['bmi_status'].map( {"underweight":0, "healthy":1, "overweight":2, "obese":3})

In [10]:
# Ranges based on https://www.healthline.com/health/high-blood-pressure-hypertension/blood-pressure-reading-explained
# Create ap_hi ranges
size_bins = [0,119,129,139,179]
group_names = ["normal","elevated","high_blood_pressure_1","high_blood_pressure_2"]
cardio_filtered["ap_hi_status"] = pd.cut(cardio_filtered["ap_hi"], size_bins, labels = group_names)

# encode the ap_hi_status
cardio_filtered['ap_hi_status_encoded'] = cardio_filtered['ap_hi_status'].map( {"normal":0, "elevated":1, "high_blood_pressure_1":2, "high_blood_pressure_2":3})

# Create ap_lo ranges
size_bins = [0,79,89,119,910]
group_names = ["normal","high_blood_pressure_1","high_blood_pressure_2","hypertensive_crisis"]
cardio_filtered["ap_lo_status"] = pd.cut(cardio_filtered["ap_lo"], size_bins, labels = group_names)

# encode the ap_lo_status
cardio_filtered['ap_lo_status_encoded'] = cardio_filtered['ap_lo_status'].map( {"normal":0,"high_blood_pressure_1":1, "high_blood_pressure_2":2,"hypertensive_crisis":3})

In [11]:
cardio_filtered.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,bmi
count,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0
mean,49978.6314,19452.283475,1.347895,164.367133,73.993578,125.515621,81.389276,1.360676,1.223854,0.087355,0.053209,0.803513,0.490029,53.293425,27.473469
std,28858.272863,2469.939606,0.476306,8.18819,14.22958,16.603705,18.007625,0.67627,0.569993,0.282357,0.224451,0.397344,0.499904,6.772594,6.020381
min,0.0,10798.0,1.0,55.0,11.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,30.0,3.47
25%,24980.5,17641.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0,23.88
50%,50026.0,19694.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0,26.3
75%,74891.5,21316.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,58.0,30.11
max,99999.0,23713.0,2.0,250.0,200.0,179.0,910.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0,298.67


# Split the Data into Training and Testing

In [12]:
# drop the irrelevent columns from the dataframe
cardio_train_df = cardio_filtered.drop(columns=['id','age','age_group','height','weight','bmi','bmi_status','age_year','ap_hi','ap_hi_status','ap_lo','ap_lo_status'])
cardio_train_df.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group_encoded,bmi_status_encoded,ap_hi_status_encoded,ap_lo_status_encoded
0,2,1,1,0,0,1,0,2,1,0,1
1,1,3,1,0,0,1,1,2,3,3,2
2,1,3,1,0,0,0,1,2,1,2,0
3,2,1,1,0,0,1,1,1,2,3,2
4,1,1,1,0,0,0,0,1,1,0,0


In [13]:
cardio_train_df.describe()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio
count,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0
mean,1.347895,1.360676,1.223854,0.087355,0.053209,0.803513,0.490029
std,0.476306,0.67627,0.569993,0.282357,0.224451,0.397344,0.499904
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,2.0,1.0,1.0,0.0,0.0,1.0,1.0
max,2.0,3.0,3.0,1.0,1.0,1.0,1.0


In [14]:
# Create our features
X = pd.get_dummies(cardio_train_df, columns=['age_group_encoded','gender', 'bmi_status_encoded', 'ap_hi_status_encoded', 
                'ap_lo_status_encoded', 'cholesterol', 'gluc', 'smoke', 
                'alco','active']).drop('cardio', axis=1)

# Create our target
y = cardio_train_df['cardio']

In [15]:
X.describe()

Unnamed: 0,age_group_encoded_0,age_group_encoded_1,age_group_encoded_2,age_group_encoded_3,gender_1,gender_2,bmi_status_encoded_0,bmi_status_encoded_1,bmi_status_encoded_2,bmi_status_encoded_3,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
count,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,...,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0,68147.0
mean,0.005958,0.264135,0.504498,0.22541,0.652105,0.347895,0.00854,0.358813,0.371887,0.26076,...,0.113373,0.851718,0.07271,0.075572,0.912645,0.087355,0.946791,0.053209,0.196487,0.803513
std,0.076956,0.440875,0.499983,0.417855,0.476306,0.476306,0.092019,0.479656,0.483312,0.439052,...,0.31705,0.355382,0.259663,0.264314,0.282357,0.282357,0.224451,0.224451,0.397344,0.397344
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Check the balance of our target values
y.value_counts()

0    34753
1    33394
Name: cardio, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.7, test_size=0.3, stratify=y)

## Balanced Random Forest Classifier

In [18]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=150, random_state=1)

# Fitting the model
brf_model.fit(X_train, y_train)
Counter(y_train)

Counter({1: 23375, 0: 24327})

In [19]:
# Evaluate the model
y_pred = brf_model.predict(X_test)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.716


In [20]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,7783,2643
Actual Cardio Absence,3171,6848


In [21]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.71      0.75      0.68      0.73      0.71      0.51     10426
          1       0.72      0.68      0.75      0.70      0.71      0.51     10019

avg / total       0.72      0.72      0.71      0.72      0.71      0.51     20445



In [22]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.18966142870366062, 'ap_hi_status_encoded_3'),
 (0.11709483579706241, 'ap_lo_status_encoded_2'),
 (0.07792652478784921, 'ap_hi_status_encoded_1'),
 (0.0769308610046962, 'ap_hi_status_encoded_0'),
 (0.04961109972711361, 'cholesterol_3'),
 (0.046404774640934635, 'age_group_encoded_3'),
 (0.04009648784223468, 'ap_lo_status_encoded_0'),
 (0.03917698635340856, 'cholesterol_1'),
 (0.033635365105645514, 'age_group_encoded_1'),
 (0.030720451239472507, 'ap_lo_status_encoded_1'),
 (0.030694817197121742, 'ap_hi_status_encoded_2'),
 (0.02380581910963005, 'bmi_status_encoded_3'),
 (0.02070447171289227, 'age_group_encoded_2'),
 (0.020588242167597443, 'bmi_status_encoded_1'),
 (0.018028157036709302, 'gluc_1'),
 (0.01765305409070786, 'gender_2'),
 (0.017614757111882366, 'gender_1'),
 (0.017201967396932676, 'bmi_status_encoded_2'),
 (0.015694319907897738, 'active_0'),
 (0.015663933915573726, 'active_1'),
 (0.015132897420691366, 'cholesterol_2'),
 (0.013929391181180398, 'gluc_2'),
 (0.013617426697097

## SMOTEENN

In [23]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)

# Fitting the model
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6833, 1: 9121})

In [24]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [25]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTEENN predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTEENN predictive accuracy: 0.725


In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,7912,2514
Actual Cardio Absence,3106,6913


In [27]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.76      0.69      0.74      0.72      0.53     10426
          1       0.73      0.69      0.76      0.71      0.72      0.52     10019

avg / total       0.73      0.73      0.72      0.72      0.72      0.52     20445



## SMOTE Oversampling

In [28]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)

# Fitting the model
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 24327, 0: 24327})

In [29]:
# Train the Logistic Regression model using the resampled data
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [30]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTE predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTE predictive accuracy: 0.723


In [31]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,8303,2123
Actual Cardio Absence,3537,6482


In [32]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.65      0.75      0.72      0.52     10426
          1       0.75      0.65      0.80      0.70      0.72      0.51     10019

avg / total       0.73      0.72      0.72      0.72      0.72      0.52     20445



## EasyEnsembleClassifier

In [33]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=150, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)
Counter(y_train)

Counter({1: 23375, 0: 24327})

In [34]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(f" Easy Ensemble predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Easy Ensemble predictive accuracy: 0.722


In [35]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,8384,2042
Actual Cardio Absence,3634,6385


In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.64      0.75      0.72      0.52     10426
          1       0.76      0.64      0.80      0.69      0.72      0.50     10019

avg / total       0.73      0.72      0.72      0.72      0.72      0.51     20445

