In [118]:
import warnings
warnings.filterwarnings('ignore')

In [119]:
# import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the Cardio CSV

In [120]:
# Load the data
file_path = Path('Resources/cardio_train.csv')
cardio_df = pd.read_csv(file_path, sep=';')

cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [121]:
# convert age to years
cardio_df['age_year'] = (cardio_df['age']/365).round(0)
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.0
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52.0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.0
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48.0


In [122]:
# Create age ranges
size_bins = [0,40,49,59,90]
group_names = ["<40","40-49", "50-59", "60-89"]
cardio_df["age_group"] = pd.cut(cardio_df["age_year"], size_bins, labels = group_names)

In [123]:
# encode the age groups
cardio_df['age_group_encoded'] = cardio_df['age_group'].map( {"<40":0, "40-49":1, "50-59":2, "60-89":3})

In [124]:
#converting height into meters
height_meters = cardio_df["height"]/100
# calculate the bmi
cardio_df["bmi"] = round(cardio_df["weight"]/(height_meters**2),2)
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group,age_group_encoded,bmi
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.0,50-59,2,21.97
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.0,50-59,2,34.93
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52.0,50-59,2,23.51
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.0,40-49,1,28.71
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48.0,40-49,1,23.01


In [125]:
# Create bmi health status
size_bins = [0,18.4,24.9,29.9,300]
group_names = ["underweight","healthy","overweight","obese"]
cardio_df["bmi_status"] = pd.cut(cardio_df["bmi"], size_bins, labels = group_names)

# encode the bmi status
cardio_df['bmi_status_encoded'] = cardio_df['bmi_status'].map( {"underweight":0, "healthy":1, "overweight":2, "obese":3})

In [126]:
# Ranges based on https://www.healthline.com/health/high-blood-pressure-hypertension/blood-pressure-reading-explained
# Create ap_hi ranges
size_bins = [-150,119,129,139,179,16020]
group_names = ["normal","elevated","high_blood_pressure_1","high_blood_pressure_2","hypertensive_crisis"]
cardio_df["ap_hi_status"] = pd.cut(cardio_df["ap_hi"], size_bins, labels = group_names)

# encode the ap_hi_status
cardio_df['ap_hi_status_encoded'] = cardio_df['ap_hi_status'].map( {"normal":0, "elevated":1, "high_blood_pressure_1":2, "high_blood_pressure_2":3,"hypertensive_crisis":4})

In [127]:
# Ranges based on https://www.healthline.com/health/high-blood-pressure-hypertension/blood-pressure-reading-explained
# Create ap_hi ranges
size_bins = [-70,79,89,119,11000]
group_names = ["normal","high_blood_pressure_1","high_blood_pressure_2","hypertensive_crisis"]
cardio_df["ap_lo_status"] = pd.cut(cardio_df["ap_lo"], size_bins, labels = group_names)

# encode the ap_lo_status
cardio_df['ap_lo_status_encoded'] = cardio_df['ap_lo_status'].map( {"normal":0,"high_blood_pressure_1":1, "high_blood_pressure_2":2,"hypertensive_crisis":3})

# Split the Data into Training and Testing

In [129]:
# drop the irrelevent columns from the dataframe
cardio_train_df = cardio_df.drop(columns=['id','age','age_group','height','weight','bmi','bmi_status','age_year','ap_hi','ap_hi_status','ap_lo','ap_lo_status'])
cardio_train_df.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group_encoded,bmi_status_encoded,ap_hi_status_encoded,ap_lo_status_encoded
0,2,1,1,0,0,1,0,2,1,0,1
1,1,3,1,0,0,1,1,2,3,3,2
2,1,3,1,0,0,0,1,2,1,2,0
3,2,1,1,0,0,1,1,1,2,3,2
4,1,1,1,0,0,0,0,1,1,0,0


In [130]:
cardio_train_df.describe()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,1.349571,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,0.476838,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,2.0,2.0,1.0,0.0,0.0,1.0,1.0
max,2.0,3.0,3.0,1.0,1.0,1.0,1.0


In [131]:
# Create our features
X = pd.get_dummies(cardio_train_df, columns=['age_group_encoded','gender', 'bmi_status_encoded', 'ap_hi_status_encoded', 
                'ap_lo_status_encoded', 'cholesterol', 'gluc', 'smoke', 
                'alco','active']).drop('cardio', axis=1)

# Create our target
y = cardio_train_df['cardio']

In [132]:
X.describe()

Unnamed: 0,age_group_encoded_0,age_group_encoded_1,age_group_encoded_2,age_group_encoded_3,gender_1,gender_2,bmi_status_encoded_0,bmi_status_encoded_1,bmi_status_encoded_2,bmi_status_encoded_3,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,...,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,0.044471,0.223371,0.505014,0.227143,0.650429,0.349571,0.008414,0.354414,0.370843,0.266329,...,0.115229,0.8497,0.074143,0.076157,0.911871,0.088129,0.946229,0.053771,0.196271,0.803729
std,0.206142,0.416508,0.499978,0.418989,0.476838,0.476838,0.091343,0.478339,0.483034,0.442041,...,0.3193,0.357368,0.262005,0.265251,0.283484,0.283484,0.225568,0.225568,0.397179,0.397179
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [133]:
# Check the balance of our target values
y.value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [134]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Random Forest Classifier

In [135]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=130, random_state=1)

# Fitting the model
brf_model.fit(X_train, y_train)
Counter(y_train)

Counter({1: 26234, 0: 26266})

In [136]:
# Evaluate the model
y_pred = brf_model.predict(X_test)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.722


In [137]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Presence", "Actual Absence"], columns=["Predicted Presence", "Predicted Absence"])
cm_df

Unnamed: 0,Predicted Presence,Predicted Absence
Actual Presence,6776,1979
Actual Absence,2882,5863


In [138]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.77      0.67      0.74      0.72      0.52      8755
          1       0.75      0.67      0.77      0.71      0.72      0.51      8745

avg / total       0.72      0.72      0.72      0.72      0.72      0.52     17500



In [139]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.18330702494794487, 'ap_hi_status_encoded_3'),
 (0.09477849592682569, 'ap_hi_status_encoded_1'),
 (0.09281672992330042, 'ap_lo_status_encoded_2'),
 (0.08300935386570077, 'ap_hi_status_encoded_0'),
 (0.04840518112762178, 'ap_lo_status_encoded_0'),
 (0.045469843016149786, 'age_group_encoded_3'),
 (0.04508993537532908, 'cholesterol_3'),
 (0.03746649874181729, 'cholesterol_1'),
 (0.028932354608138856, 'ap_hi_status_encoded_2'),
 (0.02626656950217542, 'ap_lo_status_encoded_1'),
 (0.02328493957834044, 'bmi_status_encoded_3'),
 (0.022502944878244675, 'age_group_encoded_1'),
 (0.02145743873195643, 'age_group_encoded_2'),
 (0.020682051922172463, 'bmi_status_encoded_1'),
 (0.017324201862100496, 'gluc_1'),
 (0.016946020998463283, 'bmi_status_encoded_2'),
 (0.01662931212769141, 'gender_1'),
 (0.016570134270028805, 'age_group_encoded_0'),
 (0.016414653453813637, 'gender_2'),
 (0.016380321373224444, 'active_0'),
 (0.015980656839801453, 'active_1'),
 (0.013717412401231538, 'gluc_2'),
 (0.013508137

## SMOTEENN

In [140]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)

# Fitting the model
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 9958, 1: 9393})

In [141]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [142]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTEENN predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTEENN predictive accuracy: 0.731


In [143]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Presence", "Actual Absence"], columns=["Predicted Presence", "Predicted Absence"])
cm_df

Unnamed: 0,Predicted Presence,Predicted Absence
Actual Presence,7030,1725
Actual Absence,2975,5770


In [144]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.66      0.75      0.73      0.54      8755
          1       0.77      0.66      0.80      0.71      0.73      0.52      8745

avg / total       0.74      0.73      0.73      0.73      0.73      0.53     17500



## SMOTE Oversampling

In [145]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)

# Fitting the model
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 26266, 0: 26266})

In [146]:
# Train the Logistic Regression model using the resampled data
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [147]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTE predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTE predictive accuracy: 0.731


In [148]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Presence", "Actual Absence"], columns=["Predicted Presence", "Predicted Absence"])
cm_df

Unnamed: 0,Predicted Presence,Predicted Absence
Actual Presence,7101,1654
Actual Absence,3053,5692


In [149]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.81      0.65      0.75      0.73      0.54      8755
          1       0.77      0.65      0.81      0.71      0.73      0.52      8745

avg / total       0.74      0.73      0.73      0.73      0.73      0.53     17500



## EasyEnsembleClassifier

In [150]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)
Counter(y_resampled)

KeyboardInterrupt: 

In [None]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(f" Easy Ensemble predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Presence", "Actual Absence"], columns=["Predicted Presence", "Predicted Absence"])
cm_df

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))