In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
# import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the Cardio CSV

In [37]:
# Load the data
file_path = Path('Resources/cardio_train.csv')
cardio_df = pd.read_csv(file_path, sep=';')

#cardio_df.head()

In [None]:
#cardio_df.describe()

In [38]:
# convert age to years
cardio_df['age_year'] = (cardio_df['age']/365).round(0)

In [39]:
# Remove outliers in ap_hi
q_hi  = cardio_df["ap_hi"].quantile(0.99)

df_filtered = cardio_df[(cardio_df["ap_hi"] < q_hi)]

In [40]:
# Remove outliers in ap_lo
q_hi  = cardio_df["ap_lo"].quantile(0.99)

cardio_filtered = df_filtered[(df_filtered["ap_lo"] < q_hi)]

In [41]:
# Create age groups
size_bins = [0,39,49,59,90]
group_names = ["<40","40-49", "50-59", "60-89"]
cardio_filtered["age_group"] = pd.cut(cardio_filtered["age_year"], size_bins, labels = group_names)

# encode age groups
cardio_filtered['age_group_encoded'] = cardio_filtered['age_group'].map( {"<40":0, "40-49":1, "50-59":2, "60-89":3})

In [42]:
#converting height into meters
height_meters = cardio_filtered["height"]/100

# calculate the bmi
cardio_filtered["bmi"] = round(cardio_filtered["weight"]/(height_meters**2),2)

# Create bmi health status
size_bins = [0,18.4,24.9,29.9,300]
group_names = ["underweight","healthy","overweight","obese"]
cardio_filtered["bmi_status"] = pd.cut(cardio_filtered["bmi"], size_bins, labels = group_names)

# encode the bmi status
cardio_filtered['bmi_status_encoded'] = cardio_filtered['bmi_status'].map( {"underweight":0, "healthy":1, "overweight":2, "obese":3})

In [43]:
# Ranges based on https://www.healthline.com/health/high-blood-pressure-hypertension/blood-pressure-reading-explained
# Create ap_hi ranges
size_bins = [0,119,129,139,179]
group_names = ["normal","elevated","high_blood_pressure_1","high_blood_pressure_2"]
cardio_filtered["ap_hi_status"] = pd.cut(cardio_filtered["ap_hi"], size_bins, labels = group_names)

# encode the ap_hi_status
cardio_filtered['ap_hi_status_encoded'] = cardio_filtered['ap_hi_status'].map( {"normal":0, "elevated":1, "high_blood_pressure_1":2, "high_blood_pressure_2":3})

# Create ap_lo ranges
size_bins = [0,79,89,119,910]
group_names = ["normal","high_blood_pressure_1","high_blood_pressure_2","hypertensive_crisis"]
cardio_filtered["ap_lo_status"] = pd.cut(cardio_filtered["ap_lo"], size_bins, labels = group_names)

# encode the ap_lo_status
cardio_filtered['ap_lo_status_encoded'] = cardio_filtered['ap_lo_status'].map( {"normal":0,"high_blood_pressure_1":1, "high_blood_pressure_2":2,"hypertensive_crisis":3})

In [None]:
#cardio_filtered.describe()

# Split the Data into Training and Testing

In [44]:
# drop the irrelevent columns from the dataframe
cardio_train_df = cardio_filtered.drop(columns=['id','age','age_group','height','weight','bmi','bmi_status','age_year','ap_hi','ap_hi_status','ap_lo','ap_lo_status'])
cardio_train_df.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group_encoded,bmi_status_encoded,ap_hi_status_encoded,ap_lo_status_encoded
0,2,1,1,0,0,1,0,2,1,0,1
1,1,3,1,0,0,1,1,2,3,3,2
2,1,3,1,0,0,0,1,2,1,2,0
3,2,1,1,0,0,1,1,1,2,3,2
4,1,1,1,0,0,0,0,1,1,0,0


In [None]:
#cardio_train_df.describe()

In [45]:
# Create our features
X = pd.get_dummies(cardio_train_df, columns=['age_group_encoded','gender', 'bmi_status_encoded', 'ap_hi_status_encoded', 
                'ap_lo_status_encoded', 'cholesterol', 'gluc', 'smoke', 
                'alco','active']).drop('cardio', axis=1)

# Create our target
y = cardio_train_df['cardio']

In [None]:
#X.describe()

In [46]:
# Check the balance of our target values
y.value_counts()

0    34753
1    33394
Name: cardio, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.7, test_size=0.3, stratify=y)

## Balanced Random Forest Classifier

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=150, random_state=1)

# Fitting the model
brf_model.fit(X_train, y_train)
Counter(y_train)

In [None]:
# Evaluate the model
y_pred = brf_model.predict(X_test)
print(f" Balanced Random Forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

## SMOTEENN

In [48]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)

# Fitting the model
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6833, 1: 9121})

In [49]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [50]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTEENN predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTEENN predictive accuracy: 0.725


In [51]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,7912,2514
Actual Cardio Absence,3106,6913


In [52]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.76      0.69      0.74      0.72      0.53     10426
          1       0.73      0.69      0.76      0.71      0.72      0.52     10019

avg / total       0.73      0.73      0.72      0.72      0.72      0.52     20445



## SMOTE Oversampling

In [53]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)

# Fitting the model
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 24327, 0: 24327})

In [54]:
# Train the Logistic Regression model using the resampled data
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [58]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTE Oversampling predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTE Oversampling predictive accuracy: 0.723


In [56]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,8303,2123
Actual Cardio Absence,3537,6482


In [57]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.65      0.75      0.72      0.52     10426
          1       0.75      0.65      0.80      0.70      0.72      0.51     10019

avg / total       0.73      0.72      0.72      0.72      0.72      0.52     20445



## Easy Ensemble Classifier

In [65]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=150, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)
Counter(y_train)

Counter({1: 23375, 0: 24327})

In [60]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(f" Easy Ensemble predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Easy Ensemble predictive accuracy: 0.722


In [61]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
     cm, index=["Actual Cardio Presence", "Actual Cardio Absence"], columns=["Predicted Cardio Presence", "Predicted Cardio Absence"])
cm_df

Unnamed: 0,Predicted Cardio Presence,Predicted Cardio Absence
Actual Cardio Presence,8384,2042
Actual Cardio Absence,3634,6385


In [62]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.64      0.75      0.72      0.52     10426
          1       0.76      0.64      0.80      0.69      0.72      0.50     10019

avg / total       0.73      0.72      0.72      0.72      0.72      0.51     20445

