In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the Cardio CSV

In [3]:
# Load the data
file_path = Path('Resources/cardio_train.csv')
cardio_df = pd.read_csv(file_path, sep=';')

cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# convert age to years
cardio_df['age_year'] = (cardio_df['age']/365).round(0)
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.0
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52.0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.0
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48.0


# Split the Data into Training and Testing

In [88]:
# drop the irrelevent columns from the dataframe
cardio_train_df = cardio_df.drop(columns=['id','age'])
cardio_train_df.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,year
0,2,168,62.0,110,80,1,1,0,0,1,0,50.0
1,1,156,85.0,140,90,3,1,0,0,1,1,55.0
2,1,165,64.0,130,70,3,1,0,0,0,1,52.0
3,2,169,82.0,150,100,1,1,0,0,1,1,48.0
4,1,156,56.0,100,60,1,1,0,0,0,0,48.0


In [89]:
cardio_train_df.describe()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,year
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997,53.338686
std,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003,6.765294
min,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,30.0
25%,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0
50%,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0
75%,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,58.0
max,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0


In [90]:
# Create our features
X = pd.get_dummies(cardio_train_df, columns=['year','gender', 'height', 'weight', 'ap_hi', 
                'ap_lo', 'cholesterol', 'gluc', 'smoke', 
                'alco','active']).drop('cardio', axis=1)

# Create our target
y = cardio_train_df['cardio']

In [91]:
X.describe()

Unnamed: 0,year_30.0,year_39.0,year_40.0,year_41.0,year_42.0,year_43.0,year_44.0,year_45.0,year_46.0,year_47.0,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,...,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,5.7e-05,0.005843,0.038571,0.010943,0.037486,0.009857,0.040529,0.010814,0.042214,0.010771,...,0.115229,0.8497,0.074143,0.076157,0.911871,0.088129,0.946229,0.053771,0.196271,0.803729
std,0.007559,0.076215,0.192573,0.104035,0.18995,0.098793,0.197197,0.103429,0.201079,0.103226,...,0.3193,0.357368,0.262005,0.265251,0.283484,0.283484,0.225568,0.225568,0.397179,0.397179
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [92]:
# Check the balance of our target values
y.value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Random Forest Classifier

In [94]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.714


In [97]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,6456,2299
Actual low_risk,2702,6043


In [98]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.74      0.69      0.72      0.71      0.51      8755
          1       0.72      0.69      0.74      0.71      0.71      0.51      8745

avg / total       0.71      0.71      0.71      0.71      0.71      0.51     17500



In [99]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.032322971077620444, 'ap_hi_112'),
 (0.03188505309811998, 'ap_hi_80'),
 (0.02827068373571232, 'ap_lo_62'),
 (0.02221367231412267, 'ap_hi_12'),
 (0.01982128971771378, 'ap_hi_122'),
 (0.01813216498502293, 'ap_lo_1111'),
 (0.017817993878650833, 'ap_lo_8'),
 (0.015617029397488482, 'ap_lo_1125'),
 (0.014595348727965229, 'ap_lo_72'),
 (0.014188457764757604, 'ap_lo_52'),
 (0.013563738016501323, 'year_30.0'),
 (0.013503458469958828, 'year_39.0'),
 (0.012987255885133489, 'height_137'),
 (0.012581179287551415, 'ap_hi_132'),
 (0.011612923412566112, 'height_132'),
 (0.011078493947518992, 'height_142'),
 (0.011042937754700157, 'active_0'),
 (0.010999812454101048, 'weight_63.82'),
 (0.010900800799339644, 'height_140'),
 (0.01046924041334626, 'weight_59.2'),
 (0.010374193080882188, 'cholesterol_1'),
 (0.010095239192819875, 'ap_hi_102'),
 (0.009830007916219501, 'cholesterol_3'),
 (0.009427670371419781, 'height_136'),
 (0.009190454511934032, 'ap_lo_10000'),
 (0.009011669981033359, 'weight_200.0'),
 

## SMOTEENN

In [101]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 8748, 1: 8218})

In [104]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [106]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTEENN predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTEENN predictive accuracy: 0.729


In [107]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,7036,1719
Actual low_risk,3018,5727


In [108]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.80      0.65      0.75      0.73      0.53      8755
          1       0.77      0.65      0.80      0.71      0.73      0.52      8745

avg / total       0.73      0.73      0.73      0.73      0.73      0.53     17500



## SMOTE Oversampling

In [110]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 26266, 0: 26266})

In [111]:
# Train the Logistic Regression model using the resampled data
logreg.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [112]:
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test)
print(f" SMOTE predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SMOTE predictive accuracy: 0.733


In [113]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,7105,1650
Actual low_risk,3021,5724


In [114]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.81      0.65      0.75      0.73      0.54      8755
          1       0.78      0.65      0.81      0.71      0.73      0.52      8745

avg / total       0.74      0.73      0.73      0.73      0.73      0.53     17500

