In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Load the data
import pickle
with open('../Data/cleanedDF.pkl', 'rb') as file:
    data = pickle.load(file)

df = data.copy()

print(df.head())

   Patient_ID  Age  Blood_Pressure  CVD_Risk_Score  Gender_Male  \
0           1   55             104              78            1   
1           2   66             142              49            0   
2           3   69             176              31            0   
3           4   45             178              23            0   
4           5   39             146              79            0   

   Smoking_Status_Smoker  Cholesterol_Level_Low  Cholesterol_Level_Normal  \
0                      0                      0                         1   
1                      1                      1                         0   
2                      1                      1                         0   
3                      1                      0                         1   
4                      1                      0                         1   

   Air_Pollution_Exposure_Low  Air_Pollution_Exposure_Medium  ...  \
0                           0                              0  ...

In [22]:
# Prepare the data
X = df.drop('Heart_Attack_Yes', axis=1)
y = df['Heart_Attack_Yes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [23]:
# Random Forrest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Evaluating models
rf_predictions = rf_classifier.predict(X_test)
print("Accuracy of Random Forrest: ", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

dt_predictions = dt_classifier.predict(X_test)
print("Accuracy of Decision Tree: ", accuracy_score(y_test, dt_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_predictions))
print("Classification Report:\n", classification_report(y_test, dt_predictions))



Accuracy of Random Forrest:  0.8784636603000794
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94     42038
           1       0.00      0.00      0.00      5816

    accuracy                           0.88     47854
   macro avg       0.44      0.50      0.47     47854
weighted avg       0.77      0.88      0.82     47854

Accuracy of Decision Tree:  0.7649517281731935
Confusion Matrix:
 [[35753  6285]
 [ 4963   853]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.85      0.86     42038
           1       0.12      0.15      0.13      5816

    accuracy                           0.76     47854
   macro avg       0.50      0.50      0.50     47854
weighted avg       0.79      0.76      0.78     47854



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Now lets check which of the features are the most important for the Random Forrest Classifier
rf_feature_importances = rf_classifier.feature_importances_
features = X.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': rf_feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances (Random Forest):")
print(feature_importance_df)


Feature Importances (Random Forest):
                            Feature  Importance
0                        Patient_ID    0.113505
3                    CVD_Risk_Score    0.099701
2                    Blood_Pressure    0.099033
1                               Age    0.094600
4                       Gender_Male    0.018678
44                      TCM_Use_Yes    0.018124
5             Smoking_Status_Smoker    0.017628
18             Rural_or_Urban_Urban    0.017224
42          Alcohol_Consumption_Yes    0.016323
30        Hospital_Availability_Low    0.016140
31     Hospital_Availability_Medium    0.016013
9     Air_Pollution_Exposure_Medium    0.015975
43           Family_History_CVD_Yes    0.015758
12              Diet_Score_Moderate    0.015721
39                 Hypertension_Yes    0.015692
14                 Stress_Level_Low    0.015441
10            Physical_Activity_Low    0.015389
7          Cholesterol_Level_Normal    0.015301
13                  Diet_Score_Poor    0.015187
15