In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import tree
from pathlib import Path
import xgboost as xgb
from imblearn.over_sampling import SMOTE


In [2]:
# Read heart_data.csv into the working DataFrame. The path is relative, so it
# resolves against the directory the notebook kernel was started in.
df = pd.read_csv(Path("../Project-4/resources/heart_data.csv"))

In [3]:
# Cast the height and consumption columns to 64-bit integers.
# NOTE(review): if the CSV stores these as floats, astype('int64') truncates
# any fractional part -- confirm that is intended.
int_columns = [
    'Height_(cm)',
    'Alcohol_Consumption',
    'Fruit_Consumption',
    'Green_Vegetables_Consumption',
    'FriedPotato_Consumption',
]
df[int_columns] = df[int_columns].astype('int64')


In [4]:
# Remove, in place, every row whose Diabetes answer is one of the two
# qualified responses below.
excluded_diabetes_answers = [
    "Yes, but female told only during pregnancy",
    "No, pre-diabetes or borderline diabetes",
]
rows_to_drop = df.index[df['Diabetes'].isin(excluded_diabetes_answers)]
df.drop(index=rows_to_drop, inplace=True)

In [5]:
# Encode every 'Yes' / 'No' cell as 1 / 0. The replacement is applied in place
# to the whole frame, not to a chosen subset of columns.
for answer, code in (('Yes', 1), ('No', 0)):
    df.replace(answer, code, inplace=True)


In [6]:
# Preview the first five rows after cleaning and encoding.
df.head(n=5)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,0,0,0,0,0,0,1,Female,70-74,150,32.66,14.54,1,0,30,16,12
1,Very Good,Within the past year,0,1,0,0,0,1,0,Female,70-74,165,77.11,28.29,0,0,30,0,4
2,Very Good,Within the past year,1,0,0,0,0,1,0,Female,60-64,163,88.45,33.47,0,4,12,3,16
3,Poor,Within the past year,1,1,0,0,0,1,0,Male,75-79,180,93.44,28.73,0,0,30,30,8
4,Good,Within the past year,0,0,0,0,0,0,0,Male,80+,191,88.45,24.37,1,0,8,4,0


In [7]:
# Build the feature frame: drop the prediction target (Heart_Disease) together
# with the Checkup and General_Health columns.
features_df = df.drop(columns=['Heart_Disease', 'Checkup', 'General_Health'])

In [8]:
# One-hot encode the Age_Category and Sex columns.
# NOTE: this cell overwrites features_df with its own encoded version, so
# running it a second time fails unless the cell that builds features_df
# is re-run first.
features_df = pd.get_dummies(
    data=features_df,
    columns=['Age_Category', 'Sex'],
)

In [9]:
# Preview the first five rows of the encoded feature frame.
features_df.head(n=5)

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Height_(cm),Weight_(kg),BMI,Smoking_History,...,Age_Category_45-49,Age_Category_50-54,Age_Category_55-59,Age_Category_60-64,Age_Category_65-69,Age_Category_70-74,Age_Category_75-79,Age_Category_80+,Sex_Female,Sex_Male
0,0,0,0,0,0,1,150,32.66,14.54,1,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,1,0,165,77.11,28.29,0,...,0,0,0,0,0,1,0,0,1,0
2,1,0,0,0,1,0,163,88.45,33.47,0,...,0,0,0,1,0,0,0,0,1,0
3,1,0,0,0,1,0,180,93.44,28.73,0,...,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,0,191,88.45,24.37,1,...,0,0,0,0,0,0,0,1,0,1


In [10]:
# Extract the prediction target as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same
# array of values without the deprecation warning.
# Note: despite its name, target_df is an ndarray, not a DataFrame.
target_df = df['Heart_Disease'].to_numpy()
target_df[:5]

array([0, 1, 0, 1, 0])

In [11]:
# Split features and target into train and test sets; the fixed random_state
# makes the shuffle reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- consider stratify=target_df (this would change the results).
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.25,  # the scikit-learn default, stated explicitly
    random_state=78,
)

In [12]:
# Standardising scaler with default settings (zero mean, unit variance per column).
scaler = StandardScaler()

In [13]:
# Learn the scaling statistics from the training split only, so nothing from
# the test split leaks into preprocessing. fit() returns the fitted scaler
# itself, so X_scaler and scaler refer to the same object.
X_scaler = scaler.fit(X=X_train)

In [14]:
# Apply the train-fitted scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

XGBoost Model

In [15]:
# Wrap the scaled splits and their labels in XGBoost DMatrix objects, the
# input format expected by xgb.train() and Booster.predict().
dtrain = xgb.DMatrix(data=X_train_scaled, label=y_train)
dtest = xgb.DMatrix(data=X_test_scaled, label=y_test)

In [16]:
# Booster hyper-parameters passed to xgb.train().
# NOTE(review): the test-set support in the classification report is roughly
# 11.5 negatives per positive, while scale_pos_weight is 4 -- confirm the
# chosen weight is deliberate.
params = dict(
    objective='binary:logistic',  # binary classification, outputs probabilities
    eval_metric='logloss',        # metric reported for any evaluation sets
    eta=0.01,                     # learning rate (step-size shrinkage)
    max_depth=8,                  # maximum depth of each tree
    subsample=0.75,               # fraction of training rows sampled per boosting round
    scale_pos_weight=4,           # extra weight given to the positive class
)

In [17]:
# Number of boosting rounds, i.e. trees added to the ensemble.
num_rounds = 1000
# Train the booster on the training DMatrix with the parameters defined above.
model = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [18]:
# Score the test set. With the 'binary:logistic' objective, predict() returns
# one positive-class probability per row rather than a hard label.
y_pred = model.predict(data=dtest)

In [19]:
# Turn the probabilities into hard 0/1 labels using a 0.5 decision threshold:
# strictly greater than 0.5 maps to 1. The result stays a plain list of ints.
y_pred_bin = (y_pred > 0.5).astype(int).tolist()


In [20]:
# Evaluate the thresholded predictions against the held-out labels.
# Accuracy alone is a weak summary when one class dominates, so the confusion
# matrix and per-class precision/recall are reported with it.
xgbaccuracy = accuracy_score(y_test, y_pred_bin)
conf_matrix = confusion_matrix(y_test, y_pred_bin)
classification_rep = classification_report(y_test, y_pred_bin)

# One print call; sep="\n" puts each item on its own line.
print(
    f"Accuracy: {xgbaccuracy:.4f}",
    "\nConfusion Matrix:",
    conf_matrix,
    "\nClassification Report",
    classification_rep,
    sep="\n",
)

Accuracy: 0.8756

Confusion Matrix:
[[63199  5661]
 [ 3649  2319]]

Classification Report
              precision    recall  f1-score   support

           0       0.95      0.92      0.93     68860
           1       0.29      0.39      0.33      5968

    accuracy                           0.88     74828
   macro avg       0.62      0.65      0.63     74828
weighted avg       0.89      0.88      0.88     74828

