In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.svm import SVC
import plotly.graph_objects as go 
from sklearn.ensemble import RandomForestClassifier


# Single seed passed as random_state to both train_test_split calls and to the
# XGBClassifier and SVC constructors further down.
RANDOM_STATE = 75

In [3]:
# Read the raw CAD dataset; the relative path is resolved against the kernel's
# current working directory.
csv_path = "CAD Database.csv"
df = pd.read_csv(csv_path)

# Preview the first rows of the raw frame
df.head()

Unnamed: 0,Age,Weight,Height,Sex,BMI,Diabetes Mellitus,Hypertension,Current Smoker,EX-Smoker,Obesity,...,Left Ventricular Hypertrophy,Low Density Lipoprotein,High Density Lipoprotein,Ejection Fraction,Regional Wall Motion Abnormalities,Valvular Heart Disease,Left Anterior Descending,Left Circumflex,Right Coronary Artery,Result
0,53,90,175,Male,29.387755,0,1,1,0,Y,...,N,155,30.0,50,0,N,Stenotic,Normal,Stenotic,CAD
1,67,70,157,Fmale,28.398718,0,1,0,0,Y,...,N,121,36.0,40,4,N,Stenotic,Stenotic,Normal,CAD
2,54,54,164,Male,20.077335,0,0,1,0,N,...,N,70,45.0,40,2,mild,Stenotic,Normal,Normal,CAD
3,66,67,158,Fmale,26.838648,0,1,0,0,Y,...,N,55,27.0,55,0,Severe,Normal,Normal,Normal,Normal
4,50,87,153,Fmale,37.165193,0,1,0,0,Y,...,N,110,50.0,50,0,Severe,Normal,Normal,Normal,Normal


In [4]:
# Keep only the model inputs plus the target column ('Result')
selected_columns = [
    'Age',
    'Sex',
    'Diabetes Mellitus',
    'Hypertension',
    'Current Smoker',
    'EX-Smoker',
    'Obesity',
    'Typical Chest Pain',
    'Left Ventricular Hypertrophy',
    'Ejection Fraction',
    'Regional Wall Motion Abnormalities',
    'Valvular Heart Disease',
    'Result',
]
df = df[selected_columns]

In [5]:
# Text-valued columns that are one-hot encoded before modelling
cat_variables = [
    'Sex',
    'Obesity',
    'Left Ventricular Hypertrophy',
    'Valvular Heart Disease',
]

In [6]:
# Replace each categorical column with one indicator column per category
df = pd.get_dummies(df, columns=cat_variables)

In [7]:
# Inspect the frame after one-hot encoding: each categorical column has been
# replaced by <column>_<category> indicator columns.
df

Unnamed: 0,Age,Diabetes Mellitus,Hypertension,Current Smoker,EX-Smoker,Typical Chest Pain,Ejection Fraction,Regional Wall Motion Abnormalities,Result,Sex_Fmale,Sex_Male,Obesity_N,Obesity_Y,Left Ventricular Hypertrophy_N,Left Ventricular Hypertrophy_Y,Valvular Heart Disease_Moderate,Valvular Heart Disease_N,Valvular Heart Disease_Severe,Valvular Heart Disease_mild
0,53,0,1,1,0,0,50,0,CAD,False,True,False,True,True,False,False,True,False,False
1,67,0,1,0,0,1,40,4,CAD,True,False,False,True,True,False,False,True,False,False
2,54,0,0,1,0,1,40,2,CAD,False,True,True,False,True,False,False,False,False,True
3,66,0,1,0,0,0,55,0,Normal,True,False,False,True,True,False,False,False,True,False
4,50,0,1,0,0,0,50,0,Normal,True,False,False,True,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,58,0,0,0,0,1,45,0,CAD,False,True,False,True,True,False,False,True,False,False
299,55,0,0,0,0,0,40,0,Normal,True,False,False,True,True,False,False,False,False,True
300,48,0,1,0,0,0,55,0,Normal,True,False,False,True,True,False,False,True,False,False
301,57,1,0,0,0,0,55,0,Normal,True,False,False,True,True,False,False,True,False,False


In [8]:
# Every column except the target 'Result' is a model input.
# Compare by equality: `x not in 'Result'` is a substring test against the
# string 'Result', so it would also drop any column whose name is a fragment of
# it (e.g. 'Res' or 'R') and raises TypeError for non-string column labels.
features = [col for col in df.columns if col != 'Result']

In [9]:
# Cast every input column (including the boolean dummy indicators) to integers
features_as_int = df[features].astype(int)
df[features] = features_as_int

In [10]:
# Encode the target: CAD -> 1 (positive class), Normal -> 0.
# Series.map turns every value missing from the dict into NaN. That covers
# unexpected labels and also a second run of this cell on already-encoded
# values, so fail loudly instead of letting NaN targets reach the models.
result_codes = {"CAD": 1, "Normal": 0}
encoded_result = df['Result'].map(result_codes)
unmapped = encoded_result.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'Result'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Result' (expected 'CAD' or 'Normal'): {unexpected}"
    )
df['Result'] = encoded_result

# XGBoost

In [11]:
# Hold out 30% of the rows, then split that hold-out in half:
# roughly 70% train / 15% eval (early stopping) / 15% test.
X_all = df[features]
y_all = df['Result']
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all, train_size=0.7, random_state=RANDOM_STATE
)
X_eval, X_test, y_eval, y_test = train_test_split(
    X_temp, y_temp, train_size=0.5, random_state=RANDOM_STATE
)

In [12]:
# Up to 400 depth-3 trees at learning rate 0.01 with L1/L2 regularisation.
# early_stopping_rounds=30 is evaluated against the eval split passed to fit().
xgb_params = {
    'n_estimators': 400,
    'learning_rate': 0.01,
    'max_depth': 3,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'early_stopping_rounds': 30,
    'verbosity': 1,
    'random_state': RANDOM_STATE,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], verbose=True)


[0]	validation_0-logloss:0.63334
[1]	validation_0-logloss:0.62875
[2]	validation_0-logloss:0.62427
[3]	validation_0-logloss:0.61983
[4]	validation_0-logloss:0.61556
[5]	validation_0-logloss:0.61123
[6]	validation_0-logloss:0.60709
[7]	validation_0-logloss:0.60295
[8]	validation_0-logloss:0.59866
[9]	validation_0-logloss:0.59471
[10]	validation_0-logloss:0.59058
[11]	validation_0-logloss:0.58671
[12]	validation_0-logloss:0.58307
[13]	validation_0-logloss:0.57904
[14]	validation_0-logloss:0.57661
[15]	validation_0-logloss:0.57260
[16]	validation_0-logloss:0.57013
[17]	validation_0-logloss:0.56783
[18]	validation_0-logloss:0.56389
[19]	validation_0-logloss:0.56175
[20]	validation_0-logloss:0.55793
[21]	validation_0-logloss:0.55495
[22]	validation_0-logloss:0.55273
[23]	validation_0-logloss:0.54923
[24]	validation_0-logloss:0.54620
[25]	validation_0-logloss:0.54342
[26]	validation_0-logloss:0.54134
[27]	validation_0-logloss:0.53812
[28]	validation_0-logloss:0.53542
[29]	validation_0-loglos

In [13]:
# Column 1 of predict_proba is the predicted probability of class 1 (CAD);
# it is the score used for the ROC curve.
xgb_test_proba = xgb_model.predict_proba(X_test)
y_score = xgb_test_proba[:, 1]

In [14]:
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# The printed value is unchanged (accuracy is symmetric), but the order matters
# as soon as an asymmetric metric such as precision or recall is substituted.
xgb_test_pred = xgb_model.predict(X_test)
print(f"Metrics test:\n\tAccuracy score: {accuracy_score(y_test, xgb_test_pred):.4f}")

Metrics test:
	Accuracy score: 0.8913


In [15]:
# ROC summary for XGBoost on the test split: scalar AUC plus the curve points
auc_xgb = roc_auc_score(y_test, y_score)
fpr_xgb, tpr_xgb, thresholds = roc_curve(y_test, y_score)

# Logistic Regression

In [16]:
# Fit logistic regression on the unscaled training features; the solver is
# allowed up to 650 iterations.
classifier = LogisticRegression(max_iter=650).fit(X_train, y_train)
classifier

In [17]:
# Probability of class 1 (CAD) for each test row, used as the ROC score
lr_test_proba = classifier.predict_proba(X_test)
y_pred_proba = lr_test_proba[:, 1]

In [18]:
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# The printed value is unchanged (accuracy is symmetric), but the order matters
# as soon as an asymmetric metric such as precision or recall is substituted.
lr_test_pred = classifier.predict(X_test)
print(f"Metrics test:\n\tAccuracy score: {accuracy_score(y_test, lr_test_pred):.4f}")

Metrics test:
	Accuracy score: 0.8478


In [19]:
# ROC summary for logistic regression on the test split
auc_lr = roc_auc_score(y_test, y_pred_proba)
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_pred_proba)

# SVM 

In [20]:
# Standardise features with statistics learned from the training split only,
# then apply that same fitted transform to the test split.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [21]:
# Linear-kernel SVM trained on the standardised training features
svm_params = dict(kernel='linear', C=1.0, random_state=RANDOM_STATE)
svm_classifier = SVC(**svm_params)
svm_classifier.fit(X_train_normalized, y_train)

In [22]:
# NOTE: despite the variable name, these are decision_function scores, not
# probabilities. They are used below only as ranking scores for the ROC curve.
svm_decision_scores = svm_classifier.decision_function(X_test_normalized)
y_pred_proba_svm = svm_decision_scores

In [23]:
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# The printed value is unchanged (accuracy is symmetric), but the order matters
# as soon as an asymmetric metric such as precision or recall is substituted.
svm_test_pred = svm_classifier.predict(X_test_normalized)
print(f"SVM Metrics test:\n\tAccuracy score: {accuracy_score(y_test, svm_test_pred):.4f}")

SVM Metrics test:
	Accuracy score: 0.8478


In [24]:
# ROC summary for the SVM on the test split, computed from its decision scores
auc_svm = roc_auc_score(y_test, y_pred_proba_svm)
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_pred_proba_svm)

# ROC Curve


In [25]:
# XGBoost ROC curve; the legend entry carries the AUC rounded to two decimals
xgb_legend = f'XGB (Area = {auc_xgb:.2f})'
trace_xgb = go.Scatter(x=fpr_xgb, y=tpr_xgb, mode='lines', name=xgb_legend)

In [26]:
# Logistic-regression ROC curve; the legend entry carries the AUC rounded to two decimals
lr_legend = f'Logistic Regression (Area = {auc_lr:.2f})'
trace_lr = go.Scatter(x=fpr_lr, y=tpr_lr, mode='lines', name=lr_legend)


In [27]:
# SVM ROC curve; the legend entry carries the AUC rounded to two decimals
svm_legend = f'SVM (Area = {auc_svm:.2f})'
trace_svm = go.Scatter(x=fpr_svm, y=tpr_svm, mode='lines', name=svm_legend)


In [28]:
# Chance-level reference: dashed diagonal from (0, 0) to (1, 1)
dashed_line = {'dash': 'dash'}
trace_random = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dashed_line,
)

In [29]:
# Overlay the three model ROC curves with the chance diagonal
data = [trace_xgb, trace_lr, trace_svm, trace_random]

# Fixed-size square canvas (autosize disabled) with labelled axes
figure_side_px = 800
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis={'title': 'False Positive Rate'},
    yaxis={'title': 'True Positive Rate'},
    autosize=False,
    width=figure_side_px,
    height=figure_side_px,
    showlegend=True,
)

# Assemble the figure from the traces and layout, then render it
fig = go.Figure(data=data, layout=layout)
fig.show()