# Comparing Four Machine Learning Models and an Ensemble for Predicting Heart Attacks from Routine Clinical Features

## Variable Breakdown:

*FILL IN*

## Importing Necessary Libraries

In [8]:
## Import is taken from Assignment 3, as it uses XGBoost, DTs, and NN. Should have all the imports we need.
#!pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
)
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
precision_score,
recall_score,
f1_score,      
average_precision_score,
confusion_matrix,
classification_report
)
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import torch.nn.utils as utils


seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## Load Dataset

In [9]:
df = pd.read_csv('Medicaldataset.csv')

df.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,64,1,66,160,83,160.0,1.8,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.06,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative


In [10]:
df.tail()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
1314,44,1,94,122,67,204.0,1.63,0.006,negative
1315,66,1,84,125,55,149.0,1.33,0.172,positive
1316,45,1,85,168,104,96.0,1.24,4.25,positive
1317,54,1,58,117,68,443.0,5.8,0.359,positive
1318,51,1,94,157,79,134.0,50.89,1.77,positive


## Dataset Exploration

### Checking for Null Values

In [11]:
df.isnull().sum()

Age                         0
Gender                      0
Heart rate                  0
Systolic blood pressure     0
Diastolic blood pressure    0
Blood sugar                 0
CK-MB                       0
Troponin                    0
Result                      0
dtype: int64

### Checking Target Class Imbalances

In [12]:
res_groups = df.groupby('Result')

print(f'Group Totals\nNegative: {res_groups.size().iloc[0]}\nPositive: {res_groups.size().iloc[1]}\n')
print(f'Group Percents\nNegative: {res_groups.size().iloc[0] / df["Result"].shape[0] * 100 : 0.3f}\nPositive: {res_groups.size().iloc[1] / df["Result"].shape[0] * 100 : 0.3f}')

Group Totals
Negative: 509
Positive: 810

Group Percents
Negative:  38.590
Positive:  61.410


### Analyzing Variable Means

In [13]:
print(f'Mean Age: {df["Age"].mean() : 0.3f}')
print(f'Mean Heart Rate: {df["Heart rate"].mean() : 0.3f}')
print(f'Mean Systolic Blood Pressure: {df["Systolic blood pressure"].mean() : 0.3f}')
print(f'Mean Diastolic Blood Pressure: {df["Diastolic blood pressure"].mean() : 0.3f}')
print(f'Mean Blood Sugar Levels: {df["Blood sugar"].mean() : 0.3f}')
print(f'Mean CK-MB: {df["CK-MB"].mean() : 0.3f}')
print(f'Mean Troponin: {df["Troponin"].mean() : 0.3f}')

Mean Age:  56.192
Mean Heart Rate:  78.337
Mean Systolic Blood Pressure:  127.171
Mean Diastolic Blood Pressure:  72.269
Mean Blood Sugar Levels:  146.634
Mean CK-MB:  15.274
Mean Troponin:  0.361


### Checking Gender Variable Imbalances

### Checking Correlation Between Variables

## Data Split and Scaling

### Assign X and y variables

In [14]:
X = df.drop(columns=['Result'])
X.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin
0,64,1,66,160,83,160.0,1.8,0.012
1,21,1,94,98,46,296.0,6.75,1.06
2,55,1,64,160,77,270.0,1.99,0.003
3,64,1,70,120,55,270.0,13.87,0.122
4,55,1,64,112,65,300.0,1.08,0.003


In [15]:
y = df['Result']
y.head()

0    negative
1    positive
2    negative
3    positive
4    negative
Name: Result, dtype: object

### Encode Positive as 1 and Negative as 0

In [16]:
y = y.map({"negative": 0, "positive": 1}).astype(int)
y.head()

0    0
1    1
2    0
3    1
4    0
Name: Result, dtype: int64

### Test/Train Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

### Scale Data

In [18]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Prepare Data for Neural Network (Convert to PyTorch Tensors)

In [19]:
# Convert scaled data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # Shape: (n, 1) for BCELoss
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)    # Shape: (n, 1) for BCELoss

# Move to device (GPU if available, otherwise CPU)
X_train_tensor = X_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

print('Training samples:', X_train_tensor.shape[0])
print('Test samples:', X_test_tensor.shape[0])
print('Number of features:', X_train_tensor.shape[1])
print('\nTensor shapes:')
print('X_train_tensor:', X_train_tensor.shape)
print('X_test_tensor:', X_test_tensor.shape)
print('y_train_tensor:', y_train_tensor.shape)
print('y_test_tensor:', y_test_tensor.shape)
print('\nDevice:', device)

Training samples: 1055
Test samples: 264
Number of features: 8

Tensor shapes:
X_train_tensor: torch.Size([1055, 8])
X_test_tensor: torch.Size([264, 8])
y_train_tensor: torch.Size([1055, 1])
y_test_tensor: torch.Size([264, 1])

Device: cpu


## Neural Network (NN)

In [20]:
class HeartAttackNN(nn.Module):
    def __init__(self, input_size=8, hidden1=32, hidden2=16, hidden3=8, output_size=1):
        super(HeartAttackNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, hidden3) 
        self.fc4 = nn.Linear(hidden3, output_size)

    def forward(self, x):
        out = torch.relu(self.fc1(x)) #ReLU to add non-linearity while dealing with vanishing gradients
        out = torch.relu(self.fc2(x))
        out = torch.relu(self.fc3(x)) 
        out = torch.sigmoid(self.fc4(x)) #sigmoid for binary classification
        return out

model_heart = HeartAttackNN().to(device)
print(model_heart)
    
        

HeartAttackNN(
  (fc1): Linear(in_features=8, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=8, bias=True)
  (fc4): Linear(in_features=8, out_features=1, bias=True)
)


### Hyperparameters for NN

In [21]:
input_size = 8
hidden1 = 32
hidden2 = 16
hidden3 = 8
output_size = 1
learning_rate = 0.01
num_epochs = 1000

#Model initialization
model = HeartAttackNN(input_size, hidden1, hidden2, hidden3, output_size)


### Loss Function and Optimizer

In [22]:
criterion = nn.BCELoss()  # Binary Cross Entropy
criterion

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)

#### Training Loop For NN

In [23]:
def train_nn(model, X_train, y_train, X_test, y_test, num_epochs, learning_rate):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
    return model
        

#### Evaluation of NN

In [24]:
@torch.no_grad() #Disables gradient calculation for evaluation
def eval_nn(model, X_test, y_test):
    model.eval()
    y_prob = model(X_test).cpu().numpy().ravel()
    y_pred = (y_prob >= 0.5).astype(int)

    return {
        "accuracy": accuracy_score(y_test.cpu().numpy(), y_pred),
        "precision": precision_score(y_test.cpu().numpy(), y_pred),
        "recall": recall_score(y_test.cpu().numpy(), y_pred),
        "f1": f1_score(y_test.cpu().numpy(), y_pred),
        "auroc": roc_auc_score(y_test.cpu().numpy(), y_prob),
        "auprc": average_precision_score(y_test.cpu().numpy(), y_prob),
        "y_prob": y_prob,
        "y_pred": y_pred,
    }

# Random Forest Classifier

### Hyperparameter Options for Grid Search

In [25]:
hyparam_grid_rfc = {
    'n_estimators': [300, 600],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 0.5]
}

### Initialize Random Forest Classifier

In [None]:
model_rfc = RandomForestClassifier(random_state=seed)

### Run a Grid Search to Find Best Hyperparameters

In [None]:
grid_rfc = GridSearchCV(
    estimator=model_rfc,
    scoring='accuracy',
    cv=3,
    param_grid=hyparam_grid_rfc,
    n_jobs=-1
)

In [None]:
grid_rfc.fit(X_train, y_train)

In [None]:
print('Best accuracy: ', grid_rfc.best_score_)
print('Best Set of Hyperparameters: ', grid_rfc.best_params_)

### Assign the Best Hyperparameters to the Random Forest Classifier

In [30]:
model_rfc = grid_rfc.best_estimator_

### Examine Performance of Random Forest Classifier Model

In [31]:
y_pred = model_rfc.predict(X_test)

acc_rfc = accuracy_score(y_test, y_pred)
auc_rfc = roc_auc_score(y_test, y_pred)

print('Accuracy of Random Forest Classifier: ', acc_rfc)
print('AUC of Random Forest Classifier: ', auc_rfc)

Accuracy of Random Forest Classifier:  0.9848484848484849
AUC of Random Forest Classifier:  0.9839640405758368


# XGBoost Training

The parameter grid is as follows:

In [32]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

In [38]:
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

grid = GridSearchCV(
    estimator=xgb,
    scoring='accuracy',
    cv=3,
    param_grid=param_grid,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)

xgb = grid.best_estimator_

y_val_pred = xgb.predict(X_test)
y_val_proba = xgb.predict_proba(X_test)[:, 1]

acc_xgb = accuracy_score(y_test, y_val_pred)
auc_xgb = roc_auc_score(y_test, y_val_proba)

print(f'Accuracy for XGBoost: {acc_xgb:.2%}')
print(f'AUC for XGBoost: {auc_xgb:.2f}')


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


0.993371212121212
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
Accuracy for XGBoost: 98.11%
AUC for XGBoost: 0.99


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.
