In [5]:
#load in required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import f1_score

# Load the labelled training set and the unlabelled test set from CSV files
# located in the notebook's working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test_X.csv')

# Preview the first 5 rows of the training data.
# NOTE(review): the preview below lists a leading "Unnamed: 0" column; if that is a
# row index saved into train.csv it will be carried into the model features —
# confirm whether it should be dropped or read with index_col=0.
train_data.head()

Unnamed: 0,PatientID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,247,48,M,ASY,122,275,1,ST,150,Y,2.0,Down,1
1,829,29,M,ATA,130,204,0,LVH,202,N,0.0,Up,0
2,446,54,M,ASY,130,0,0,ST,117,Y,1.4,Flat,1
3,780,64,F,ASY,180,325,0,Normal,154,Y,0.0,Up,0
4,488,65,M,TA,140,252,0,Normal,135,N,0.3,Up,0


Preprocessing data

In [6]:
# Preprocess the training frame: remove the identifier column, integer-encode the
# categorical columns, standardise the numerical columns, then separate the
# features from the target.
# NOTE: `train_data` is rebound to the processed frame, so re-running this cell
# without reloading the CSV first fails on the PatientID drop.

# PatientID is an identifier rather than a predictor, so it is excluded from the model inputs
train_data = train_data.drop(columns="PatientID")

# Categorical (string-valued) columns and numerical columns
cat = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# One fitted encoder per categorical column, kept so the test set can reuse
# exactly the same label -> integer mapping later on
encoders = {}
for col in cat:
    # Bind the instance to its own name: assigning it to `LabelEncoder` would
    # shadow the imported class and make the next iteration's call fail with
    # "TypeError: 'LabelEncoder' object is not callable".
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    encoders[col] = encoder

# Standardise the numerical columns; the fitted scaler is reused on the test set
scaler = StandardScaler()
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])

# Split the processed frame into features and the HeartDisease target
X_train = train_data.drop(columns="HeartDisease")
y_train = train_data["HeartDisease"]

Hyperparameter tuning

In [7]:
# Candidate hyperparameter values; the grid search tries every combination
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base forest so repeated runs are comparable
clf = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold search scored by accuracy, using all available cores;
# fit() returns the search object itself, so construction and fitting are chained
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Estimator configured with the winning parameter combination
best_clf = grid_search.best_estimator_

# Re-score the selected configuration with 5-fold cross-validation.
# Note: this uses the same rows the search was tuned on, so the figure is an
# optimistic estimate rather than an independent one.
scores = cross_val_score(
    estimator=best_clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

mean_score, std_score = scores.mean(), scores.std()
print(f"Cross-Validation Accuracy: {mean_score:.4f} ± {std_score:.4f}")

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Cross-Validation Accuracy: 0.8736 ± 0.0183


In [8]:
# Estimate the F1 score on a 20% hold-out of the training rows.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit a separate forest carrying the tuned hyperparameters instead of refitting
# `best_clf` itself. GridSearchCV (default refit=True) already fitted `best_clf`
# on all of X_train; calling best_clf.fit() on the 80% split would overwrite that
# model, and the later test-set predictions would come from a forest trained on
# fewer rows.
val_clf = RandomForestClassifier(**best_clf.get_params())
val_clf.fit(X_train_split, y_train_split)

# Caveat: the hyperparameters were chosen using rows that are part of this
# hold-out, so the score below is an optimistic estimate.
y_val_pred = val_clf.predict(X_val_split)
f1 = f1_score(y_val_split, y_val_pred)
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.9017


  warn(


In [9]:
# Prepare the test set with the encoders and scaler fitted on the training data,
# predict with the tuned forest, and write the submission file.
# NOTE: `test_data` is rebound to the processed frame, so re-running this cell
# without reloading the CSV first fails on the PatientID lookup.

# Keep the patient IDs for the submission, then remove them from the model inputs
patient_ids = test_data['PatientID'].values
test_data = test_data.drop(columns="PatientID")

# Apply the training-time label mapping to each categorical column
# (LabelEncoder.transform raises ValueError for a label it did not see during fit)
for col in cat:
    test_data[col] = encoders[col].transform(test_data[col])

# Scale numerical features with the statistics learned from the training data
test_data[numerical_features] = scaler.transform(test_data[numerical_features])

# Select exactly the columns the model was trained on, in training order, so that
# a reordered or extra column in the test CSV cannot be fed to the model as the
# wrong feature; a missing column raises a KeyError naming it.
X_test = test_data[list(X_train.columns)]

# Predict
y_pred = best_clf.predict(X_test)

# Assemble one row per patient with the predicted label
submission_df = pd.DataFrame({
    "PatientID": patient_ids,
    "HeartDisease": y_pred
})

# Save to CSV without the DataFrame index
submission_file_path = "submission-final.csv"
submission_df.to_csv(submission_file_path, index=False)