In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint as sp_randint
import joblib

# Load the dataset
df = pd.read_csv('heart_disease.csv')

# Handle missing values.
# Rows whose label is missing are dropped instead of imputed: filling the
# target column with its mean would produce a fractional value that is not a
# valid class label for the classifier trained further down.
df = df.dropna(subset=['HeartDiseaseorAttack'])

# Any remaining gaps are filled with the mean of their own column.
# numeric_only=True keeps the mean computation from raising if the CSV ever
# contains a non-numeric column; reassigning (rather than inplace=True) keeps
# the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))


df.head(5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [12]:
# Splitting the dataset into features (X) and target (y)
X = df.drop('HeartDiseaseorAttack', axis=1)
y = df['HeartDiseaseorAttack']

# Normalize and standardize the data.
# The scaler is fitted on the training rows only, so the mean/variance it
# learns (and that get persisted to scaler.pkl) carry no information from the
# held-out test rows. The split arguments here must stay identical to the
# train/test split performed before model fitting (test_size=0.2,
# random_state=42): with the same row count and seed, train_test_split picks
# the same rows, so these are exactly the rows the model is trained on.
X_fit_rows = train_test_split(X, test_size=0.2, random_state=42)[0]
scaler = StandardScaler()
scaler.fit(X_fit_rows)

# Scale every row with the train-fitted statistics; X_scaled keeps the original
# row order so it stays aligned with y.
X_scaled = scaler.transform(X)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the final forest
# NOTE(review): presumably the result of an earlier hyperparameter search that
# is not part of the visible cells -- confirm where these values came from.
best_params = dict(
    bootstrap=False,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=162,
)

# Build the classifier and fit it on the training portion in one step
# (fit returns the estimator itself)
best_model = RandomForestClassifier(random_state=42, **best_params).fit(
    X_train_full, y_train_full
)

# Persist the fitted model to disk
joblib.dump(best_model, 'heart_disease_model.pkl')

['heart_disease_model.pkl']

In [14]:
# Score the fitted model on the held-out test split.
# NOTE(review): the printed message says "full dataset", but the metrics below
# are computed on X_test_full / y_test_full only -- "full" presumably refers to
# the model having been trained on the complete data set; confirm.
y_pred_best = best_model.predict(X_test_full)
best_accuracy = accuracy_score(y_true=y_test_full, y_pred=y_pred_best)

# Per-class precision / recall / F1, rendered as plain text
report_text = classification_report(y_true=y_test_full, y_pred=y_pred_best)

print("Optimised model accuracy on full dataset:", best_accuracy)
print("Classification report for the optimised model:")
print(report_text)


Optimised model accuracy on full dataset: 0.9080731630400505
Classification report for the optimised model:
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95     45968
         1.0       0.65      0.05      0.09      4768

    accuracy                           0.91     50736
   macro avg       0.78      0.52      0.52     50736
weighted avg       0.89      0.91      0.87     50736

