In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 45.
heart_disease = fetch_ucirepo(id=45)

# Feature matrix and target frame (both pandas DataFrames).
X, y = heart_disease.data.features, heart_disease.data.targets

# Uncomment to inspect the dataset metadata / per-variable information:
# print(heart_disease.metadata)
# print(heart_disease.variables)


In [5]:
# Distinct label values present in the target column.
y["num"].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [25]:
# Display the feature frame (the same object that was bound to X when the data was loaded).
heart_disease.data.features

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [31]:
# Preview the first rows of the target frame (single column: num).
y.head()

Unnamed: 0,num
0,0
1,2
2,1
3,0
4,0


In [9]:

# Hold out 20% of the rows for testing; stratify on the target so both
# splits keep the same label proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# 2. Create the scaler (this cell only instantiates it; fitting and saving
#    happen in a later cell).
scaler = StandardScaler()

In [37]:
# Fit the scaler on the training split only, so no test-set statistics leak in.
X_train_scaled = scaler.fit_transform(X_train)

# NOTE(review): the hyper-parameter search in this notebook is fitted on the
# raw X_train and evaluated on the raw X_test, not on X_train_scaled. The
# pickled scaler is therefore NOT part of the trained model: applying it to
# inputs before calling the saved model would feed it features on a scale it
# was never trained on. Either train on X_train_scaled or drop the scaler.

# Use a context manager so the file is flushed and closed deterministically
# instead of leaking the handle returned by open().
with open('heart_disease_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [15]:
# Column order of the training features, kept so inference inputs can be
# arranged identically.
feature_order = X_train.columns.tolist()

In [49]:
# Show the captured feature/column order.
feature_order

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [17]:
# The target column `num` holds five distinct classes (0-4), so the model is
# configured for multi-class classification. The previous
# objective="binary:logistic" / eval_metric="logloss" described a two-class
# problem that does not match this target.
xgb = XGBClassifier(
    objective="multi:softprob",   # per-class probabilities; predict() returns the arg-max class
    eval_metric="mlogloss",       # multi-class log loss, the counterpart of binary logloss
    random_state=42
)

In [19]:
# Candidate hyper-parameter values that RandomizedSearchCV samples from
param_dist = dict(
    n_estimators=list(range(100, 501, 100)),
    max_depth=[3, 4, 5, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
    reg_lambda=[1, 1.5, 2, 3, 5],
)


In [21]:

# Randomized hyper-parameter search over `param_dist` for the `xgb` estimator
random_search = RandomizedSearchCV(
    xgb,
    param_dist,
    # search budget and evaluation protocol
    n_iter=30,            # 30 sampled parameter combinations
    cv=5,                 # each scored with 5-fold cross-validation
    scoring="accuracy",
    # reproducibility
    random_state=42,
    # execution
    n_jobs=-1,            # run on every available CPU core
    verbose=1,
)


In [23]:
# Run the search on the training split. Note that the raw X_train is used
# here, not the X_train_scaled array produced by the scaler.
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [41]:
# Estimator selected by the randomized search.
best_model = random_search.best_estimator_

# Score it on the held-out test split.
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)



In [43]:
# Summarise the search: chosen parameters, mean cross-validated accuracy of
# the best candidate, and accuracy on the held-out test split.
for label, value in (
    ("Best Parameters:", random_search.best_params_),
    ("Validation Accuracy:", random_search.best_score_),
    ("Test Accuracy:", acc),
):
    print(label, value)

Best Parameters: {'subsample': 0.8, 'reg_lambda': 1, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.6}
Validation Accuracy: 0.5991496598639456
Test Accuracy: 0.5737704918032787


In [45]:
# Persist the tuned model. The context manager guarantees the file is
# flushed and closed instead of leaking the handle returned by open().
with open('heart_disease_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [47]:
# Also save the feature order so inference code can arrange its input columns
# the same way as the training data. A context manager is used so the file
# handle is closed deterministically.
with open('heart_disease_feature_order.pkl', 'wb') as feature_order_file:
    pickle.dump(feature_order, feature_order_file)

print("Model and scaler have been saved!")

Model and scaler have been saved!
