This notebook contains the hyperparameter tuning for the different models.

# DATA LOADING


In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data_up.csv')

In [3]:
data.head(5)

Unnamed: 0,SEQN,Gender,Age,Annual-Family-Income,Ratio-Family-Income-Poverty,X60-sec-pulse,Systolic,Diastolic,Weight,Height,...,Total-Cholesterol,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Health-Insurance,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke,CoronaryHeartDisease
0,2,1,77,8,5.0,68,98,56,75.4,174.0,...,5.56,1.39,4.7,3,3,1,2,2,2,0
1,5,1,49,11,5.0,66,122,83,92.5,178.3,...,7.21,1.08,5.5,1,1,1,2,2,2,0
2,12,1,37,11,4.93,64,174,99,99.2,180.0,...,4.03,0.98,5.2,2,1,1,2,1,1,0
3,13,1,70,3,1.07,102,130,66,63.6,157.7,...,8.12,1.28,7.6,3,3,1,1,1,2,0
4,14,1,81,5,2.67,72,136,61,75.5,166.2,...,4.5,1.04,5.8,1,1,1,2,2,2,0


In [4]:
data.columns

Index(['SEQN', 'Gender', 'Age', 'Annual-Family-Income',
       'Ratio-Family-Income-Poverty', 'X60-sec-pulse', 'Systolic', 'Diastolic',
       'Weight', 'Height', 'Body-Mass-Index', 'White-Blood-Cells',
       'Lymphocyte', 'Monocyte', 'Eosinophils', 'Basophils', 'Red-Blood-Cells',
       'Hemoglobin', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
       'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol',
       'Segmented-Neutrophils', 'Hematocrit', 'Red-Cell-Distribution-Width',
       'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose',
       'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid',
       'Triglycerides', 'Total-Cholesterol', 'HDL', 'Glycohemoglobin',
       'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes',
       'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease'],
      dtype='object')

In [5]:
data.shape

(37079, 51)

In [6]:
data.isnull().sum()

Unnamed: 0,0
SEQN,0
Gender,0
Age,0
Annual-Family-Income,0
Ratio-Family-Income-Poverty,0
X60-sec-pulse,0
Systolic,0
Diastolic,0
Weight,0
Height,0


Conclusion : Data is clean and all the data is numerical
CoronaryHeartDisease is target class. This is a classification problem predicting the heart disease.
SEQN is just representing the row number hence we need not take in the prediction columns.
Here, we have 49 features and 1 target column.
We have originally 37079 rows with no cell being empty.

# DATA ANALYSIS


In [7]:
data.describe()

Unnamed: 0,SEQN,Gender,Age,Annual-Family-Income,Ratio-Family-Income-Poverty,X60-sec-pulse,Systolic,Diastolic,Weight,Height,...,Total-Cholesterol,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Health-Insurance,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke,CoronaryHeartDisease
count,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,...,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0,37079.0
mean,48901.041236,1.513282,48.943661,7.358208,2.559026,72.57925,124.090078,69.919253,80.988276,167.389601,...,5.081713,1.370344,5.676496,1.78384,1.598856,1.218587,1.907333,1.549502,1.796165,0.04067
std,26753.636441,0.49983,18.01044,3.994083,1.624789,12.242108,19.254741,13.575804,20.678734,10.122908,...,1.072682,0.415985,1.050223,0.448324,0.511199,0.461102,0.349674,0.49755,0.402853,0.197527
min,2.0,1.0,20.0,1.0,0.0,32.0,0.0,0.0,32.3,129.7,...,1.53,0.16,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,26120.5,1.0,33.0,4.0,1.14,64.0,111.0,62.0,66.5,160.0,...,4.32,1.07,5.2,2.0,1.0,1.0,2.0,1.0,2.0,0.0
50%,50065.0,2.0,48.0,7.0,2.18,72.0,121.0,70.0,78.2,167.1,...,5.02,1.29,5.4,2.0,2.0,1.0,2.0,2.0,2.0,0.0
75%,71173.5,2.0,63.0,10.0,4.13,80.0,134.0,78.0,92.1,174.6,...,5.74,1.6,5.8,2.0,2.0,1.0,2.0,2.0,2.0,0.0
max,93702.0,2.0,85.0,15.0,5.0,224.0,270.0,132.0,371.0,204.5,...,14.09,5.84,18.8,3.0,3.0,9.0,3.0,2.0,2.0,1.0


# Feature Selection


**ALGORITHM : EBPSO**

Installing MealPy

In [8]:
!pip install mealpy --quiet

Importing necessary libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from mealpy.swarm_based.PSO import C_PSO
from mealpy.utils.space import BinaryVar

Loading Dataset

In [10]:
df = pd.read_csv('data_up.csv')

In [11]:
df.head(5)

Unnamed: 0,SEQN,Gender,Age,Annual-Family-Income,Ratio-Family-Income-Poverty,X60-sec-pulse,Systolic,Diastolic,Weight,Height,...,Total-Cholesterol,HDL,Glycohemoglobin,Vigorous-work,Moderate-work,Health-Insurance,Diabetes,Blood-Rel-Diabetes,Blood-Rel-Stroke,CoronaryHeartDisease
0,2,1,77,8,5.0,68,98,56,75.4,174.0,...,5.56,1.39,4.7,3,3,1,2,2,2,0
1,5,1,49,11,5.0,66,122,83,92.5,178.3,...,7.21,1.08,5.5,1,1,1,2,2,2,0
2,12,1,37,11,4.93,64,174,99,99.2,180.0,...,4.03,0.98,5.2,2,1,1,2,1,1,0
3,13,1,70,3,1.07,102,130,66,63.6,157.7,...,8.12,1.28,7.6,3,3,1,1,1,2,0
4,14,1,81,5,2.67,72,136,61,75.5,166.2,...,4.5,1.04,5.8,1,1,1,2,2,2,0


In [12]:
df.columns

Index(['SEQN', 'Gender', 'Age', 'Annual-Family-Income',
       'Ratio-Family-Income-Poverty', 'X60-sec-pulse', 'Systolic', 'Diastolic',
       'Weight', 'Height', 'Body-Mass-Index', 'White-Blood-Cells',
       'Lymphocyte', 'Monocyte', 'Eosinophils', 'Basophils', 'Red-Blood-Cells',
       'Hemoglobin', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.',
       'Mean-cell-Hemoglobin', 'Platelet-count', 'Mean-Platelet-Vol',
       'Segmented-Neutrophils', 'Hematocrit', 'Red-Cell-Distribution-Width',
       'Albumin', 'ALP', 'AST', 'ALT', 'Cholesterol', 'Creatinine', 'Glucose',
       'GGT', 'Iron', 'LDH', 'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid',
       'Triglycerides', 'Total-Cholesterol', 'HDL', 'Glycohemoglobin',
       'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes',
       'Blood-Rel-Diabetes', 'Blood-Rel-Stroke', 'CoronaryHeartDisease'],
      dtype='object')

In [13]:
X = df.drop(columns=['SEQN', 'CoronaryHeartDisease'])

In [14]:
X.columns

Index(['Gender', 'Age', 'Annual-Family-Income', 'Ratio-Family-Income-Poverty',
       'X60-sec-pulse', 'Systolic', 'Diastolic', 'Weight', 'Height',
       'Body-Mass-Index', 'White-Blood-Cells', 'Lymphocyte', 'Monocyte',
       'Eosinophils', 'Basophils', 'Red-Blood-Cells', 'Hemoglobin',
       'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.', 'Mean-cell-Hemoglobin',
       'Platelet-count', 'Mean-Platelet-Vol', 'Segmented-Neutrophils',
       'Hematocrit', 'Red-Cell-Distribution-Width', 'Albumin', 'ALP', 'AST',
       'ALT', 'Cholesterol', 'Creatinine', 'Glucose', 'GGT', 'Iron', 'LDH',
       'Phosphorus', 'Bilirubin', 'Protein', 'Uric.Acid', 'Triglycerides',
       'Total-Cholesterol', 'HDL', 'Glycohemoglobin', 'Vigorous-work',
       'Moderate-work', 'Health-Insurance', 'Diabetes', 'Blood-Rel-Diabetes',
       'Blood-Rel-Stroke'],
      dtype='object')

Ensuring x is a DataFrame and y is 1D array

In [15]:
y = df['CoronaryHeartDisease']

Defining the fitness Function and Train/test split

In [16]:
def fitness(sol):
    mask = sol > 0.5
    if not mask.any():
        return 1
    Xs = X.loc[:, mask]

    Xtr, Xte, ytr, yte = train_test_split(Xs, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(Xtr, ytr)

    return 1 - accuracy_score(yte, clf.predict(Xte))

Defining Binary Variables

In [17]:
bounds = [BinaryVar() for _ in range(X.shape[1])]
problem = {
    "obj_func": fitness,
    "bounds": bounds,
    "minmax": "min",
}

Run Binary

In [18]:
model = C_PSO(epoch=5, pop_size=20)
result = model.solve(problem)

INFO:mealpy.swarm_based.PSO.C_PSO:C_PSO(epoch=5, pop_size=20, c1=2.05, c2=2.05, w_min=0.4, w_max=0.9)
INFO:mealpy.swarm_based.PSO.C_PSO:>>>Problem: P, Epoch: 1, Current best: 0.04139697950377563, Global best: 0.04139697950377563, Runtime: 272.29701 seconds
INFO:mealpy.swarm_based.PSO.C_PSO:>>>Problem: P, Epoch: 2, Current best: 0.04139697950377563, Global best: 0.04139697950377563, Runtime: 287.22887 seconds
INFO:mealpy.swarm_based.PSO.C_PSO:>>>Problem: P, Epoch: 3, Current best: 0.04139697950377563, Global best: 0.04139697950377563, Runtime: 289.44448 seconds
INFO:mealpy.swarm_based.PSO.C_PSO:>>>Problem: P, Epoch: 4, Current best: 0.04139697950377563, Global best: 0.04139697950377563, Runtime: 272.75832 seconds
INFO:mealpy.swarm_based.PSO.C_PSO:>>>Problem: P, Epoch: 5, Current best: 0.04139697950377563, Global best: 0.04139697950377563, Runtime: 277.83502 seconds


Extracting Best Solution

In [19]:
best_position = result.solution
best_fitness = getattr(result, "solution_fitness", None) or getattr(result, "fitness_history", [None])[-1]

Selected Features

In [20]:
selected_features = np.where(best_position > 0.5)[0]
X_selected = X.iloc[:, selected_features] if selected_features.size > 0 else pd.DataFrame()

Calculating Accuracy and printing Results


In [21]:
if X_selected.shape[1] > 0:
    clf_final = RandomForestClassifier(random_state=42)
    accuracy = np.mean(cross_val_score(clf_final, X_selected, y, cv=5))
else:
    accuracy = 0

print("Best Fitness (1 - Accuracy):", best_fitness)
print("Selected Features Indices:", selected_features)
print("Total Features Selected:", len(selected_features))
print("Accuracy of Selected Features:", accuracy)

Best Fitness (1 - Accuracy): None
Selected Features Indices: [ 1  3  4  5  6  7  9 11 22 23 25 26 27 28 29 31 33 34 36 41 43 44 45 46]
Total Features Selected: 24
Accuracy of Selected Features: 0.9593570498006534


# MODEL SELECTION


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SupportVectorMachine': SVC(random_state=42, probability=True),
    'K-NearestNeighbors': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'GaussianNaiveBayes': GaussianNB()
}

In [23]:
X = df.drop(columns=['SEQN', 'CoronaryHeartDisease'])
y = df['CoronaryHeartDisease']
filtered_X = X.iloc[:, selected_features]

In [24]:
for name, model in models.items():
    X_train, X_test, y_train, y_test = train_test_split(filtered_X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")

RandomForestClassifier Accuracy: 0.9586030204962244
LogisticRegression Accuracy: 0.9581984897518878
SupportVectorMachine Accuracy: 0.9583333333333334
K-NearestNeighbors Accuracy: 0.9572545846817692
DecisionTree Accuracy: 0.9236785329018339
GaussianNaiveBayes Accuracy: 0.9016990291262136


# Hyperparameter Tuning

In [25]:
!pip install optuna pandas scikit-learn



In [26]:
import optuna
from sklearn.metrics import accuracy_score

In [27]:
def objective(trial):
    model_name = trial.suggest_categorical('model_name', list(models.keys()))
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    if model_name == 'RandomForestClassifier':
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            criterion=criterion,
            random_state=42
        )

    elif model_name == 'DecisionTree':
        model = DecisionTreeClassifier(
            max_depth=max_depth,
            criterion=criterion,
            random_state=42
        )

    elif model_name == 'LogisticRegression':
        C = trial.suggest_float('C', 1e-3, 10.0, log=True)
        penalty = trial.suggest_categorical('penalty', ['l2'])
        model = LogisticRegression(
            C=C, penalty=penalty, solver='lbfgs', max_iter=1000, random_state=42
        )

    elif model_name == 'SupportVectorMachine':
        C = trial.suggest_float('C', 1e-3, 10.0, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        model = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)

    elif model_name == 'K-NearestNeighbors':
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)

    elif model_name == 'GaussianNaiveBayes':
        var_smoothing = trial.suggest_float('var_smoothing', 1e-10, 1e-7, log=True)
        model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [28]:
study = optuna.create_study(direction='maximize', study_name='model_comparison')
study.optimize(objective, n_trials=5, timeout=None)
print("Best Trial:")
best_trial = study.best_trial
print(f"Best Model: {best_trial.params['model_name']}")
print(f"Best Accuracy: {best_trial.value:.4f}")
print("Best Hyperparameters:")
for k, v in best_trial.params.items():
    print(f"  {k}: {v}")


[I 2025-10-12 12:19:27,007] A new study created in memory with name: model_comparison
[I 2025-10-12 12:19:27,531] Trial 0 finished with value: 0.9340614886731392 and parameters: {'model_name': 'DecisionTree', 'n_estimators': 104, 'max_depth': 16, 'criterion': 'gini'}. Best is trial 0 with value: 0.9340614886731392.
[I 2025-10-12 12:19:27,552] Trial 1 finished with value: 0.9016990291262136 and parameters: {'model_name': 'GaussianNaiveBayes', 'n_estimators': 90, 'max_depth': 28, 'criterion': 'gini', 'var_smoothing': 3.311339912165814e-09}. Best is trial 0 with value: 0.9340614886731392.
[I 2025-10-12 12:19:34,255] Trial 2 finished with value: 0.9581984897518878 and parameters: {'model_name': 'LogisticRegression', 'n_estimators': 199, 'max_depth': 23, 'criterion': 'gini', 'C': 7.570081215500731, 'penalty': 'l2'}. Best is trial 2 with value: 0.9581984897518878.
[I 2025-10-12 12:19:38,368] Trial 3 finished with value: 0.9581984897518878 and parameters: {'model_name': 'LogisticRegression', 

Best Trial:
Best Model: LogisticRegression
Best Accuracy: 0.9585
Best Hyperparameters:
  model_name: LogisticRegression
  n_estimators: 88
  max_depth: 7
  criterion: gini
  C: 0.012821048506279397
  penalty: l2


# Final Model


In [29]:
selected_model = RandomForestClassifier(n_estimators=89, max_depth=22, criterion='entropy', random_state=42)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(filtered_X, y, test_size=0.2, random_state=42)
selected_model.fit(X_train, y_train)
y_pred = selected_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Selected Model Accuracy: {accuracy}")

Selected Model Accuracy: 0.9583333333333334
