In [None]:
import pandas as pd
import numpy as np

# EDA
import matplotlib.pyplot as plt
import seaborn as sns

## **Data Understanding**

### **Data Collection**

In [None]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("Heart_Disease_Prediction (1).csv")
# Show the first rows as the cell output.
df.head()

### **Exploratory Data Analysis**

In [None]:
# Descriptive statistics of the loaded frame.
df.describe()

In [None]:
# Preview the first rows again (same call as in the loading cell).
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value (rebinds `df`).
df = df.dropna()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicated rows (rebinds `df`).
df = df.drop_duplicates()

In [None]:
# Preview the frame after the null/duplicate clean-up above.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy so the column assignments below never write into a
# view left behind by the dropna()/drop_duplicates() cells.
df = df.copy()

# Label-encode only the non-numeric (string) columns. Encoding every column would
# replace measurements such as Age, BP or Cholesterol by the rank of each value
# among the column's unique values and throw away the real magnitudes, so numeric
# columns are left untouched.
for col in df.select_dtypes(exclude="number").columns:
    # A fresh encoder per column keeps each column's class mapping independent.
    df[col] = LabelEncoder().fit_transform(df[col])

# Convert all columns to a numeric dtype
df = df.apply(pd.to_numeric)

In [None]:
# Pairwise Pearson correlation of all columns, drawn as an annotated heatmap.
cor = df.corr(method="pearson")
plt.figure(figsize=(30, 30))
sns.heatmap(cor, annot=True, linewidths=0.5, center=0, cmap="coolwarm")
plt.show()

## **Data Preparation**

In [None]:
from sklearn.model_selection import train_test_split

# Target column vs. every remaining column as features.
y = df["Heart Disease"]
X = df.drop(columns=["Heart Disease"])

# 80/20 hold-out split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.2)

In [None]:
# Report how many rows ended up in the full set and in each split.
for split_name, split in (("whole", X), ("train", X_train), ("test", X_test)):
    print(f"Total # of sample in {split_name} dataset: {len(split)}")

In [None]:
# Names of the int/float-typed feature columns of X.
# NOTE(review): this list is not referenced again in the visible notebook; the
# scaling cell standardizes every column of X_train/X_test regardless of it.
numerical_features = X.select_dtypes(include=["int", "float"]).columns.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training split only and standardize ALL of its columns.
# X_train is rebound to the scaler's output, so re-running this cell would scale
# already-scaled data; later cells index the result positionally (X_train[:, idx]).
X_train = scaler.fit_transform(X_train)

# Apply the statistics learned from the training split to the test split.
X_test = scaler.transform(X_test)

## **ANN Modelling**

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
def checkpoint_callback(
    filepath="best_model_checkpoint.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
):
    """Build a Keras ``ModelCheckpoint`` callback.

    Every argument is forwarded unchanged to ``ModelCheckpoint`` under the
    keyword of the same name, and the resulting callback is returned.
    """
    options = {
        "filepath": filepath,
        "monitor": monitor,
        "mode": mode,
        "save_best_only": save_best_only,
        "verbose": verbose,
    }
    return ModelCheckpoint(**options)


# Early-stopping callback reused by the fit calls in this notebook: it watches
# val_loss, waits 10 epochs without improvement (patience) and restores the best
# weights when it stops training.
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, verbose=0, restore_best_weights=True
)

In [None]:
import time


def create_model(input_size):
    """Return a compiled binary classifier for ``input_size`` input features.

    Architecture: Dense(64, relu) -> 3 x Dense(32, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer, binary cross-entropy loss and the
    accuracy metric.
    """
    layers = [Dense(64, input_dim=input_size, activation="relu")]
    layers.extend(Dense(32, activation="relu") for _ in range(3))
    layers.append(Dense(1, activation="sigmoid"))

    model = Sequential(layers)
    model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])
    return model


# Function to train the model
def train_model(
    model,
    X_train,
    y_train,
    early_stopping_on=False,
    checkpoint_on=False,
    checkpoint_path=None,
):
    """Fit ``model`` and return how long the fit took, in seconds.

    Parameters
    ----------
    model : object exposing a Keras-style ``fit`` method.
    X_train, y_train : training features and labels passed straight to ``fit``.
    early_stopping_on : bool
        When True, the notebook-level ``early_stopping`` callback is used.
    checkpoint_on : bool
        When True, the best model (lowest ``val_loss``) is saved to
        ``checkpoint_path`` through a ``ModelCheckpoint`` callback.
    checkpoint_path : str or None
        Destination file of the checkpoint; required when ``checkpoint_on``.

    Returns
    -------
    float
        Wall-clock duration of ``model.fit``.

    Raises
    ------
    ValueError
        If ``checkpoint_on`` is True but no ``checkpoint_path`` is given.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    callbacks = []

    if checkpoint_on:
        if not checkpoint_path:
            raise ValueError("checkpoint_path must be provided when checkpoint_on=True")

        # Create the checkpoint directory if it doesn't exist; otherwise saving
        # the first checkpoint fails on a fresh working directory.
        checkpoint_dir = os.path.dirname(checkpoint_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

        callbacks.append(
            ModelCheckpoint(
                filepath=checkpoint_path, monitor="val_loss", save_best_only=True
            )
        )

    # Add early stopping callback if enabled
    if early_stopping_on:
        callbacks.append(early_stopping)

    start_time = time.time()
    model.fit(
        X_train,
        y_train,
        epochs=200,
        batch_size=64,
        validation_split=0.2,  # hold out 20% of the training data for val_loss
        callbacks=callbacks,
    )
    end_time = time.time()
    return end_time - start_time

In [None]:
# Baseline ANN on all features; the best checkpoint is written to this path.
ann_best_model_file_path = "heart_failure/models/ANN_best_model.h5"
ann_model = create_model(input_size=X_train.shape[1])

# Fit the baseline and keep the measured fit duration for the comparison plots.
ann_training_time = train_model(
    model=ann_model,
    X_train=X_train,
    y_train=y_train,
    checkpoint_path=ann_best_model_file_path,
    checkpoint_on=True,
)

## **ANN Model Evaluation**

In [None]:
from keras.models import load_model

# Probability cut-off above which a sample is assigned to class 1.
threshold = 0.5

# Reload the checkpointed baseline and score the test split with it.
ann_best_model = load_model(ann_best_model_file_path)
ann_predictions = ann_best_model.predict(X_test)

# Turn the predicted probabilities into 0/1 class labels.
ann_prediction_class = np.greater(ann_predictions, threshold).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy, precision, recall and F1 of the baseline ANN on the test split.
ann_accuracy, ann_precision, ann_recall, ann_f1 = (
    metric(y_test, ann_prediction_class)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy:", ann_accuracy),
    ("Precision:", ann_precision),
    ("Recall:", ann_recall),
    ("F1-score:", ann_f1),
):
    print(label, value)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline ANN on the test split.
conf_matrix_ann = confusion_matrix(y_test, ann_prediction_class)

# Draw it as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_ann, ax=ax, annot=True, fmt='g', cmap='Blues', cbar=False)
ax.set_title('Confusion Matrix - ANN')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


## **GA + ANN Model**

In [None]:
import tensorflow as tf
import pygad
import numpy as np
import pandas as pd
from numpy.random import RandomState

In [None]:
# Fixed seed and a NumPy RandomState built from it.
seed = 1234
state = RandomState(seed)

In [None]:
def fitness_func(ga_instance, solution, solution_idx):
    """PyGAD fitness: validation accuracy of an ANN trained on the selected features.

    ``solution`` is a 0/1 mask over the feature columns; genes equal to 1 mark
    the columns that are kept. The score is the validation accuracy (on the 20%
    of the training data held out by ``validation_split``) at the epoch with
    the lowest validation loss. The test split is deliberately NOT used here:
    selecting features on it would leak test information into the final,
    reported metrics.

    Returns 0.0 for a mask that selects no feature, because a network cannot be
    built without inputs. ``ga_instance`` and ``solution_idx`` are unused but
    kept in the signature.
    """
    selected_features_indices = np.where(np.asarray(solution) == 1)[0]
    print(solution)

    # An empty selection cannot be trained on; give it the worst possible score.
    if selected_features_indices.size == 0:
        return 0.0

    X_train_selected = X_train[:, selected_features_indices]

    model = create_model(X_train_selected.shape[1])
    history = model.fit(
        X_train_selected,
        y_train,
        epochs=200,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Score the epoch whose weights early stopping restores (lowest val_loss).
    best_epoch = int(np.argmin(history.history["val_loss"]))
    accuracy = float(history.history["val_accuracy"][best_epoch])
    print(accuracy)
    return accuracy

In [None]:
def on_generation(ga_instance):
    """PyGAD ``on_generation`` callback: log the best fitness and its change.

    The best fitness is read once from ``ga_instance.last_generation_fitness``.
    Calling ``best_solution()`` without ``pop_fitness`` makes PyGAD recompute
    the population fitness, which here means retraining networks on every call.
    The module-level ``last_fitness`` is updated with the new best value.
    """
    global last_fitness

    best_fitness = float(
        ga_instance.best_solution(pop_fitness=ga_instance.last_generation_fitness)[1]
    )

    print(
        "Generation = {generation}".format(generation=ga_instance.generations_completed)
    )
    print("Fitness    = {fitness}".format(fitness=best_fitness))
    print(f"Change     = {best_fitness - last_fitness}")

    last_fitness = best_fitness

In [None]:
# Best fitness seen so far; updated by the on_generation callback.
last_fitness = 0

# GA hyper-parameters.
num_generations = 50
num_parents_mating = 4
pop_size = 8

# One gene per feature column.
num_features = X.shape[1]

# Every gene is a binary keep/drop flag. PyGAD treats a flat gene_space as the
# pool of values shared by ALL genes, so the pool must simply be {0, 1}; a random
# 0/1 vector here would make the pool depend on the seed (and could contain only
# one of the two values).
gene_space = [0, 1]

parent_selection_type = "sss"
crossover_type = "single_point"
mutation_type = "random"

In [None]:
# Create an instance of the GA class
ga_instance = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func,
    on_generation=on_generation,
    sol_per_pop=pop_size,
    num_genes=num_features,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    crossover_type=crossover_type,
    mutation_type=mutation_type,
    # Seed PyGAD's random operators (initial population, crossover, mutation) so
    # repeated runs explore the same chromosomes; the seed was defined but unused.
    random_seed=seed,
)

In [None]:
# Run the genetic algorithm configured above; fitness_func is evaluated for the
# candidate solutions and on_generation is called as generations complete.
ga_instance.run()

In [None]:
ga_instance.plot_fitness()

# Read the best solution from the fitness values of the last generation. Without
# pop_fitness PyGAD recomputes the population fitness, i.e. retrains networks
# whose (stochastic) scores may differ from those the search actually used.
solution, solution_fitness, solution_idx = ga_instance.best_solution(
    pop_fitness=ga_instance.last_generation_fitness
)
print("Feature of the best solution : {solution}".format(solution=solution))
print(
    "Fitness value of the best solution = {solution_fitness}".format(
        solution_fitness=solution_fitness
    )
)
print("Index of the best solution : {solution_idx}".format(solution_idx=solution_idx))

# best_solution_generation is -1 when PyGAD did not record a generation for it.
if ga_instance.best_solution_generation != -1:
    print(
        "Best fitness value reached after {best_solution_generation} generations.".format(
            best_solution_generation=ga_instance.best_solution_generation
        )
    )
print(f"Number of features selected = {sum(solution)}")

In [None]:
# Column positions whose gene is switched on in the best GA solution.
ga_selected_features = np.flatnonzero(solution > 0.5)

# Restrict both splits to the selected columns.
X_train_ga_selected = np.take(X_train, ga_selected_features, axis=1)
X_test_ga_selected = np.take(X_test, ga_selected_features, axis=1)

ga_ann_best_model_file_path = "heart_failure/models/GA_ANN_best_model.h5"
ga_ann_model = create_model(input_size=X_train_ga_selected.shape[1])

# Fit the ANN on the GA-selected features and keep the fit duration.
ga_ann_training_time = train_model(
    model=ga_ann_model,
    X_train=X_train_ga_selected,
    y_train=y_train,
    checkpoint_path=ga_ann_best_model_file_path,
    checkpoint_on=True,
)

## **GA + ANN Model Evaluation**

In [None]:
# Show the GA selection both as column positions and as the matching column names of X.
print("Genetic Algorithm Selected Features Indices:", ga_selected_features)
print("Genetic Algorithm Selected Features:", X.columns[ga_selected_features].tolist())

In [None]:
# Probability cut-off above which a sample is assigned to class 1.
threshold = 0.5

# Reload the checkpointed GA+ANN model and score the reduced test split.
ga_ann_best_model = load_model(ga_ann_best_model_file_path)
ga_ann_prediction = ga_ann_best_model.predict(X_test_ga_selected)
ga_ann_prediction_class = np.greater(ga_ann_prediction, threshold).astype(int)

In [None]:
# Accuracy, precision, recall and F1 of the GA+ANN model on the test split.
ga_ann_accuracy, ga_ann_precision, ga_ann_recall, ga_ann_f1 = (
    metric(y_test, ga_ann_prediction_class)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy:", ga_ann_accuracy),
    ("Precision:", ga_ann_precision),
    ("Recall:", ga_ann_recall),
    ("F1-score:", ga_ann_f1),
):
    print(label, value)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the GA+ANN model on the test split.
conf_matrix_ga_ann = confusion_matrix(y_test, ga_ann_prediction_class)

# Draw it as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_ga_ann, ax=ax, annot=True, fmt='g', cmap='Blues', cbar=False)
ax.set_title('Confusion Matrix - GA + ANN')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


## **PSO + ANN Model**

In [None]:
# Import necessary libraries
import numpy as np
import random

# Define PSO parameters
num_particles = 10
max_iter = 50
w = 0.5  # inertia weight applied to the previous velocity
c1 = 1.5  # weight of the pull towards each particle's own best position
c2 = 1.5  # weight of the pull towards the swarm's best position

# Initialize particles from the notebook's seeded RandomState (`state`) rather
# than the unseeded global NumPy generator, so the starting swarm is reproducible.
particles_position = state.rand(num_particles, X_train.shape[1])
particles_velocity = state.uniform(-1, 1, size=(num_particles, X_train.shape[1]))
particles_best_position = particles_position.copy()
particles_best_fitness = np.zeros(num_particles)

# Best position/fitness found by any particle so far.
global_best_position = np.zeros(X_train.shape[1])
global_best_fitness = float("-inf")


# Define fitness function
def fitness_function(selected_features, threshold=0.45):
    """Score a particle: validation accuracy of an ANN on the features it selects.

    Parameters
    ----------
    selected_features : array-like of float
        Particle position, one value per feature column.
    threshold : float, default 0.45
        A feature is selected when its position value is strictly greater than
        this cut-off. The default matches the cut-off the notebook applies to
        the global best position after the search. (Casting the continuous
        position to bool would select every non-zero entry, i.e. practically
        all features for every particle.)

    Returns
    -------
    float
        Validation accuracy (on the 20% of the training data held out by
        ``validation_split``) at the epoch with the lowest validation loss, or
        0.0 when no feature is selected. The test split is not used, so the
        search does not leak test information into the reported metrics.
    """
    feature_mask = np.asarray(selected_features) > threshold

    # A network cannot be built without inputs; give an empty mask the worst score.
    if not feature_mask.any():
        return 0.0

    X_train_selected = X_train[:, feature_mask]
    model = create_model(X_train_selected.shape[1])
    history = model.fit(
        X_train_selected,
        y_train,
        epochs=200,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Score the epoch whose weights early stopping restores (lowest val_loss).
    best_epoch = int(np.argmin(history.history["val_loss"]))
    return float(history.history["val_accuracy"][best_epoch])


# Perform PSO optimization
for iteration in range(max_iter):
    print("Iteration:", iteration)

    # Phase 1: evaluate every particle and refresh personal/global bests.
    for i in range(num_particles):
        fitness = fitness_function(particles_position[i])
        print(f"Particle {i} fitness: {fitness}")

        # Update personal best
        if fitness > particles_best_fitness[i]:
            particles_best_fitness[i] = fitness
            particles_best_position[i] = particles_position[i].copy()

        # Update global best
        if fitness > global_best_fitness:
            global_best_fitness = fitness
            global_best_position = particles_position[i].copy()

    # Phase 2: move every particle.
    for i in range(num_particles):
        # One random scalar per component, drawn from the seeded RandomState so
        # the trajectory is reproducible (the global `random` module is unseeded).
        r1 = state.random_sample()
        r2 = state.random_sample()
        cognitive_component = (
            c1 * r1 * (particles_best_position[i] - particles_position[i])
        )
        social_component = c2 * r2 * (global_best_position - particles_position[i])
        particles_velocity[i] = (
            w * particles_velocity[i] + cognitive_component + social_component
        )

        # Squash the velocity through a sigmoid so each position entry lies in
        # (0, 1); the keep/drop decision is made later by thresholding it.
        particles_position[i] = 1 / (1 + np.exp(-particles_velocity[i]))

# Print global best fitness and position
print("Global best fitness:", global_best_fitness)
print("Global best position:", global_best_position)

In [None]:
# Column positions whose entry in the global best position exceeds the 0.45 cut-off.
pso_selected_features = np.flatnonzero(global_best_position > 0.45)

# Restrict both splits to the selected columns.
X_train_pso_selected = np.take(X_train, pso_selected_features, axis=1)
X_test_pso_selected = np.take(X_test, pso_selected_features, axis=1)

pso_ann_best_model_file_path = "heart_failure/models/PSO_ANN_best_model.h5"
pso_ann_model = create_model(input_size=X_train_pso_selected.shape[1])

# Fit the ANN on the PSO-selected features and keep the fit duration.
pso_ann_training_time = train_model(
    model=pso_ann_model,
    X_train=X_train_pso_selected,
    y_train=y_train,
    checkpoint_path=pso_ann_best_model_file_path,
    checkpoint_on=True,
)

In [None]:
# Show the PSO selection both as column positions and as the matching column names of X.
print("PSO Selected Features Indices:", pso_selected_features)
print("PSO Selected Features:", X.columns[pso_selected_features].tolist())

In [None]:
# Probability cut-off above which a sample is assigned to class 1.
threshold = 0.5

# Reload the checkpointed PSO+ANN model and score the reduced test split.
pso_ann_best_model = load_model(pso_ann_best_model_file_path)
pso_ann_prediction = pso_ann_best_model.predict(X_test_pso_selected)

# Turn the predicted probabilities into 0/1 class labels.
pso_ann_prediction_class = np.greater(pso_ann_prediction, threshold).astype(int)

## **PSO + ANN Model Evaluation**

In [None]:

# Accuracy, precision, recall and F1 of the PSO+ANN model on the test split.
pso_ann_accuracy, pso_ann_precision, pso_ann_recall, pso_ann_f1 = (
    metric(y_test, pso_ann_prediction_class)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy:", pso_ann_accuracy),
    ("Precision:", pso_ann_precision),
    ("Recall:", pso_ann_recall),
    ("F1-score:", pso_ann_f1),
):
    print(label, value)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the PSO+ANN model on the test split.
conf_matrix_pso_ann = confusion_matrix(y_test, pso_ann_prediction_class)

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_pso_ann, annot=True, cmap='Blues', fmt='g', cbar=False)
# The title names the model actually plotted here (it used to say plain "ANN").
plt.title('Confusion Matrix - PSO + ANN')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


## **Result Comparison**

In [None]:

# Resolve the selections of the CURRENT run to column names. Hard-coded lists
# pasted from an earlier run would silently disagree with what GA/PSO just picked.
def _as_feature_names(selected):
    """Map positional column indices to X's column names; pass names through as-is.

    Accepting names as well keeps this cell safe to re-run after the variables
    have already been converted.
    """
    return [
        X.columns[item] if isinstance(item, (int, np.integer)) else item
        for item in selected
    ]


pso_selected_features = _as_feature_names(pso_selected_features)

ga_selected_features = _as_feature_names(ga_selected_features)

In [None]:
import pandas as pd

# Every candidate feature, in display order.
all_features = [
    'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
    'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
    'ST depression', 'Slope of ST', 'Number of vessels fluro',
    'Thallium',
]

# One record per feature stating whether PSO and GA kept it.
comparison_data = [
    {
        'Feature': feature,
        'PSO Selected': 'Yes' if feature in pso_selected_features else 'No',
        'GA Selected': 'Yes' if feature in ga_selected_features else 'No',
    }
    for feature in all_features
]

comparison_table = pd.DataFrame(comparison_data)

# Display the comparison table
comparison_table

In [None]:
# Fit durations of the three final models, printed and shown as a bar chart.
models = ["ANN", "GA+ANN", "PSO+ANN"]
execution_times = [ann_training_time, ga_ann_training_time, pso_ann_training_time]

for model_label, seconds in zip(models, execution_times):
    print(f"Execution time for {model_label} model:", seconds)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(models, execution_times, color=["blue", "green", "orange"])
ax.set_xlabel("Models")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title("Execution Time Comparison between Different Models")
ax.grid(axis="y")

plt.show()

In [None]:
# Define model names
model_names = ["ANN", "ANN + GA", "ANN + PSO"]

# Accuracy, precision, recall and F1 of the three models, in model_names order
accuracy = [ann_accuracy, ga_ann_accuracy, pso_ann_accuracy]
precision = [ann_precision, ga_ann_precision, pso_ann_precision]
recall = [ann_recall, ga_ann_recall, pso_ann_recall]
f1 = [ann_f1, ga_ann_f1, pso_ann_f1]

In [None]:
# One bar chart per metric on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

metric_panels = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1", f1),
]

for ax, (metric_name, scores) in zip(axes.flat, metric_panels):
    ax.bar(model_names, scores, color=["blue", "orange", "green"])
    ax.set_ylabel(metric_name)
    ax.set_title(f"Comparison of {metric_name}")
    # Zoom the y-axis onto the observed range so small differences stay visible.
    ax.set_ylim(min(scores) - 0.01, max(scores) + 0.01)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Header row followed by one row of metrics per model.
table_data = [
    ["Model", "Accuracy", "Precision", "Recall", "F1"],
    [model_names[0], ann_accuracy, ann_precision, ann_recall, ann_f1],
    [model_names[1], ga_ann_accuracy, ga_ann_precision, ga_ann_recall, ga_ann_f1],
    [model_names[2], pso_ann_accuracy, pso_ann_precision, pso_ann_recall, pso_ann_f1],
]

header, *rows = table_data
comparison_table = pd.DataFrame(rows, columns=header)

# Best accuracy first, shaded with a blue gradient.
comparison_table_sorted = comparison_table.sort_values(by='Accuracy', ascending=False)
styled_comparison_table = comparison_table_sorted.style.background_gradient(cmap='Blues')

styled_comparison_table


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(ax, y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` on ``ax``.

    The matrix is rendered as an annotated heatmap without a colour bar, and
    the axes receive ``title`` plus fixed predicted/true axis labels.
    """
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        ax=ax,
        annot=True,
        fmt='g',
        cmap='Blues',
        cbar=False,
    )
    ax.set(title=title, xlabel='Predicted Label', ylabel='True Label')

# Side-by-side confusion matrices of the three classifiers on the test split.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panels = [
    ('ANN', ann_prediction_class),
    ('GA-ANN', ga_ann_prediction_class),
    ('PSO-ANN', pso_ann_prediction_class),
]
for ax, (panel_title, predicted) in zip(axes, panels):
    plot_confusion_matrix(ax, y_test, predicted, panel_title)

plt.tight_layout()
plt.show()
