In [None]:
# --- Installing Libraries ---
!pip install ydata-profiling
!pip install pywaffle
!pip install highlight-text
!pip install Pillow

In [None]:
# --- Importing Libraries ---
import numpy as np
import pandas as pd
import ydata_profiling
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
import os
import yellowbrick
import joblib

from ydata_profiling import ProfileReport
from pywaffle import Waffle
from statsmodels.graphics.gofplots import qqplot
from PIL import Image
from highlight_text import fig_text
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from yellowbrick.classifier import PrecisionRecallCurve, ROCAUC, ConfusionMatrix
from yellowbrick.model_selection import LearningCurve, FeatureImportances
from yellowbrick.contrib.wrapper import wrap
from yellowbrick.style import set_palette

In [None]:
# --- Importing Dataset ---
df = pd.read_csv("heart.csv")

# --- Previewing the Imported Dataset ---
# NOTE(review): `clr` (console colour codes) is not defined in this cell; it must already exist in the kernel.
print(clr.start+'.: Imported Dataset :.'+clr.end)
print(clr.color+'*' * 23)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.hide(axis='index') is the supported replacement.
df.head().style.background_gradient(cmap='Reds').hide(axis='index')

In [None]:
# --- Dataset Report ---
# Report options are collected in one mapping so each setting sits on its own line.
profile_options = dict(
    title='Heart Disease Dataset Report',
    minimal=True,
    progress_bar=False,
    samples=None,
    correlations=None,
    interactions=None,
    explorative=True,
    dark_mode=True,
    notebook={'iframe':{'height': '600px'}},
    html={'style':{'primary_color': color_line}},
    missing_diagrams={'heatmap': False, 'dendrogram': False},
)
ProfileReport(df, **profile_options).to_notebook_iframe()

In [None]:
# --- Correlation Map Variables ---
suptitle = dict(x=0.1, y=1.01, fontsize=13, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.1, y=0.98, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
xy_label = dict(size=6)
highlight_textprops = [{'weight':'bold', 'color': colors[0]}, {'weight':'bold', 'color': colors[2]}]

# --- Correlation Map (Heatmap) ---
# The correlation matrix is computed once and reused for both the mask and the heatmap.
corr_matrix = df.corr()
# Hide the upper triangle (including the diagonal) so every pair is shown only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap=color_map, linewidths=0.2, cbar=False, annot_kws={"size": 7}, rasterized=True, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, **xy_label)
ax.set_yticklabels(ax.get_yticklabels(), **xy_label)
ax.grid(False)
fig_text(s='Numerical Variables Correlation Map', **suptitle)
fig_text(s='<Chest pain type, max heart rate, and slope> positively correlate with <target> variables.', highlight_textprops=highlight_textprops, **title)
plt.tight_layout(rect=[0, 0.04, 1, 1.01])
plt.show();

## <div class="header2">5.1 | Disease Distribution based on Chest Pain Type in Each Gender</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 1 Dataframes ---
# Number of patients for every (sex, target, cp) combination present in the data.
df_eda1 = df[['sex', 'cp', 'target']].groupby(['sex', 'target']).cp.value_counts().reset_index(name='total')
cp_types = np.sort(df_eda1['cp'].unique())

def _eda1_subset(sex, target):
    """Return the per-chest-pain-type totals of one (sex, target) group, ordered by cp.

    groupby(...).value_counts() orders rows by frequency, not by cp, and omits
    combinations with no patients. Re-indexing on the sorted cp values puts
    row i at chest pain type cp_types[i] (matching the y tick labels) and
    fills absent combinations with 0.
    """
    totals = (
        df_eda1[(df_eda1['sex'] == sex) & (df_eda1['target'] == target)]
        .set_index('cp')['total']
        .reindex(cp_types, fill_value=0)
    )
    return pd.DataFrame({'sex': sex, 'target': target, 'cp': cp_types, 'total': totals.to_numpy()})

df_eda1_mns = _eda1_subset(0, 0)
df_eda1_ms = _eda1_subset(0, 1)
df_eda1_fns = _eda1_subset(1, 0)
df_eda1_fs = _eda1_subset(1, 1)

# --- EDA 1: Variables ---
y = np.arange(len(cp_types))
x_ticks = list(np.arange(-80, 60, 20))
# Counts on the left-hand side are drawn as negative widths; the tick labels show magnitudes only.
x_labels = [str(tick).strip("-") for tick in x_ticks]
y_ticks = list(np.arange(0, 4, 1))
labels_pain_type = [f'Type {cp}' for cp in cp_types]
labels_legend = ['Not Sick', 'Sick']
bar_height = 0.35
bar_style = dict(zorder=3, edgecolor='black', linewidth=0.5, alpha=0.85)
cnt_label = dict(fontsize=7, horizontalalignment='center', verticalalignment='center')
axvspan = dict(alpha=0.2, zorder=2)
tick_params = dict(length=3, width=1, color=color_line)
xy_label = dict(fontweight='bold', fontsize=8)
suptitle = dict(x=0.16, y=0.96, fontsize=13, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.16, y=0.93, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
highlight_textprops = [{'weight':'bold', 'color': colors[0]}, {'weight':'bold', 'color': colors[5]}]

# --- Display EDA 1 ---
fig, ax = plt.subplots(figsize=(9, 5))
bar_mns = plt.barh(y+bar_height, df_eda1_mns['total'], color=colors[3], height=bar_height, **bar_style)
bar_ms = plt.barh(y, df_eda1_ms['total'], color=colors[4], height=bar_height, **bar_style)
bar_fns = plt.barh(y+bar_height, df_eda1_fns['total']*-1, color=colors[3], height=bar_height, **bar_style)
bar_fs = plt.barh(y, df_eda1_fs['total']*-1, color=colors[1], height=bar_height, **bar_style)
ax.set_yticks(y + bar_height / 2)
ax.set_yticklabels(labels_pain_type, fontsize=7)
# Annotate each bar with its count: inside the bar when it is wide enough,
# otherwise just past the bar's outer end. Dedicated names are used for the
# rectangle position so the module-level `x` / `y` are not overwritten.
for rect in ax.patches:
    width, height = rect.get_width(), rect.get_height()
    rect_x, rect_y = rect.get_xy()
    count = abs(width)
    direction = 1 if width >= 0 else -1
    if count > 10:
        label_x = rect_x + width/2
    else:
        label_x = rect_x + width + 1.5*direction
    ax.text(label_x, rect_y+height/2, '{:.0f}'.format(count), **cnt_label)
plt.xticks(fontsize=7, ticks=x_ticks, labels=x_labels)
plt.xlabel('\nTotal', **xy_label)
plt.ylabel('Chest Pain Type\n', **xy_label)
plt.grid(axis='y', alpha=0, zorder=2)
plt.grid(axis='x', which='major', alpha=0.3, color=color_grid, linestyle='dotted', zorder=1)
plt.axvspan(-85, 0, color=colors[1], **axvspan)
plt.axvspan(40, 0, color=colors[4], **axvspan)
leg_fsick = mpatches.Patch(color=colors[1], label='Sick Female')
leg_msick = mpatches.Patch(color=colors[4], label='Sick Male')
leg_notsick = mpatches.Patch(color=colors[3], label='Not Sick')
plt.legend(handles=[leg_fsick, leg_msick, leg_notsick], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, borderpad=3, frameon=False, fontsize=7, columnspacing=3)
plt.tick_params(bottom='on', **tick_params)
ax=plt.gca()
# Only the bottom spine stays visible.
for spine in ax.spines.values():
    spine.set_color('None')
ax.spines['bottom'].set_visible(True)
ax.spines['bottom'].set_color(color_line)
fig_text(s='Disease Distribution based on Chest Pain Type in Each Gender', **suptitle)
fig_text(s="Chest pain types 1, 2, and 3 <have more sick patients> than those <who don't>.", highlight_textprops=highlight_textprops, **title)
plt.show();

## <div class="header2">5.2 | Maximum Heart Rate vs. Age based on Patients Sickness</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 2 Variables ---
scatter_style = dict(linewidth=0.65, edgecolor=scatter_color_edge, alpha=0.8)
sub_scatter_style_color = dict(s=5, alpha=0.65, linewidth=0.15, zorder=10, edgecolor=scatter_color_edge)
sub_scatter_style_grey = dict(s=5, alpha=0.3, linewidth=0.7, zorder=5, color=colors[4])
grid_style = dict(alpha=0.3, color=color_grid, linestyle='dotted', zorder=1)
# Defined here so this cell does not depend on the tick style left behind by another plotting cell.
tick_params = dict(length=3, width=1, color=color_line)
xy_label = dict(fontweight='bold', fontsize=9)
suptitle = dict(x=0.12, y=0.62, fontsize=16, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.12, y=0.605, fontsize=10, weight='normal', ha='left', va='bottom', fontname=font_alt)
color_pallete = [colors[5], colors[1]]
# target_labels[0] holds the target values, target_labels[1] the matching display names.
target_labels = [[0, 1], ['Not Sick', 'Sick']]
highlight_textprops = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}, {'weight':'bold', 'color': colors[1]}]
highlight_mean = [{'fontsize':7, 'color': 'black'}, {'fontsize':8, 'weight':'bold', 'color': colors[5]}]
sub_axes = [None] * 2

# --- EDA 2 Dataframe & Figure Settings ---
df_eda2 = df[['target', 'age', 'thalach']]
age_mean = df_eda2.age.mean()
thalach_mean = df_eda2.thalach.mean()
fig = plt.figure(figsize=(10, 16))
gs = fig.add_gridspec(2, 2)
ax = fig.add_subplot(gs[:2, :])
ax.set_aspect(1)

# --- EDA 2: Main Scatter Plot ---
# Dashed reference lines at the mean of each variable.
ax.axvline(x=thalach_mean, linewidth=0.8, linestyle='--', color=colors[5], alpha=0.5)
ax.axhline(y=age_mean, linewidth=0.8, linestyle='--', color=colors[5], alpha=0.5)
# One scatter layer per target value; `target_idx` is used instead of `x`
# so the loop does not overwrite a module-level `x`.
for target_idx, target_value in enumerate(target_labels[0]):
    df_eda2_temp = df_eda2[df_eda2['target']==target_value]
    ax.scatter(df_eda2_temp['thalach'], df_eda2_temp['age'], s=65, color=color_pallete[target_idx], **scatter_style)
# Axis styling does not depend on the target, so it is applied once after the loop.
ax.set_xlabel('\nMaximum Heart Rate', **xy_label)
ax.set_ylabel('Age\n', **xy_label)
ax.grid(axis='y', which='major', **grid_style)
ax.grid(axis='x', which='major', **grid_style)
for spine in ax.spines.values(): spine.set_color('None')
for spine in ['bottom', 'left']:
    ax.spines[spine].set_visible(True)
    ax.spines[spine].set_color(color_line)
plt.tick_params(bottom='on', left='on', **tick_params)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
fig_text(x=0.13, y=0.495, ha='left', s="<Age Mean:>\n<{:.2f}>".format(age_mean), highlight_textprops=highlight_mean)
fig_text(x=0.59, y=0.426, ha='left', s="<Max. Heart Rate Mean:>\n<{:.2f}>".format(thalach_mean), highlight_textprops=highlight_mean)

# --- EDA 2: Sub Plots ---
# One small panel per target value: that group in colour on top of the other group in grey,
# with a dotted first-degree least-squares fit of age on max heart rate for the coloured group.
for idx, trgt in enumerate(target_labels[0]):
    is_current = df_eda2['target']==trgt
    gs_thalach = df_eda2[~is_current]['thalach']
    gs_age = df_eda2[~is_current]['age']
    cs_thalach = df_eda2[is_current]['thalach']
    cs_age = df_eda2[is_current]['age']

    sub_axes[idx] = fig.add_subplot(gs[1, idx], aspect=1)
    sub_axes[idx].scatter(gs_thalach, gs_age, label=trgt, **sub_scatter_style_grey)
    sub_axes[idx].scatter(cs_thalach, cs_age, color=color_pallete[idx], label=trgt, **sub_scatter_style_color)
    m, b = np.polyfit(cs_thalach, cs_age, deg=1)
    sub_axes[idx].plot(cs_thalach, m*cs_thalach+b, linewidth=0.5, color=color_pallete[idx], linestyle='dotted');

    cnt = is_current.sum()
    sub_axes[idx].set_title(f'{target_labels[1][trgt]} Patients - ({cnt})', fontsize=7, style='italic', weight='bold', ha='center')
    sub_axes[idx].set_xticks([])
    sub_axes[idx].set_yticks([])
    for spine in sub_axes[idx].spines.values(): spine.set_color('None')

# --- EDA 2 Titles & WM ---
fig_text(s='Maximum Heart Rate vs. Age based on Patients Sickness', **suptitle)
fig_text(s="Patients who tend to get <heart disease> are <less than 54 years old> and have <max. heart rate over 149>.", highlight_textprops=highlight_textprops, **title)
plt.show();

## <div class="header2">5.3 | Fasting Blood Sugar Distribution by Resting Electrocardiographic Results</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 3 Dataframes ---
# Count patients for every fbs x restecg combination. Re-indexing on the full
# grid makes combinations without patients appear with total 0, instead of
# appending a hard-coded zero row (which would create a duplicate entry if the
# combination were present in the data).
eda3_counts = df[['fbs', 'restecg']].groupby(['fbs', 'restecg']).size()
eda3_grid = pd.MultiIndex.from_product(
    [np.sort(df['fbs'].unique()), np.sort(df['restecg'].unique())],
    names=['fbs', 'restecg'],
)
df_eda3 = eda3_counts.reindex(eda3_grid, fill_value=0).reset_index(name='total')
df_eda3_0 = df_eda3.query('restecg == 0').drop('restecg', axis=1)
df_eda3_1 = df_eda3.query('restecg == 1').drop('restecg', axis=1)
df_eda3_2 = df_eda3.query('restecg == 2').drop('restecg', axis=1)

# --- EDA 3 Variables ---
# One series of totals (ordered by fbs) per resting ECG type 0, 1 and 2.
total_list = [df_eda3_0['total'], df_eda3_1['total'], df_eda3_2['total']]
suptitle = dict(x=0.5, y=0.94, fontsize=14, weight='heavy', ha='center', va='center', fontname=font_main)
exp_text = dict(x=0.5, y=0.17, fontsize=6, weight='normal', ha='center', va='center', textalign='center', fontname=font_alt)
highlight_explanation = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}]
l_120mg = mpatches.Patch(color=colors[5], label='< 120 mg/dl')
m_120mg = mpatches.Patch(color=colors[1], label='> 120 mg/dl')

# --- EDA 3 Functions ---
def display_eda3(subplot_num, restecg_type, total, colors, start_angle):
    """Draw one donut chart in a 1x3 subplot grid of the current figure.

    Parameters
    ----------
    subplot_num : position (1-based) of the panel within the 1x3 grid.
    restecg_type : value shown in the "Type ..." label at the donut's centre.
    total : wedge sizes; their sum is shown as the number of patients.
    colors : wedge colours, passed straight to the pie call.
    start_angle : start angle of the first wedge.
    """
    patient_count = total.sum()
    wedge_style = dict(alpha=0.85, edgecolor='black', linewidth=0.5)
    pct_style = {'fontsize': 7, 'fontname': font_alt}
    # White disc placed over the pie centre, turning it into a donut.
    donut_hole = plt.Circle((0, 0), 0.85, fc='white', edgecolor='black', linewidth=0.5)

    panel = plt.subplot(1, 3, subplot_num)
    plt.tight_layout(rect=[0, 0, 1, 1.01])
    panel.pie(total, colors=colors, autopct='%.2f%%', pctdistance=0.65, startangle=start_angle, wedgeprops=wedge_style, textprops=pct_style)
    panel.text(0, 0.08, f"Type {restecg_type}", weight='bold', ha='center', fontsize=10, fontname=font_main)
    panel.text(0, -0.08, f"{patient_count} patients", ha='center', fontsize=8, fontname=font_alt)
    panel.add_artist(donut_hole)

# --- Display EDA 3 ---
plt.figure(figsize=(9, 4))
legend_style = dict(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2, borderpad=3, frameon=False, fontsize=7, columnspacing=3)
# One donut per resting ECG type; the shared legend is attached to the middle panel.
# NOTE(review): `sample_num` is not defined in this cell; it is used here as the per-panel start angle.
for idx, total in enumerate(total_list):
    display_eda3(idx+1, idx, total, [colors[5], colors[1]], sample_num[idx])
    if idx == 1:
        plt.legend(handles=[l_120mg, m_120mg], **legend_style)
fig_text(s="Fasting Blood Sugar Distribution by Resting Electrocardiographic Results", **suptitle)
fig_text(s="<Resting electrocardiograph type 0 and 1 have higher distribution> compared to type 2.\n<Only type 0 and 1 have patients with fasting blood sugar over 120 mg/dl>, while <type 2 does not>", highlight_textprops=highlight_explanation, **exp_text)
plt.show();

## <div class="header2">5.4 | Number of Major Vessels Distribution based on Exercise Induced Angina</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 4 Dataframes ---
df_eda4 = df[['exang', 'ca']].groupby(['exang', 'ca']).size().reset_index(name='total')
ca_values = np.sort(df_eda4['ca'].unique())

def _eda4_subset(exang):
    """Return the totals per number of major vessels (`ca`) for one `exang` group.

    The `ca` column is kept and the rows are re-indexed on every `ca` value in
    the data (absent ones get total 0). Legend labels can therefore use the
    real vessel count rather than the row position, and both groups have the
    same number of categories for the shared colour list.
    """
    totals = (
        df_eda4[df_eda4['exang'] == exang]
        .set_index('ca')['total']
        .reindex(ca_values, fill_value=0)
    )
    return pd.DataFrame({'ca': ca_values, 'total': totals.to_numpy()})

def _eda4_labels(subset):
    """Build one '<ca> Major Vessels - (<total>)' legend label per row of `subset`."""
    return [f"{ca} Major Vessels - ({total})" for ca, total in zip(subset['ca'], subset['total'])]

df_eda4_0 = _eda4_subset(0)
df_eda4_1 = _eda4_subset(1)

# --- EDA 4 Variables ---
suptitle = dict(x=0.3, y=1.07, fontsize=48, weight='heavy', ha='center', va='center', fontname=font_main)
title = dict(x=0.3, y=1.01, fontsize=30, weight='normal', ha='center', va='bottom', fontname=font_alt)
title_pywaffle = dict(loc='left', fontsize=30, weight='bold', fontname=font_main)
legend_pywaffle = dict(loc='upper center', fontsize=22, ncol=5, borderpad=3, frameon=False, columnspacing=3)

# --- Display EDA 4 ---
# Two stacked waffle charts: patients without (top) and with (bottom) exercise induced angina.
fig = plt.figure(FigureClass=Waffle,
    plots={211: {'values': df_eda4_0['total'],
                 'labels': _eda4_labels(df_eda4_0),
                 'legend': {'bbox_to_anchor': (0.5, 0.05), **legend_pywaffle},
                 'title': {'label': "Don't Have Exercise Induced Angina\n", **title_pywaffle}}
           , 212: {'values': df_eda4_1['total'],
                   'labels': _eda4_labels(df_eda4_1),
                   'legend': {'bbox_to_anchor': (1, 0.05), **legend_pywaffle},
                   'title': {'label': "Have Exercise Induced Angina\n", **title_pywaffle}}
          }, figsize=(50, 20), rows=7, colors=color_pywaffle, rounding_rule='ceil')
# The misspelling "Vessles" in the figure title is corrected here.
fig.suptitle('\nNumber of Major Vessels Distribution based on Exercise Induced Angina', **suptitle)
plt.gcf().text(s='The major vessel distribution proportion in patients with and without exercise-induced angina is almost the same.', **title)
fig.tight_layout()
plt.show();

## <div class="header2">5.5 | Resting Blood Pressure Distribution based on Slope</div>
<!-- Hello world 👋. Thank you so much for downloading/forking my codes/works. If you like my works, please support me by giving upvotes and comments on my Kaggle profile (https://www.kaggle.com/caesarmario/). Thank you so much and have a great day 😆👍. More about me: https://linktr.ee/caesarmario_ -->

In [None]:
# --- EDA 5 Dataframes ---
# assign() returns a new frame, so `slope` is converted to string without
# writing into a slice of `df` (which triggers SettingWithCopyWarning).
df_eda5 = df[['slope', 'trestbps']].assign(slope=lambda frame: frame['slope'].astype(str))

# --- EDA 5 Variables ---
tick_params=dict(length=3, width=1, color=color_line)
xy_label=dict(fontweight='bold', fontsize=7)
slope_list = sorted(df_eda5['slope'].unique())
color_pallete = [colors[5], colors[4], colors[0]]
sub_axes=[None] * 3
suptitle = dict(x=0.125, y=0.925, fontsize=14, weight='heavy', ha='left', va='bottom', fontname=font_main)
title = dict(x=0.125, y=0.9, fontsize=8, weight='normal', ha='left', va='bottom', fontname=font_alt)
qq_plot = dict(fit=True, line='45', markeredgecolor=scatter_color_edge)
highlight_textprops = [{'weight':'bold', 'color': colors[5]}, {'weight':'bold', 'color': colors[1]}]

# --- EDA 5 Settings ---
# The KDE plot takes the top three grid rows; the Q-Q plots go in the fifth row.
fig = plt.figure(figsize=(10, 7))
gs = fig.add_gridspec(6, 3)
ax = fig.add_subplot(gs[:3, :])

# --- EDA 5: Main KDE Plot ---
sns.kdeplot(x='trestbps', hue='slope', data=df_eda5, palette=color_pallete, hue_order=slope_list, bw_adjust=0.4, fill=True, ax=ax)
plt.legend([], [], frameon=False)
plt.grid(axis='x', which='major', alpha=0.75, color=color_line, linestyle='dotted', zorder=1)
plt.grid(axis='y', alpha=0, zorder=2)
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
plt.xlabel('\nResting Blood Pressure (in mm Hg)', **xy_label)
plt.ylabel('Density\n', **xy_label)
plt.tick_params(left='on', bottom='on', **tick_params)
for spine in ax.spines.values(): spine.set_color('None')
for spine in ['bottom', 'left']:
    ax.spines[spine].set_visible(True)
    ax.spines[spine].set_color(color_line)
fig_text(s='Resting Blood Pressure Distribution based on Slope', **suptitle)
fig_text(s='Each <slope type distribution> is <moderately right-skewed>.', highlight_textprops=highlight_textprops, **title)

# --- EDA 5: Sub Q-Q Plot ---
for idx, slp in enumerate(slope_list):
    df_eda5_slope = df_eda5[df_eda5['slope']==slp]
    sub_axes[idx] = fig.add_subplot(gs[4, idx])
    # All patients are drawn first as a backdrop, then the current slope group in its own colour.
    qqplot(df_eda5['trestbps'], ax=sub_axes[idx], markerfacecolor=color_line, alpha=0.4, **qq_plot)
    qqplot(df_eda5_slope['trestbps'], ax=sub_axes[idx], markerfacecolor=color_pallete[idx], alpha=0.5, **qq_plot)
    # Restyle the axes' lines at positions 1 and 3.
    # NOTE(review): presumably these are the two 45-degree reference lines added by qqplot — confirm.
    qq_lines = sub_axes[idx].get_lines()
    for line in [1, 3]:
        qq_lines[line].set_color(colors[5])
        qq_lines[line].set_linewidth(0.8)
        qq_lines[line].set_linestyle('--')
    sub_axes[idx].set_xticks([])
    sub_axes[idx].set_yticks([])
    sub_axes[idx].set_xlabel('')
    sub_axes[idx].set_ylabel('')
    sub_axes[idx].legend([], [], frameon=False)
    sub_axes[idx].set_title(f'Q-Q Plot - Slope {slp}', fontsize=8, style='italic', weight='bold', ha='center')
    for spines in sub_axes[idx].spines.values(): spines.set_color('None')
plt.show();

In [None]:
# --- Separating Features (x) and Target (y) ---
y = df['target']
x = df.drop(columns=['target'])

# --- Splitting Dataset (20% held out for testing, fixed seed) ---
split_options = dict(test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# --- Numerical Pipeline ---
num_column = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
num_pipeline = Pipeline([
    ('scaling', RobustScaler())
])

# --- Categorical Pipeline ---
cat_column = ['cp', 'slope', 'thal']
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; fall back to `sparse` on older releases.
try:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop='first', sparse=False)
cat_pipeline = Pipeline([
    ('onehot', onehot_encoder)
])

# --- Combine Both Pipelines into Transformer ---
# Columns not listed in either group are passed through unchanged.
preprocessor = ColumnTransformer([
    ('categorical', cat_pipeline, cat_column)
    , ('numerical', num_pipeline, num_column)]
    , remainder='passthrough')

# --- Apply Transformer to Pipeline ---
process_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# --- Apply to Dataframe ---
# Fit on the training split only. The test split is transformed with the
# statistics and categories learned from training data, so no test information
# leaks into preprocessing and both splits share the same column layout.
x_train_process = process_pipeline.fit_transform(x_train)
x_test_process = process_pipeline.transform(x_test)

In [None]:
# --- Functions: Model Fitting and Performance Evaluation ---
def _show_only_spines(ax, visible=('bottom', 'left')):
    """Hide all spines of `ax`, then show the `visible` ones in the shared line colour."""
    for spine in ax.spines.values():
        spine.set_color('None')
    for side in visible:
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color(color_line)


def fit_ml_models(algo, algo_param, algo_name):
    """Grid-search one classifier, print its scores and draw a 2x2 evaluation figure.

    The estimator is wrapped in a single-step pipeline whose step is named
    'algo', tuned with a 10-fold GridSearchCV on the module-level
    `x_train_process` / `y_train`, and evaluated on `x_test_process` / `y_test`.
    The figure shows a confusion matrix, the ROC AUC curve, a learning curve and
    either the top-5 feature importances or, when those cannot be drawn for the
    estimator, a precision-recall curve.

    Parameters
    ----------
    algo : estimator to tune.
    algo_param : parameter grid for GridSearchCV; keys address the 'algo' step
        (e.g. "algo__C").
    algo_name : display name used in the printed messages and the figure title.

    Returns
    -------
    tuple
        (train accuracy in percent, test accuracy in percent,
        best cross-validation score rounded to 4 decimals).
    """
    # --- Algorithm Pipeline ---
    algo = Pipeline([('algo', algo)])

    # --- Apply Grid Search ---
    model = GridSearchCV(algo, param_grid=algo_param, cv=10, n_jobs=-1, verbose=1)

    # --- Fitting Model ---
    print(clr.start+f".:. Fitting {algo_name} .:."+clr.end)
    model.fit(x_train_process, y_train)

    # --- Model Best Parameters ---
    best_params = model.best_params_
    print("\n>> Best Parameters: "+clr.start+f"{best_params}"+clr.end)

    # --- Best & Final Estimators ---
    # The fitted classifier is taken from the refit pipeline through its public step name.
    best_estimator = model.best_estimator_.named_steps['algo']
    best_score = round(model.best_score_, 4)
    print(">> Best Score: "+clr.start+"{:.3f}".format(best_score)+clr.end)

    # --- Create Prediction for Train & Test ---
    y_pred_train = model.predict(x_train_process)
    y_pred_test = model.predict(x_test_process)

    # --- Train & Test Accuracy Score ---
    # accuracy_score takes (y_true, y_pred).
    acc_score_train = round(accuracy_score(y_train, y_pred_train)*100, 3)
    acc_score_test = round(accuracy_score(y_test, y_pred_test)*100, 3)
    print("\n"+clr.start+f".:. Train and Test Accuracy Score for {algo_name} .:."+clr.end)
    print("\t>> Train Accuracy: "+clr.start+"{:.2f}%".format(acc_score_train)+clr.end)
    print("\t>> Test Accuracy: "+clr.start+"{:.2f}%".format(acc_score_test)+clr.end)

    # --- Classification Report ---
    print("\n"+clr.start+f".:. Classification Report for {algo_name} .:."+clr.end)
    print(classification_report(y_test, y_pred_test))

    # --- Figures Settings ---
    xy_label = dict(fontweight='bold', fontsize=12)
    grid_style = dict(color=color_grid, linestyle='dotted', zorder=1)
    title_style = dict(fontsize=14, fontweight='bold')
    tick_params = dict(length=3, width=1, color=color_line)
    legend_style = dict(loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2, borderpad=2, frameon=False, fontsize=10)
    set_palette(color_yb)
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 14))

    # --- Confusion Matrix ---
    conf_matrix = ConfusionMatrix(best_estimator, ax=ax1, cmap='Reds')
    conf_matrix.fit(x_train_process, y_train)
    conf_matrix.score(x_test_process, y_test)
    conf_matrix.finalize()
    conf_matrix.ax.set_title('Confusion Matrix\n', **title_style)
    conf_matrix.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    for spine in conf_matrix.ax.spines.values(): spine.set_color(color_line)
    conf_matrix.ax.set_xlabel('\nPredicted Class', **xy_label)
    conf_matrix.ax.set_ylabel('True Class\n', **xy_label)
    conf_matrix.ax.xaxis.set_ticklabels(['False', 'True'], rotation=0)
    conf_matrix.ax.yaxis.set_ticklabels(['True', 'False'])

    # --- ROC AUC ---
    logrocauc = ROCAUC(best_estimator, classes=['False', 'True'], ax=ax2, colors=color_yb)
    logrocauc.fit(x_train_process, y_train)
    logrocauc.score(x_test_process, y_test)
    logrocauc.finalize()
    logrocauc.ax.set_title('ROC AUC Curve\n', **title_style)
    logrocauc.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    logrocauc.ax.grid(axis='both', alpha=0.4, **grid_style)
    _show_only_spines(logrocauc.ax)
    logrocauc.ax.legend(**legend_style)
    logrocauc.ax.set_xlabel('\nFalse Positive Rate', **xy_label)
    logrocauc.ax.set_ylabel('True Positive Rate\n', **xy_label)

    # --- Learning Curve ---
    lcurve = LearningCurve(best_estimator, scoring='f1_weighted', ax=ax3, colors=color_yb)
    lcurve.fit(x_train_process, y_train)
    lcurve.finalize()
    lcurve.ax.set_title('Learning Curve\n', **title_style)
    lcurve.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
    lcurve.ax.grid(axis='both', alpha=0.4, **grid_style)
    _show_only_spines(lcurve.ax)
    lcurve.ax.legend(**legend_style)
    lcurve.ax.set_xlabel('\nTraining Instances', **xy_label)
    lcurve.ax.set_ylabel('Scores\n', **xy_label)

    # --- Feature Importance or Precision Recall Curve ---
    try:
        feat_importance = FeatureImportances(best_estimator, labels=columns_list_onehot, ax=ax4, topn=5, colors=color_yb_importance)
        feat_importance.fit(x_train_process, y_train)
        feat_importance.finalize()
        feat_importance.ax.set_title('Feature Importances (Top 5 Features)\n', **title_style)
        feat_importance.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
        feat_importance.ax.grid(axis='x', alpha=0.4, **grid_style)
        feat_importance.ax.grid(axis='y', alpha=0, **grid_style)
        _show_only_spines(feat_importance.ax, visible=('bottom',))
        feat_importance.ax.set_xlabel('\nRelative Importance', **xy_label)
        feat_importance.ax.set_ylabel('Features\n', **xy_label)
    except Exception:
        # Deliberate best-effort fallback: when feature importances cannot be drawn
        # for this estimator, show a precision-recall curve instead.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        ax4.clear()  # discard anything a partially completed attempt may have drawn
        prec_curve = PrecisionRecallCurve(best_estimator, ax=ax4, ap_score=True, iso_f1_curves=True)
        prec_curve.fit(x_train_process, y_train)
        prec_curve.score(x_test_process, y_test)
        prec_curve.finalize()
        prec_curve.ax.set_title('Precision-Recall Curve\n', **title_style)
        prec_curve.ax.tick_params(axis='both', labelsize=10, bottom='on', left='on', **tick_params)
        _show_only_spines(prec_curve.ax)
        prec_curve.ax.legend(**legend_style)
        prec_curve.ax.set_xlabel('\nRecall', **xy_label)
        prec_curve.ax.set_ylabel('Precision\n', **xy_label)

    plt.suptitle(f'\n{algo_name} Performance Evaluation Report\n', fontsize=18, fontweight='bold')
    plt.tight_layout();

    return acc_score_train, acc_score_test, best_score

In [None]:
# --- Logistic Regression: Hyperparameter Grid ---
parameter_lr = dict(
    algo__solver=["lbfgs", "saga", "newton-cg"],
    algo__C=[0.1, 0.2, 0.5, 0.8],
)

# --- Logistic Regression: Estimator ---
algo_lr = LogisticRegression(penalty="l2", random_state=42, n_jobs=-1)

# --- Logistic Regression: Grid Search & Evaluation ---
scores_lr = fit_ml_models(algo_lr, parameter_lr, "Logistic Regression")
acc_score_train_lr, acc_score_test_lr, best_score_lr = scores_lr

In [None]:
# --- KNN: Hyperparameter Grid ---
parameter_knn = dict(
    algo__n_neighbors=[2, 5, 10, 17],
    algo__leaf_size=[1, 10, 11, 30],
)

# --- KNN: Estimator ---
algo_knn = KNeighborsClassifier(n_jobs=-1)

# --- KNN: Grid Search & Evaluation ---
scores_knn = fit_ml_models(algo_knn, parameter_knn, "K-Nearest Neighbour (KNN)")
acc_score_train_knn, acc_score_test_knn, best_score_knn = scores_knn

In [None]:
# --- SVM: Hyperparameter Grids (one per kernel) ---
parameter_svc = [
    dict(algo__kernel=['rbf'], algo__gamma=np.arange(0.1, 1, 0.1), algo__C=np.arange(0.1, 1, 0.1)),
    dict(algo__kernel=['linear'], algo__C=np.arange(0.1, 1, 0.1)),
    dict(algo__kernel=['poly'], algo__degree=np.arange(1, 10, 1), algo__C=np.arange(0.1, 1, 0.1)),
]

# --- SVM: Estimator ---
algo_svc = SVC(random_state=1, probability=True)

# --- SVM: Grid Search & Evaluation ---
scores_svc = fit_ml_models(algo_svc, parameter_svc, "Support Vector Machine (SVM)")
acc_score_train_svc, acc_score_test_svc, best_score_svc = scores_svc

In [None]:
# --- Gaussian NB: Hyperparameter Grid ---
parameter_gnb = dict(algo__var_smoothing=[1e-2, 1e-3, 1e-4, 1e-6])

# --- Gaussian NB: Estimator ---
algo_gnb = GaussianNB()

# --- Gaussian NB: Grid Search & Evaluation ---
scores_gnb = fit_ml_models(algo_gnb, parameter_gnb, "Gaussian Naive Bayes")
acc_score_train_gnb, acc_score_test_gnb, best_score_gnb = scores_gnb

In [None]:
# --- Decision Tree: Hyperparameter Grid ---
parameter_dt = dict(algo__max_depth=[1, 2, 3])

# --- Decision Tree: Estimator ---
algo_dt = DecisionTreeClassifier(random_state=42)

# --- Decision Tree: Grid Search & Evaluation ---
scores_dt = fit_ml_models(algo_dt, parameter_dt, "Decision Tree")
acc_score_train_dt, acc_score_test_dt, best_score_dt = scores_dt

In [None]:
# --- Random Forest: Hyperparameter Grid ---
parameter_rf = dict(algo__max_depth=np.arange(1, 6, 1))

# --- Random Forest: Estimator ---
algo_rf = RandomForestClassifier(random_state=99, n_jobs=-1)

# --- Random Forest: Grid Search & Evaluation ---
scores_rf = fit_ml_models(algo_rf, parameter_rf, "Random Forest")
acc_score_train_rf, acc_score_test_rf, best_score_rf = scores_rf

In [None]:
# --- Gradient Boosting: Hyperparameter Grid ---
parameter_gb = dict(
    algo__learning_rate=[0.1, 0.3, 0.5],
    algo__n_estimators=[2, 4, 6],
    algo__min_weight_fraction_leaf=[0.1, 0.2, 0.5],
)

# --- Gradient Boosting: Estimator ---
algo_gb = GradientBoostingClassifier(loss="exponential", random_state=2)

# --- Gradient Boosting: Grid Search & Evaluation ---
scores_gb = fit_ml_models(algo_gb, parameter_gb, "Gradient Boosting")
acc_score_train_gb, acc_score_test_gb, best_score_gb = scores_gb

In [None]:
# --- Create Accuracy Comparison Table ---
# One record per model fitted in the cells above: (name, train accuracy, test accuracy, best CV score).
# Keeping each model's scores in one record guarantees that every column has the
# same length; scores of models that are not fitted above are not referenced.
model_scores = [
    ('Logistic Regression', acc_score_train_lr, acc_score_test_lr, best_score_lr),
    ('K-Nearest Neighbour', acc_score_train_knn, acc_score_test_knn, best_score_knn),
    ('Support Vector Machine', acc_score_train_svc, acc_score_test_svc, best_score_svc),
    ('Gaussian NB', acc_score_train_gnb, acc_score_test_gnb, best_score_gnb),
    ('Decision Tree', acc_score_train_dt, acc_score_test_dt, best_score_dt),
    ('Random Forest', acc_score_train_rf, acc_score_test_rf, best_score_rf),
    ('Gradient Boosting', acc_score_train_gb, acc_score_test_gb, best_score_gb),
]
df_compare = pd.DataFrame(model_scores, columns=['Model', 'Accuracy Train', 'Accuracy Test', 'Best Score'])

# --- Create Comparison Table ---
print(clr.start+".:. Models Comparison .:."+clr.end)
print(clr.color+'*' * 26)
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it.
df_compare.sort_values(by='Best Score', ascending=False).style.apply(acc_train_vs_test, axis=1).hide(axis='index')

In [None]:
# --- Complete Pipeline: Preprocessor & RF ---
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor)
    , ('algo', RandomForestClassifier(max_depth=3, random_state=99, n_jobs=-1))
])

# Fit before saving: an unfitted pipeline on disk cannot be used for prediction.
rf_pipeline.fit(x_train, y_train)

# --- Save Complete Pipeline (joblib format, written with both file extensions) ---
file_name = 'pipeline_heart_disease_random_forest'
os.makedirs('pipeline', exist_ok=True)  # joblib.dump does not create missing directories
for ext in ['joblib', 'pkl']:
    joblib.dump(rf_pipeline, f'pipeline/{file_name}.{ext}')

In [None]:
# --- Dataframes to Create Test Output Dataframe ---
rf_pipeline.fit(x_train, y_train)
y_pred_rf = rf_pipeline.predict(x_test)
pred_target = pd.DataFrame(y_pred_rf, columns=['pred_target'])

# Reset both indexes so the three frames line up row by row when concatenated.
x_test_output = x_test.reset_index()
actual_target = y_test.to_frame(name='actual_target').reset_index()

# --- Combining and Creating Test Output Dataframe ---
# Dropping 'index' removes the former row labels contributed by both reset frames.
df_test_output = pd.concat([x_test_output, actual_target, pred_target], axis=1).drop('index', axis=1)

# --- Showing Sample Test Output Dataframe ---
print(clr.start+'.: Sample Test Dataframe :.'+clr.end)
print(clr.color+'*' * 28)
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it.
df_test_output.sample(n=10, random_state=0).style.apply(act_vs_pred, axis=1).hide(axis='index')

In [None]:
# --- Export to CSV and JSON Files ---
output_name = 'test_data_heart_disease'
os.makedirs('test_data', exist_ok=True)  # to_csv / to_json do not create missing directories
df_test_output.to_csv(f'test_data/{output_name}.csv', index=False, sep=',', encoding='utf-8')
df_test_output.to_json(f'test_data/{output_name}.json', orient='index')

In [None]:
# --- Creating Prediction Case Dataframe (50 Rows) ---
# NOTE(review): `create_prediction_case` is not defined in this cell; it must already exist in the kernel.
df_pred_case = create_prediction_case(x_train, 50)

# --- Showing Dataframe ---
print(clr.start+'.: Prediction Case Dataframe :.'+clr.end)
print(clr.color+'*' * 32)
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it.
df_pred_case.sample(n=6, random_state=24).style.background_gradient(cmap='Reds').hide(axis='index')

In [None]:
# --- Creating Prediction using Best Model ---
# Drop a prediction column left by a previous run of this cell, so the cell can
# be re-run without feeding `pred_target` to the model or duplicating the column.
pred_case_features = df_pred_case.drop(columns='pred_target', errors='ignore')
y_pred_case = rf_pipeline.predict(pred_case_features)

# --- Combining Prediction Case Dataframe w/ Prediction ---
# The predictions reuse the features' index so the concat aligns row by row
# whatever index the prediction-case frame carries.
pred_case_target = pd.DataFrame(y_pred_case, columns=['pred_target'], index=pred_case_features.index)
df_pred_case = pd.concat([pred_case_features, pred_case_target], axis=1)

# --- Showing Final Dataframe ---
print(clr.start+'.: Final Prediction Case Dataframe :.'+clr.end)
print(clr.color+'*' * 38)
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it.
df_pred_case.sample(n=6, random_state=24).style.apply(coloring_target_col).hide(axis='index')

In [None]:
# --- Export to CSV and JSON Files ---
pred_output_name = 'pred_case_heart_disease'
os.makedirs('pred_case', exist_ok=True)  # to_csv / to_json do not create missing directories
df_pred_case.to_csv(f'pred_case/{pred_output_name}.csv', index=False, sep=',', encoding='utf-8')
df_pred_case.to_json(f'pred_case/{pred_output_name}.json', orient='index')