In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, roc_auc_score
)

# Load the dataset from a CSV given by relative path (resolved against the
# kernel's working directory).
df = pd.read_csv("Heart_disease_cleveland_new.csv")

# Columns handed to the IQR outlier helpers and to RobustScaler below.
numeric_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
# Columns one-hot encoded inside evaluate_model via pd.get_dummies.
categorical_cols = ["sex", "cp", "restecg", "slope", "thal"]

In [None]:
def get_outliers(df, cols, whisker=1.5, verbose=True):
    """Return the rows of ``df`` lying outside the IQR fences of any column in ``cols``.

    For each column the fences are ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR``; a row is reported when at least one of its
    values in ``cols`` is strictly below the lower fence or strictly above
    the upper fence. Missing values never compare as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    cols : list of str
        Columns used to compute the fences.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    verbose : bool, default True
        When true, print the outlier rows and their count.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (all columns) flagged as outliers.

    Raises
    ------
    ValueError
        If ``whisker`` is negative.
    """
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    values = df[cols]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - whisker * IQR
    upper = Q3 + whisker * IQR

    # A row is an outlier as soon as any one of its inspected columns is out of range.
    mask = ((values < lower) | (values > upper)).any(axis=1)
    outliers = df[mask]

    if verbose:
        print("\n===== Outliers Detected =====")
        print(outliers)
        print(f"\nTotal Outlier Rows: {outliers.shape[0]}")

    return outliers


# Print the IQR outliers found in the numeric columns and store them in outlier_rows.
outlier_rows = get_outliers(df, numeric_cols)


===== Outliers Detected =====
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
14    52    1   2       172   199    1        0      162      0      0.5   
48    65    0   2       140   417    1        2      157      0      0.8   
83    68    1   2       180   274    1        2      150      1      1.6   
91    62    0   3       160   164    0        2      145      0      6.2   
121   63    0   3       150   407    0        2      154      0      4.0   
123   55    1   3       140   217    0        0      111      1      5.6   
126   56    0   3       200   288    1        2      133      1      4.0   
152   67    0   2       115   564    0        2      160      0      1.6   
172   59    0   3       174   249    0        0      143      1      0.0   
173   62    0   3       140   394    0        2      157      0      1.2   
181   56    0   3       134   409    0        2      150      1      1.9   
183   59    1   0       178   270    0        2      145 

In [None]:
def evaluate_model(df_in, k="all", target="target", categorical=None):
    """Train, tune and score a random forest classifier on ``df_in``.

    Pipeline:
      1. one-hot encode the categorical columns (first level dropped);
      2. hold out 20% of the rows as a test set (fixed seed);
      3. fit an ANOVA ``SelectKBest`` on the training rows only and keep the
         selected columns in both splits;
      4. print the mean 10-fold cross-validation score of the base forest;
      5. grid-search the forest (5-fold, ROC-AUC) on the training rows;
      6. score the best estimator on the held-out rows.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame containing the feature columns and the label column.
    k : int or "all", default "all"
        Number of features kept by ``SelectKBest``.
    target : str, default "target"
        Name of the label column.
    categorical : list of str, optional
        Columns to one-hot encode. Defaults to the notebook-level
        ``categorical_cols`` list.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"ROC-AUC"``
        (floats measured on the held-out rows) and ``"Best_Params"`` (the
        parameter dict chosen by the grid search).
    """
    if categorical is None:
        # Fall back to the notebook-level list of categorical columns.
        categorical = categorical_cols

    # one-hot encoding
    df_enc = pd.get_dummies(df_in, columns=categorical, drop_first=True)

    X = df_enc.drop(target, axis=1)
    y = df_enc[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )

    # feature selection (ANOVA): fitted after the split so the held-out rows
    # never influence which columns are kept.
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    selected_cols = X_train.columns[selector.get_support()]

    X_train = X_train[selected_cols]
    X_test = X_test[selected_cols]

    # base RF model
    rf = RandomForestClassifier(
        random_state=42,
        class_weight="balanced",
        n_estimators=200,
        max_depth=12
    )

    # cross-validation of the untuned model (estimator's default scorer)
    cv_scores = cross_val_score(rf, X_train, y_train, cv=10)
    print("CV Mean:", round(cv_scores.mean(), 4))

    # tuning RF with GridSearch
    params = {
        "n_estimators": [150, 200, 300],
        "max_depth": [10, 12, 14],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", "log2"],
    }

    grid = GridSearchCV(
        rf, params, scoring="roc_auc", cv=5, n_jobs=-1
    )
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    y_pred = best.predict(X_test)
    # Probability of the second class in best.classes_, used for ROC-AUC.
    y_prob = best.predict_proba(X_test)[:, 1]

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        # Cast so every metric in the dict is a plain Python float.
        "ROC-AUC": float(roc_auc_score(y_test, y_prob)),
        "Best_Params": grid.best_params_
    }

In [None]:
## Baseline: evaluate a copy of the unmodified frame as the reference run
baseline_results = evaluate_model(df.copy())
baseline_results

CV Mean: 0.8013


{'Accuracy': 0.9016393442622951,
 'Precision': 0.9333333333333333,
 'Recall': 0.875,
 'ROC-AUC': np.float64(0.9407327586206896),
 'Best_Params': {'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 150}}

In [None]:
## Remove Outliers (IQR)
def remove_outliers_iqr(df, cols):
    """Return ``df`` without the rows that break the 1.5 * IQR fences.

    A row is dropped when any of its values in ``cols`` is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` for that column.
    Rows whose inspected values are missing are kept, because comparisons
    against NaN are false.
    """
    subset = df[cols]

    # Both quartiles in one call; each row of the result is a per-column Series.
    quartiles = subset.quantile([0.25, 0.75])
    q_low = quartiles.loc[0.25]
    q_high = quartiles.loc[0.75]
    spread = q_high - q_low

    too_small = subset.lt(q_low - 1.5 * spread, axis="columns")
    too_large = subset.gt(q_high + 1.5 * spread, axis="columns")

    is_outlier = (too_small | too_large).any(axis=1)
    return df.loc[~is_outlier]


# Drop the IQR outlier rows (numeric columns only), then run the same
# evaluation on the reduced frame.
df_removed = remove_outliers_iqr(df.copy(), numeric_cols)
removed_results = evaluate_model(df_removed)
removed_results

CV Mean: 0.7921


{'Accuracy': 0.8596491228070176,
 'Precision': 0.8,
 'Recall': 0.8695652173913043,
 'ROC-AUC': np.float64(0.9526854219948849),
 'Best_Params': {'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 300}}

In [None]:
# Log Transform: replace three columns with log(1 + x) on a copy of the data,
# then evaluate the model on that copy.
df_log = df.copy()

for col in ("trestbps", "chol", "oldpeak"):
    # Read from the untouched frame; df_log holds the same values at this point.
    df_log[col] = np.log1p(df[col])

log_results = evaluate_model(df_log)
log_results

CV Mean: 0.8057


{'Accuracy': 0.9016393442622951,
 'Precision': 0.9333333333333333,
 'Recall': 0.875,
 'ROC-AUC': np.float64(0.9418103448275862),
 'Best_Params': {'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 150}}

In [None]:
# Robust Scaler: rescale the numeric columns on a copy of the data, then evaluate.
# NOTE(review): the scaler is fit on every row of df, i.e. before evaluate_model
# performs its train/test split, so held-out rows contribute to the scaling
# statistics. Consider fitting on the training rows only — TODO confirm whether
# this matters for the model used here.
df_robust = df.copy()

scaler = RobustScaler()
df_robust[numeric_cols] = scaler.fit_transform(df_robust[numeric_cols])

robust_results = evaluate_model(df_robust)
robust_results

CV Mean: 0.8013


{'Accuracy': 0.9016393442622951,
 'Precision': 0.9333333333333333,
 'Recall': 0.875,
 'ROC-AUC': np.float64(0.9407327586206896),
 'Best_Params': {'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 150}}

In [None]:
# Side-by-side comparison table: one column per experiment; the keys of each
# result dict (metrics and Best_Params) become the rows.
results = pd.DataFrame({
    "Baseline": baseline_results,
    "Remove Outliers": removed_results,
    "Log Transform": log_results,
    "Robust Scaler": robust_results
})

results

Unnamed: 0,Baseline,Remove Outliers,Log Transform,Robust Scaler
Accuracy,0.901639,0.859649,0.901639,0.901639
Precision,0.933333,0.8,0.933333,0.933333
Recall,0.875,0.869565,0.875,0.875
ROC-AUC,0.940733,0.952685,0.94181,0.940733
Best_Params,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'max_depth': 10, 'max_features': 'sqrt', 'min..."
