In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops

# =====================================================
# CONFIG
# =====================================================

DATASET = "HAM10000"   # "HAM10000" or "ISIC2019"
CLEAN_DATA_ROOT = r"C:\Users\umair\Videos\PhD\PhD Data\Week 8 Jannuary\Code\CleanData"

# =====================================================
# DATASET PATHS & CLASS MAPS
# =====================================================

if DATASET == "HAM10000":
    IMAGE_DIR = os.path.join(CLEAN_DATA_ROOT, "HAM10000", "images")
    LABEL_CSV = os.path.join(CLEAN_DATA_ROOT, "HAM10000", "HAM10000_metadata")

    CLASS_MAP = {
        "akiec": 0,
        "bcc": 1,
        "bkl": 2,
        "df": 3,
        "mel": 4,
        "nv": 5,
        "vasc": 6
    }

    LBP_OUT = "ham10000_lbp_multiclass.csv"
    GLCM_OUT = "ham10000_glcm_multiclass.csv"

elif DATASET == "ISIC2019":
    IMAGE_DIR = os.path.join(CLEAN_DATA_ROOT, "ISIC2019", "images_train")
    LABEL_CSV = os.path.join(CLEAN_DATA_ROOT, "ISIC2019", "ISIC_2019_Training_GroundTruth.csv")

    CLASS_MAP = {
        "AK": 0,
        "BCC": 1,
        "BKL": 2,
        "DF": 3,
        "MEL": 4,
        "NV": 5,
        "SCC": 6,
        "VASC": 7
    }

    LBP_OUT = "isic2019_lbp_multiclass.csv"
    GLCM_OUT = "isic2019_glcm_multiclass.csv"

else:
    raise ValueError("DATASET must be 'HAM10000' or 'ISIC2019'")

# =====================================================
# LBP CONFIG
# =====================================================

LBP_RADIUS = 2
LBP_POINTS = 8 * LBP_RADIUS
LBP_METHOD = "uniform"

# =====================================================
# GLCM CONFIG
# =====================================================

GLCM_DISTANCES = [1, 2]
GLCM_ANGLES = [0, np.pi/4, np.pi/2, 3*np.pi/4]
GLCM_PROPS = ["contrast", "dissimilarity", "homogeneity", "energy", "correlation"]

# =====================================================
# FEATURE FUNCTIONS
# =====================================================

def extract_lbp(gray):
    lbp = local_binary_pattern(gray, LBP_POINTS, LBP_RADIUS, method=LBP_METHOD)
    hist, _ = np.histogram(
        lbp.ravel(),
        bins=np.arange(0, LBP_POINTS + 3),
        range=(0, LBP_POINTS + 2),
        density=True
    )
    return hist.astype(np.float32)

def extract_glcm(gray):
    glcm = graycomatrix(
        gray,
        distances=GLCM_DISTANCES,
        angles=GLCM_ANGLES,
        levels=256,
        symmetric=True,
        normed=True
    )

    feats = []
    for prop in GLCM_PROPS:
        feats.extend(graycoprops(glcm, prop).ravel())

    return np.array(feats, dtype=np.float32)

# =====================================================
# LOAD LABELS
# =====================================================

labels_df = pd.read_csv(LABEL_CSV)

# =====================================================
# FEATURE EXTRACTION
# =====================================================

lbp_rows = []
glcm_rows = []

for _, row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Extracting features"):

    # ---------- LABEL ----------
    if DATASET == "HAM10000":
        image_id = row["image_id"]
        label = CLASS_MAP[row["dx"]]
    else:
        image_id = row["image"]
        label_name = max(CLASS_MAP, key=lambda c: row[c])
        label = CLASS_MAP[label_name]

    # ---------- IMAGE ----------
    img_path = os.path.join(IMAGE_DIR, image_id + ".jpg")
    if not os.path.exists(img_path):
        continue

    img = cv2.imread(img_path)
    if img is None:
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # ---------- FEATURES ----------
    lbp_feat = extract_lbp(gray)
    glcm_feat = extract_glcm(gray)

    lbp_rows.append(np.concatenate([lbp_feat, [label]]))
    glcm_rows.append(np.concatenate([glcm_feat, [label]]))

# =====================================================
# SAVE LBP CSV
# =====================================================

lbp_feature_names = [f"lbp_{i}" for i in range(len(lbp_feat))] + ["label"]
lbp_df = pd.DataFrame(lbp_rows, columns=lbp_feature_names)
lbp_df.to_csv(LBP_OUT, index=False)

# =====================================================
# SAVE GLCM CSV
# =====================================================

glcm_feature_names = [f"glcm_{i}" for i in range(len(glcm_feat))] + ["label"]
glcm_df = pd.DataFrame(glcm_rows, columns=glcm_feature_names)
glcm_df.to_csv(GLCM_OUT, index=False)

# =====================================================
# DONE
# =====================================================

print("\nFeature extraction completed successfully.")
print(f"LBP  file saved: {LBP_OUT}  | Shape: {lbp_df.shape}")
print(f"GLCM file saved: {GLCM_OUT} | Shape: {glcm_df.shape}")


Extracting features: 100%|██████████| 10015/10015 [40:27<00:00,  4.13it/s]



Feature extraction completed successfully.
LBP  file saved: ham10000_lbp_multiclass.csv  | Shape: (10015, 19)
GLCM file saved: ham10000_glcm_multiclass.csv | Shape: (10015, 41)


In [3]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops

# =====================================================
# CONFIG
# =====================================================

DATASET = "ISIC2019"   # "HAM10000" or "ISIC2019"
CLEAN_DATA_ROOT = r"C:\Users\umair\Videos\PhD\PhD Data\Week 8 Jannuary\Code\CleanData"

# =====================================================
# DATASET PATHS & CLASS MAPS
# =====================================================

if DATASET == "HAM10000":
    IMAGE_DIR = os.path.join(CLEAN_DATA_ROOT, "HAM10000", "images")
    LABEL_CSV = os.path.join(CLEAN_DATA_ROOT, "HAM10000", "HAM10000_metadata")

    CLASS_MAP = {
        "akiec": 0,
        "bcc": 1,
        "bkl": 2,
        "df": 3,
        "mel": 4,
        "nv": 5,
        "vasc": 6
    }

    LBP_OUT = "ham10000_lbp_multiclass.csv"
    GLCM_OUT = "ham10000_glcm_multiclass.csv"

elif DATASET == "ISIC2019":
    IMAGE_DIR = os.path.join(CLEAN_DATA_ROOT, "ISIC2019", "images_train")
    LABEL_CSV = os.path.join(CLEAN_DATA_ROOT, "ISIC2019", "ISIC_2019_Training_GroundTruth.csv")

    CLASS_MAP = {
        "AK": 0,
        "BCC": 1,
        "BKL": 2,
        "DF": 3,
        "MEL": 4,
        "NV": 5,
        "SCC": 6,
        "VASC": 7
    }

    LBP_OUT = "isic2019_lbp_multiclass.csv"
    GLCM_OUT = "isic2019_glcm_multiclass.csv"

else:
    raise ValueError("DATASET must be 'HAM10000' or 'ISIC2019'")

# =====================================================
# LBP CONFIG
# =====================================================

LBP_RADIUS = 2
LBP_POINTS = 8 * LBP_RADIUS
LBP_METHOD = "uniform"

# =====================================================
# GLCM CONFIG
# =====================================================

GLCM_DISTANCES = [1, 2]
GLCM_ANGLES = [0, np.pi/4, np.pi/2, 3*np.pi/4]
GLCM_PROPS = ["contrast", "dissimilarity", "homogeneity", "energy", "correlation"]

# =====================================================
# FEATURE FUNCTIONS
# =====================================================

def extract_lbp(gray):
    lbp = local_binary_pattern(gray, LBP_POINTS, LBP_RADIUS, method=LBP_METHOD)
    hist, _ = np.histogram(
        lbp.ravel(),
        bins=np.arange(0, LBP_POINTS + 3),
        range=(0, LBP_POINTS + 2),
        density=True
    )
    return hist.astype(np.float32)

def extract_glcm(gray):
    glcm = graycomatrix(
        gray,
        distances=GLCM_DISTANCES,
        angles=GLCM_ANGLES,
        levels=256,
        symmetric=True,
        normed=True
    )

    feats = []
    for prop in GLCM_PROPS:
        feats.extend(graycoprops(glcm, prop).ravel())

    return np.array(feats, dtype=np.float32)

# =====================================================
# LOAD LABELS
# =====================================================

labels_df = pd.read_csv(LABEL_CSV)

# =====================================================
# FEATURE EXTRACTION
# =====================================================

lbp_rows = []
glcm_rows = []

for _, row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Extracting features"):

    # ---------- LABEL ----------
    if DATASET == "HAM10000":
        image_id = row["image_id"]
        label = CLASS_MAP[row["dx"]]
    else:
        image_id = row["image"]
        label_name = max(CLASS_MAP, key=lambda c: row[c])
        label = CLASS_MAP[label_name]

    # ---------- IMAGE ----------
    img_path = os.path.join(IMAGE_DIR, image_id + ".jpg")
    if not os.path.exists(img_path):
        continue

    img = cv2.imread(img_path)
    if img is None:
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # ---------- FEATURES ----------
    lbp_feat = extract_lbp(gray)
    glcm_feat = extract_glcm(gray)

    lbp_rows.append(np.concatenate([lbp_feat, [label]]))
    glcm_rows.append(np.concatenate([glcm_feat, [label]]))

# =====================================================
# SAVE LBP CSV
# =====================================================

lbp_feature_names = [f"lbp_{i}" for i in range(len(lbp_feat))] + ["label"]
lbp_df = pd.DataFrame(lbp_rows, columns=lbp_feature_names)
lbp_df.to_csv(LBP_OUT, index=False)

# =====================================================
# SAVE GLCM CSV
# =====================================================

glcm_feature_names = [f"glcm_{i}" for i in range(len(glcm_feat))] + ["label"]
glcm_df = pd.DataFrame(glcm_rows, columns=glcm_feature_names)
glcm_df.to_csv(GLCM_OUT, index=False)

# =====================================================
# DONE
# =====================================================

print("\nFeature extraction completed successfully.")
print(f"LBP  file saved: {LBP_OUT}  | Shape: {lbp_df.shape}")
print(f"GLCM file saved: {GLCM_OUT} | Shape: {glcm_df.shape}")


Extracting features: 100%|██████████| 25331/25331 [2:17:26<00:00,  3.07it/s]  



Feature extraction completed successfully.
LBP  file saved: isic2019_lbp_multiclass.csv  | Shape: (25331, 19)
GLCM file saved: isic2019_glcm_multiclass.csv | Shape: (25331, 41)


In [30]:
import pandas as pd
import numpy as np

# =========================
# CONFIG
# =========================

INPUT_CSV = "isic2019_lbp_multiclass.csv"
OUTPUT_CSV = "isic2019_lbp_multiclass_clean_norm.csv"

LABEL_COL = "label"


In [31]:
# Load data
df = pd.read_csv(INPUT_CSV)

print("Initial shape:", df.shape)
df.head()


Initial shape: (25331, 19)


Unnamed: 0,lbp_0,lbp_1,lbp_2,lbp_3,lbp_4,lbp_5,lbp_6,lbp_7,lbp_8,lbp_9,lbp_10,lbp_11,lbp_12,lbp_13,lbp_14,lbp_15,lbp_16,lbp_17,label
0,0.027909,0.016052,0.021639,0.023069,0.020991,0.033258,0.043783,0.057602,0.055127,0.094185,0.045787,0.080855,0.044272,0.040362,0.032825,0.022049,0.165044,0.17519,5.0
1,0.022377,0.013428,0.022639,0.02095,0.015851,0.027826,0.038123,0.048734,0.044876,0.102694,0.040068,0.09161,0.046321,0.043776,0.037458,0.023365,0.193819,0.166085,5.0
2,0.029187,0.017542,0.023102,0.027211,0.027765,0.04067,0.055784,0.079202,0.078902,0.097269,0.065076,0.069252,0.046294,0.040628,0.031325,0.019006,0.075484,0.176301,4.0
3,0.022043,0.013344,0.018555,0.018645,0.016559,0.028292,0.040338,0.052436,0.053868,0.103538,0.039573,0.088584,0.040079,0.037639,0.032074,0.021151,0.225758,0.147524,5.0
4,0.016508,0.009536,0.012155,0.01356,0.013298,0.020676,0.029706,0.052718,0.073202,0.074809,0.032827,0.04324,0.022023,0.020369,0.016868,0.01137,0.431523,0.105613,4.0


In [32]:
feature_cols = [c for c in df.columns if c != LABEL_COL]

print(f"Number of features: {len(feature_cols)}")
print("Label distribution:")
df[LABEL_COL].value_counts()


Number of features: 18
Label distribution:


label
5.0    12875
4.0     4522
1.0     3323
2.0     2624
0.0      867
6.0      628
7.0      253
3.0      239
Name: count, dtype: int64

In [33]:
# Replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Drop rows containing NaN
df = df.dropna(axis=0)

print("After removing NaN / Inf rows:", df.shape)


After removing NaN / Inf rows: (25331, 19)


In [34]:
constant_features = [c for c in feature_cols if df[c].nunique() <= 1]

print(f"Constant features detected: {len(constant_features)}")

df = df.drop(columns=constant_features)

# Update feature list
feature_cols = [c for c in feature_cols if c not in constant_features]

print("Remaining features:", len(feature_cols))


Constant features detected: 0
Remaining features: 18


In [35]:
def clip_outliers_iqr(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return series.clip(lower, upper)

# Apply per feature
for col in feature_cols:
    df[col] = clip_outliers_iqr(df[col])

print("Outliers clipped feature-wise using IQR")


Outliers clipped feature-wise using IQR


In [36]:
def min_max_normalize(series):
    min_val = series.min()
    max_val = series.max()
    if max_val > min_val:
        return (series - min_val) / (max_val - min_val)
    else:
        return 0.0

# Normalize each feature independently
for col in feature_cols:
    df[col] = min_max_normalize(df[col])

print("Feature-wise Min–Max normalization applied")


Feature-wise Min–Max normalization applied


In [37]:
# Check range
print("Feature value ranges (should be 0–1):")
df[feature_cols].describe().loc[["min", "max"]]


Feature value ranges (should be 0–1):


Unnamed: 0,lbp_0,lbp_1,lbp_2,lbp_3,lbp_4,lbp_5,lbp_6,lbp_7,lbp_8,lbp_9,lbp_10,lbp_11,lbp_12,lbp_13,lbp_14,lbp_15,lbp_16,lbp_17
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [38]:
df.to_csv(OUTPUT_CSV, index=False)

print("Saved cleaned & normalized dataset:")
print(OUTPUT_CSV)
print("Final shape:", df.shape)


Saved cleaned & normalized dataset:
isic2019_lbp_multiclass_clean_norm.csv
Final shape: (25331, 19)


**With Folds**

In [None]:
import os
import sys
import time
import json
import datetime
import pandas as pd
import numpy as np
import traceback

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# =====================================================
# PATH SETUP
# =====================================================


PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", "Code", "scikit-ExSTraCS-master"))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from skExSTraCS.ExSTraCS import ExSTraCS


# =====================================================
# METRICS (binary-safe, multiclass-ready later)
# =====================================================

def compute_metrics(y_true, y_pred):
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    return bal_acc, cm

# =====================================================
# CROSS-VALIDATION CORE
# =====================================================

def run_cv(csv_path, dataset_name, feature_family, param_grid,
           n_splits=5, out_dir="lcs"):

    print(f"\n=== {dataset_name} | {feature_family} ===")

    csv_path = os.path.join(PROJECT_ROOT, csv_path)
    data = pd.read_csv(csv_path)

    feature_cols = [c for c in data.columns if c not in ("image", "label")]
    X = data[feature_cols].values.astype(float)
    y = data["label"].values.astype(int)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    all_results = []
    per_fold_records = []

    for params in param_grid:
        print("Params:", params)
        fold_scores = []

        for fold, (tr_idx, te_idx) in enumerate(skf.split(X, y), 1):
            seed = 42 + fold

            # =========================
            # FOLD-WISE NORMALIZATION
            # =========================
            X_tr_raw, X_te_raw = X[tr_idx], X[te_idx]
            y_tr, y_te = y[tr_idx], y[te_idx]

            scaler = StandardScaler()
            X_tr = scaler.fit_transform(X_tr_raw)
            X_te = scaler.transform(X_te_raw)

            # =========================
            # MODEL SETUP
            # =========================
            model = ExSTraCS()
            model.N = params.get("N", 2000)
            model.learningIterations = params.get("learningIterations", 100000)
            model.theta_sel = params.get("theta_sel", 0.8)

            # Minority & specificity bias (as before)
            model.nu = params.get("nu", 3.0)
            model.p_spec = params.get("p_spec", 0.4)
            model.theta_GA = params.get("theta_GA", 15)
            model.chi = params.get("chi", 0.8)
            model.mu = params.get("mu", 0.04)

            model.doSubsumption = True
            model.useBalancedAccuracy = True
            model.randomSeed = seed

            print(
                f"  Fold {fold} | seed={seed} "
                f"N={model.N} iters={model.learningIterations}"
            )

            start = time.time()
            fit_exception = None

            try:
                model.fit(X_tr, y_tr)
                y_pred = model.predict(X_te)
                bal_acc, cm = compute_metrics(y_te, y_pred)
            except Exception:
                fit_exception = traceback.format_exc()
                bal_acc = None
                cm = None

            duration = time.time() - start

            try:
                pop_size = len(model.population.popSet)
            except Exception:
                pop_size = None

            print(
                f"    BA={bal_acc} | time={duration:.1f}s | rules={pop_size}"
            )

            per_fold_records.append({
                "dataset": dataset_name,
                "feature_family": feature_family,
                "params": params,
                "fold": fold,
                "balanced_accuracy": bal_acc,
                "confusion_matrix": cm.tolist() if cm is not None else None,
                "duration_seconds": round(duration, 3),
                "rule_population": pop_size,
                "fit_exception": fit_exception
            })

            if bal_acc is not None:
                fold_scores.append(bal_acc)

        all_results.append({
            "dataset": dataset_name,
            "feature_family": feature_family,
            "params": params,
            "mean_bal_acc": float(np.mean(fold_scores)) if fold_scores else None,
            "std_bal_acc": float(np.std(fold_scores)) if fold_scores else None,
            "timestamp": datetime.datetime.now().isoformat()
        })

    # =========================
    # SAVE RESULTS
    # =========================

    os.makedirs(out_dir, exist_ok=True)
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    folds_path = os.path.join(
        out_dir,
        f"exstracs_{dataset_name}_{feature_family}_folds_{ts}.jsonl"
    )
    summary_path = os.path.join(
        out_dir,
        f"exstracs_{dataset_name}_{feature_family}_summary_{ts}.json"
    )

    with open(folds_path, "w") as fh:
        for rec in per_fold_records:
            fh.write(json.dumps(rec) + "\n")

    with open(summary_path, "w") as fh:
        json.dump(all_results, fh, indent=2)

    print(f"Saved folds   -> {folds_path}")
    print(f"Saved summary -> {summary_path}")

    return pd.DataFrame(all_results)

# =====================================================
# MAIN: SEQUENTIAL EXPERIMENTS
# =====================================================

if __name__ == "__main__":

    param_grid = [
        {"N": 1500, "learningIterations": 100000, "theta_sel": 0.8},
        {"N": 2000, "learningIterations": 120000, "theta_sel": 0.8},
        {"N": 2000, "learningIterations": 120000, "theta_sel": 0.9},
    ]

    experiments = [
        # HAM10000
        ("ham10000_lbp_multiclass.csv",  "HAM10000", "LBP"),
        ("ham10000_glcm_multiclass.csv", "HAM10000", "GLCM"),

        # ISIC2019
        ("isic2019_lbp_multiclass.csv",  "ISIC2019", "LBP"),
        ("isic2019_glcm_multiclass.csv", "ISIC2019", "GLCM"),
    ]

    out_dir = os.path.join(PROJECT_ROOT, "lcs")

    all_runs = []

    for csv_path, dataset, feature_family in experiments:
        df = run_cv(
            csv_path=csv_path,
            dataset_name=dataset,
            feature_family=feature_family,
            param_grid=param_grid,
            n_splits=5,
            out_dir=out_dir
        )
        all_runs.append(df)

    final_df = pd.concat(all_runs, ignore_index=True)

    final_out = os.path.join(
        out_dir,
        f"exstracs_feature_family_comparison_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    )

    final_df.to_csv(final_out, index=False)
    print(f"\nALL DONE. Final comparison CSV saved to:\n{final_out}")


**Without Folds**

In [None]:
import os
import sys
import time
import json
import datetime
import pandas as pd
import numpy as np
import traceback

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# =====================================================
# PATH SETUP
# =====================================================

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", "Code", "scikit-ExSTraCS-master"))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from skExSTraCS.ExSTraCS import ExSTraCS

# =====================================================
# METRICS
# =====================================================

def compute_metrics(y_true, y_pred):
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    return bal_acc, cm

# =====================================================
# CROSS-VALIDATION CORE
# =====================================================

def run_cv(csv_path, dataset_name, feature_family, param_grid,
           n_splits=5, out_dir="lcs"):

    print(f"\n=== {dataset_name} | {feature_family} ===")

    csv_path = os.path.join(PROJECT_ROOT, csv_path)
    data = pd.read_csv(csv_path)

    feature_cols = [c for c in data.columns if c not in ("image", "label")]
    X = data[feature_cols].values.astype(float)
    y = data["label"].values.astype(int)

    # -------------------------
    # SAFETY CHECK (OPTIONAL)
    # -------------------------
    if np.nanmin(X) < 0.0 or np.nanmax(X) > 1.0:
        print("⚠ WARNING: Features not strictly in [0,1]. "
              "Ensure preprocessing was applied correctly.")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    all_results = []
    per_fold_records = []

    for params in param_grid:
        print("Params:", params)
        fold_scores = []

        for fold, (tr_idx, te_idx) in enumerate(skf.split(X, y), 1):
            seed = 42 + fold

            # =========================
            # NO NORMALIZATION HERE
            # =========================
            X_tr = X[tr_idx]
            X_te = X[te_idx]
            y_tr, y_te = y[tr_idx], y[te_idx]

            # =========================
            # MODEL SETUP
            # =========================
            model = ExSTraCS()
            model.N = params.get("N", 2000)
            model.learningIterations = params.get("learningIterations", 100000)
            model.theta_sel = params.get("theta_sel", 0.8)

            # Minority & specificity bias
            model.nu = params.get("nu", 3.0)
            model.p_spec = params.get("p_spec", 0.4)
            model.theta_GA = params.get("theta_GA", 15)
            model.chi = params.get("chi", 0.8)
            model.mu = params.get("mu", 0.04)

            model.doSubsumption = True
            model.useBalancedAccuracy = True
            model.randomSeed = seed

            print(
                f"  Fold {fold} | seed={seed} "
                f"N={model.N} iters={model.learningIterations}"
            )

            start = time.time()
            fit_exception = None

            try:
                model.fit(X_tr, y_tr)
                y_pred = model.predict(X_te)
                bal_acc, cm = compute_metrics(y_te, y_pred)
            except Exception:
                fit_exception = traceback.format_exc()
                bal_acc = None
                cm = None

            duration = time.time() - start

            try:
                pop_size = len(model.population.popSet)
            except Exception:
                pop_size = None

            print(
                f"    BA={bal_acc} | time={duration:.1f}s | rules={pop_size}"
            )

            per_fold_records.append({
                "dataset": dataset_name,
                "feature_family": feature_family,
                "params": params,
                "fold": fold,
                "balanced_accuracy": bal_acc,
                "confusion_matrix": cm.tolist() if cm is not None else None,
                "duration_seconds": round(duration, 3),
                "rule_population": pop_size,
                "fit_exception": fit_exception
            })

            if bal_acc is not None:
                fold_scores.append(bal_acc)

        all_results.append({
            "dataset": dataset_name,
            "feature_family": feature_family,
            "params": params,
            "mean_bal_acc": float(np.mean(fold_scores)) if fold_scores else None,
            "std_bal_acc": float(np.std(fold_scores)) if fold_scores else None,
            "timestamp": datetime.datetime.now().isoformat()
        })

    # =========================
    # SAVE RESULTS
    # =========================

    os.makedirs(out_dir, exist_ok=True)
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    folds_path = os.path.join(
        out_dir,
        f"exstracs_{dataset_name}_{feature_family}_folds_{ts}.jsonl"
    )
    summary_path = os.path.join(
        out_dir,
        f"exstracs_{dataset_name}_{feature_family}_summary_{ts}.json"
    )

    with open(folds_path, "w") as fh:
        for rec in per_fold_records:
            fh.write(json.dumps(rec) + "\n")

    with open(summary_path, "w") as fh:
        json.dump(all_results, fh, indent=2)

    print(f"Saved folds   -> {folds_path}")
    print(f"Saved summary -> {summary_path}")

    return pd.DataFrame(all_results)

# =====================================================
# MAIN
# =====================================================

if __name__ == "__main__":

    param_grid = [
        {"N": 1500, "learningIterations": 100000, "theta_sel": 0.8},
        {"N": 2000, "learningIterations": 120000, "theta_sel": 0.8},
        {"N": 2000, "learningIterations": 120000, "theta_sel": 0.9},
    ]

    experiments = [
        ("csv_outputs/ham10000_lbp_multiclass_clean_norm.csv",  "HAM10000", "LBP"),
        ("csv_outputs/ham10000_glcm_multiclass_clean_norm.csv", "HAM10000", "GLCM"),
        ("csv_outputs/isic2019_lbp_multiclass_clean_norm.csv",  "ISIC2019", "LBP"),
        ("csv_outputs/isic2019_glcm_multiclass_clean_norm.csv", "ISIC2019", "GLCM"),
    ]

    out_dir = os.path.join(PROJECT_ROOT, "lcs")

    all_runs = []

    for csv_path, dataset, feature_family in experiments:
        df = run_cv(
            csv_path=csv_path,
            dataset_name=dataset,
            feature_family=feature_family,
            param_grid=param_grid,
            n_splits=5,
            out_dir=out_dir
        )
        all_runs.append(df)

    final_df = pd.concat(all_runs, ignore_index=True)

    final_out = os.path.join(
        out_dir,
        f"exstracs_feature_family_comparison_cleanNorm_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    )

    final_df.to_csv(final_out, index=False)
    print(f"\nALL DONE. Final comparison CSV saved to:\n{final_out}")
