In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import BinaryCrossentropy
from imblearn.combine import SMOTETomek

# Calculate entropy
def calculate_entropy(probabilities):
    """Return the mean Shannon entropy (in nats) over the rows of *probabilities*.

    Each row is treated as one probability distribution: the entropy terms
    are summed along axis 1 and the per-row results are then averaged.
    """
    # Tiny offset so that a probability of exactly 0 does not produce log(0).
    epsilon = 1e-10
    log_terms = np.log(probabilities + epsilon)
    row_sums = np.sum(probabilities * log_terms, axis=1)
    return -np.mean(row_sums)

# Compute specificity
def specificity(y_true, y_pred):
    """Return the true-negative rate TN / (TN + FP) for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive).
        y_pred: Predicted labels using the same encoding.

    Returns:
        The fraction of actual negatives predicted as negative, or 0.0 when
        *y_true* contains no negatives and the rate is therefore undefined.
    """
    # Fix the label order explicitly: without it the matrix collapses to 1x1
    # when only one class occurs, and the four-way unpacking below fails.
    tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        # No negative samples: avoid 0/0 (NaN plus a runtime warning).
        return 0.0
    return tn / negatives

# Find the best threshold using Youden's J statistic
def find_best_threshold(y_true, y_pred_probs):
    """Pick the probability cutoff that maximises Youden's J statistic.

    Scans 100 evenly spaced cutoffs in [0.1, 0.9]; at each one the
    probabilities are binarised with ``>=`` and scored as
    recall + specificity - 1.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_probs: Predicted positive-class probabilities.

    Returns:
        The first cutoff achieving the highest J, or 0.5 when no cutoff
        scores strictly above 0.
    """
    chosen_cutoff = 0.5
    top_j = 0
    for cutoff in np.linspace(0.1, 0.9, 100):
        hard_labels = (y_pred_probs >= cutoff).astype(int)
        sensitivity = recall_score(y_true, hard_labels)
        youden_j = sensitivity + specificity(y_true, hard_labels) - 1
        # Strict comparison: ties keep the earlier (lower) cutoff.
        if youden_j > top_j:
            top_j, chosen_cutoff = youden_j, cutoff
    return chosen_cutoff

# Build a deep model with regularization
def create_advanced_model(input_dim):
    """Assemble the (uncompiled) feed-forward binary classifier.

    The network is five hidden stages of Dense -> BatchNormalization ->
    LeakyReLU -> Dropout, followed by a single sigmoid output unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # (units, L2 penalty on the kernel, dropout rate) per hidden stage.
    hidden_stages = (
        (512, 0.01, 0.5),
        (256, 0.01, 0.4),
        (128, 0.01, 0.4),
        (64, 0.005, 0.3),
        (32, 0.005, 0.3),
    )

    layers = [Input(shape=(input_dim,))]
    for units, weight_penalty, drop_rate in hidden_stages:
        layers.append(Dense(units, kernel_regularizer=l2(weight_penalty)))
        layers.append(BatchNormalization())
        layers.append(LeakyReLU())
        layers.append(Dropout(drop_rate))
    layers.append(Dense(1, activation='sigmoid'))

    return Sequential(layers)

# Initialize cross-validation. Shuffled and seeded so the fold assignment is
# reproducible and uses the same settings as the SVM evaluation in this file.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
X, y = np.array(X), np.array(y)

# Seeded generator used to shuffle each fold's resampled training rows.
shuffle_rng = np.random.default_rng(42)

# Per-fold metric values, appended in fold order.
metrics = {
    "accuracy": [], "precision": [], "recall": [], "f1": [], "auc": [], "entropy": [], "conf_matrices": []
}

# Run cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Rebalance the training fold only; the test fold keeps its original
    # class distribution.
    smote = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Standardize using statistics of the (resampled) training data only.
    scaler = StandardScaler()
    X_train_resampled = scaler.fit_transform(X_train_resampled)
    X_test = scaler.transform(X_test)

    # Keras carves `validation_split` from the *tail* of the arrays before it
    # shuffles anything. The resampled arrays are presumably ordered with the
    # synthetic samples last (TODO confirm against imblearn), so without this
    # shuffle the validation slice driving early stopping / LR scheduling
    # would not be representative of the training distribution.
    shuffle_order = shuffle_rng.permutation(len(y_train_resampled))
    X_train_resampled = X_train_resampled[shuffle_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffle_order]

    # Build and compile model
    model = create_advanced_model(input_dim=X_train_resampled.shape[1])
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss=BinaryCrossentropy(label_smoothing=0.1), optimizer=optimizer, metrics=['accuracy'])

    # Stop when validation loss stalls (restoring the best weights) and cut
    # the learning rate on shorter plateaus.
    early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, min_lr=0.0001)

    # The epoch count is only an upper bound; early stopping ends training.
    model.fit(X_train_resampled, y_train_resampled, epochs=7000, batch_size=64, validation_split=0.2,
              callbacks=[early_stopping, reduce_lr], verbose=0)

    # Get predicted probabilities
    y_pred_probs_train = model.predict(X_train_resampled).ravel()
    y_pred_probs_test = model.predict(X_test).ravel()

    # Tune the decision threshold on training data only, then apply it to
    # the held-out fold.
    best_threshold = find_best_threshold(y_train_resampled, y_pred_probs_train)
    y_pred_opt = (y_pred_probs_test >= best_threshold).astype(int)

    # Compute metrics on test set
    acc = accuracy_score(y_test, y_pred_opt)
    precision = precision_score(y_test, y_pred_opt)
    recall = recall_score(y_test, y_pred_opt)
    f1 = f1_score(y_test, y_pred_opt)
    auc = roc_auc_score(y_test, y_pred_probs_test)
    # Stack into two columns [P(class 0), P(class 1)] for the entropy helper.
    entropy = calculate_entropy(np.vstack((1 - y_pred_probs_test, y_pred_probs_test)).T)
    conf_matrix = confusion_matrix(y_test, y_pred_opt)

    # Store metrics
    metrics["accuracy"].append(acc)
    metrics["precision"].append(precision)
    metrics["recall"].append(recall)
    metrics["f1"].append(f1)
    metrics["auc"].append(auc)
    metrics["entropy"].append(entropy)
    metrics["conf_matrices"].append(conf_matrix)

    # Print results for each fold
    print(f'Fold {fold + 1}')
    print(f'Best Threshold (Train): {best_threshold:.2f}')
    print(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, AUC: {auc:.4f}')
    print(f'Entropy: {entropy:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}\n')

# Mean and (population) standard deviation of every scalar metric across folds.
summary_rows = (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-score", "f1"),
    ("AUC", "auc"),
    ("Entropy", "entropy"),
)
for label, key in summary_rows:
    fold_values = metrics[key]
    print(f'Avg {label}: {np.mean(fold_values):.4f} ± {np.std(fold_values):.4f}')

# Mean confusion matrix, rounded to the nearest count (a bare int cast would
# truncate every cell towards zero).
mean_conf_matrix = np.rint(np.mean(metrics["conf_matrices"], axis=0)).astype(int)
print('Mean Confusion Matrix:\n', mean_conf_matrix)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
import numpy as np

# Define entropy function
def calculate_entropy(probabilities):
    """Average per-row Shannon entropy (natural log) of a probability matrix.

    Rows are individual distributions; entropy terms are summed across
    axis 1 and the row entropies are averaged into a single value.
    """
    epsilon = 1e-10  # keeps log() finite when a probability is exactly 0
    weighted_logs = probabilities * np.log(probabilities + epsilon)
    return -np.mean(np.sum(weighted_logs, axis=1))

# Function to find the best threshold for F1-score
def find_best_threshold(y_true, y_prob, thresholds):
    """Search *thresholds* for the cutoff that yields the highest F1-score.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: 2-D array of class probabilities; column 1 is compared
            against each cutoff with ``>=``.
        thresholds: Iterable of candidate cutoffs, tried in order.

    Returns:
        ``(best_threshold, best_f1)``. Falls back to ``(0.5, 0)`` when no
        candidate achieves an F1 strictly greater than 0.
    """
    winning_cutoff, winning_f1 = 0.5, 0
    for cutoff in thresholds:
        hard_labels = (y_prob[:, 1] >= cutoff).astype(int)
        score = f1_score(y_true, hard_labels)
        # Strict comparison: on ties the earliest candidate is kept.
        if score > winning_f1:
            winning_cutoff, winning_f1 = cutoff, score
    return winning_cutoff, winning_f1

# Number of folds
n_splits = 10

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Index folds through plain arrays: `.iloc` only exists on pandas objects, and
# X / y may already have been rebound to NumPy arrays earlier in this file.
X_data, y_data = np.asarray(X), np.asarray(y)

# Candidate decision thresholds; identical for every fold, so built once.
thresholds = np.arange(0.1, 0.9, 0.01)

# Per-fold results, appended in fold order.
best_thresholds = []
f1_list = []
accuracy_list = []
precision_list = []
recall_list = []
roc_auc_list = []
entropy_list = []
confusion_matrices = []

# Loop through the StratifiedKFold splits
for fold_number, (train_index, test_index) in enumerate(skf.split(X_data, y_data), start=1):
    X_train, X_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]

    # Initialize and train the SVM model with probability estimation.
    # NOTE(review): with cv=5 the calibrator below appears to refit its own
    # copies of the estimator, which would make this standalone fit
    # redundant - confirm before removing.
    svm_model = SVC(probability=True, random_state=42)
    svm_model.fit(X_train, y_train)

    # Calibrate the model (isotonic regression, 5-fold internal CV).
    calibrated_model = CalibratedClassifierCV(svm_model, method='isotonic', cv=5)
    calibrated_model.fit(X_train, y_train)

    # Tune the threshold on the training fold only.
    y_prob_train = calibrated_model.predict_proba(X_train)
    best_threshold, _ = find_best_threshold(y_train, y_prob_train, thresholds)
    best_thresholds.append(best_threshold)

    # Get predicted probabilities on test set
    y_prob_test = calibrated_model.predict_proba(X_test)

    # Mean predictive entropy over the held-out samples.
    entropy = calculate_entropy(y_prob_test)
    entropy_list.append(entropy)

    # Make predictions using the best threshold
    y_pred = (y_prob_test[:, 1] >= best_threshold).astype(int)

    # ROC AUC is threshold-free, so it uses the raw positive-class scores.
    roc_auc = roc_auc_score(y_test, y_prob_test[:, 1])
    roc_auc_list.append(roc_auc)

    # Store the threshold-dependent metrics
    f1_list.append(f1_score(y_test, y_pred))
    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))

    print(f'Best Threshold for Fold {fold_number}: {best_threshold}')
    print(f'Confusion Matrix for Fold {fold_number}:\n', confusion_matrices[-1])

# Mean and (population) standard deviation of each metric across folds
mean_accuracy = np.mean(accuracy_list)
std_accuracy = np.std(accuracy_list)
mean_precision = np.mean(precision_list)
std_precision = np.std(precision_list)
mean_recall = np.mean(recall_list)
std_recall = np.std(recall_list)
mean_f1 = np.mean(f1_list)
std_f1 = np.std(f1_list)
mean_roc_auc = np.mean(roc_auc_list)
std_roc_auc = np.std(roc_auc_list)
mean_entropy = np.mean(entropy_list)
std_entropy = np.std(entropy_list)

# Mean confusion matrix, rounded to the nearest count (a bare int cast would
# truncate every cell towards zero).
mean_conf_matrix = np.rint(np.mean(confusion_matrices, axis=0)).astype(int)

# Print the results
print('--- Overall Results ---')
print('Mean Accuracy:', mean_accuracy)
print('Accuracy Std Dev:', std_accuracy)
print('Mean Precision:', mean_precision)
print('Precision Std Dev:', std_precision)
print('Mean Recall:', mean_recall)
print('Recall Std Dev:', std_recall)
print('Mean F1-score:', mean_f1)
print('F1-score Std Dev:', std_f1)
print('Mean ROC AUC:', mean_roc_auc)
print('ROC AUC Std Dev:', std_roc_auc)
print('Mean Entropy:', mean_entropy)
print('Entropy Std Dev:', std_entropy)
print('Mean Confusion Matrix:\n', mean_conf_matrix)