In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import (
    roc_curve, auc,
    precision_recall_curve, average_precision_score,
    confusion_matrix, f1_score, roc_auc_score
)
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer
import glob

2025-05-07 14:14:00.470487: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-07 14:14:00.470530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-07 14:14:00.492825: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-07 14:14:00.557226: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Output Directory Creation

In [2]:
# Create the top-level output folders up front; exist_ok=True makes this cell safe to re-run.
# "tp_robust_viz" receives the saved figures, "tp_robust_log" the per-run training CSV logs.
os.makedirs("tp_robust_viz", exist_ok=True)
os.makedirs("tp_robust_log", exist_ok=True)

Plotting Functions

In [28]:
def plot_avg_history(epochs, avg_metrics, std_metrics, noise_vals, index=None):
    """Plot mean loss/accuracy curves with +/- one-std bands and save them as a PNG.

    The figure has two panels (loss, accuracy), each showing the training and
    validation curve. It is written to
    ``tp_robust_viz/<noise_vals[index]>/history_curves.png`` and then closed.

    Args:
        epochs: x-axis values, one per epoch.
        avg_metrics: mapping with keys 'loss', 'val_loss', 'accuracy' and
            'val_accuracy'; each value is a per-epoch sequence of means.
        std_metrics: mapping with the same keys holding the per-epoch spread.
        noise_vals: sequence of labels; the selected entry names the output folder.
        index: position in ``noise_vals`` to use. When omitted, the notebook-level
            loop variable ``idx`` is used (legacy behaviour).

    Raises:
        NameError: if ``index`` is omitted and no global ``idx`` exists.
    """
    # Legacy call sites do not pass `index` and rely on a global loop variable `idx`.
    position = idx if index is None else index

    # Resolve the target folder from the argument (not from a global list) before
    # any figure is created, so a bad index cannot leave an open figure behind.
    save_dir = f"tp_robust_viz/{noise_vals[position]}"
    os.makedirs(save_dir, exist_ok=True)

    fig = plt.figure(figsize=(12, 6))

    # (subplot slot, training key, validation key, display name)
    panels = (
        (1, 'loss', 'val_loss', "Loss"),
        (2, 'accuracy', 'val_accuracy', "Accuracy"),
    )
    for slot, train_key, val_key, name in panels:
        plt.subplot(1, 2, slot)
        for key, label in ((train_key, f"Train {name}"), (val_key, f"Val {name}")):
            center = np.array(avg_metrics[key])
            spread = np.array(std_metrics[key])
            plt.plot(epochs, center, label=label)
            plt.fill_between(epochs, center - spread, center + spread, alpha=0.2)
        plt.title(f"{name} over Epochs")
        plt.xlabel("Epoch")
        plt.ylabel(name)
        plt.legend()

    # Save plot
    plt.savefig(f"{save_dir}/history_curves.png")
    # Close this figure explicitly rather than "whatever is current".
    plt.close(fig)

In [33]:
def plot_mean_se_with_baseline(mean_se_values, x_values, x_label, y_label, filename,
                                baseline, baseline_label="Baseline"):
    """Plot per-setting means with standard-error bars, plus a trailing baseline point.

    The baseline is appended as the last x position. The figure is saved to
    ``tp_robust_viz/<filename>.png`` and then closed.

    Args:
        mean_se_values: sequence of ``(mean, standard_error)`` pairs, one per
            entry of ``x_values``.
        x_values: tick labels for the non-baseline points.
        x_label: x-axis label; also used in the title.
        y_label: y-axis label; also used in the title.
        filename: output file name without the ``.png`` extension.
        baseline: ``(mean, standard_error)`` pair for the baseline point.
        baseline_label: tick label for the baseline point.

    Raises:
        ValueError: if ``mean_se_values`` and ``x_values`` differ in length.
    """
    save_dir = "tp_robust_viz"
    os.makedirs(save_dir, exist_ok=True)

    baseline_mean, baseline_se = baseline
    # The baseline is plotted as one extra point after the regular settings.
    means = np.array([m for m, _ in mean_se_values] + [baseline_mean])
    ses = np.array([se for _, se in mean_se_values] + [baseline_se])
    x_all = list(x_values) + [baseline_label]

    if len(means) != len(x_all):
        raise ValueError(
            f"got {len(means) - 1} (mean, se) pairs for {len(x_all) - 1} x values"
        )

    x_ticks = np.arange(len(x_all))

    fig = plt.figure(figsize=(10, 5))
    # fmt='o-' already selects the 'o' marker; passing marker='o' as well makes
    # matplotlib warn that the marker is redundantly defined.
    plt.errorbar(x_ticks, means, yerr=ses, fmt='o-', capsize=5,
                 color='steelblue', ecolor='gray', elinewidth=2)

    plt.xticks(ticks=x_ticks, labels=x_all, rotation=45)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Use the caller's x label instead of a hard-coded "epsilons".
    plt.title(f"{y_label} across {x_label} with SE and Baseline")
    plt.grid(True)
    plt.tight_layout()

    # Save plot
    plt.savefig(f"{save_dir}/{filename}.png")
    # Close only this figure; a second bare plt.close() could close an unrelated one.
    plt.close(fig)

In [30]:
def plot_confusion_matrix_with_se(conf_matrix, annotations, noise_vals, index=None):
    """Draw an annotated 2x2 confusion-matrix heatmap and save it as a PNG.

    The figure is written to
    ``tp_robust_viz/<noise_vals[index]>/avg_confusion_matrix.png`` and then closed.

    Args:
        conf_matrix: 2x2 array of cell values that drive the colour scale; rows
            are labelled 'Actual 0/1', columns 'Predicted 0/1'.
        annotations: array of strings with the same shape, drawn inside the cells.
        noise_vals: sequence of labels; the selected entry appears in the title
            and names the output folder.
        index: position in ``noise_vals`` to use. When omitted, the notebook-level
            loop variable ``idx`` is used (legacy behaviour).

    Raises:
        NameError: if ``index`` is omitted and no global ``idx`` exists.
    """
    # Legacy call sites do not pass `index` and rely on a global loop variable `idx`.
    position = idx if index is None else index
    setting = noise_vals[position]

    save_dir = f"tp_robust_viz/{setting}"
    os.makedirs(save_dir, exist_ok=True)

    fig = plt.figure(figsize=(6, 5))
    # fmt='' lets the pre-formatted annotation strings be drawn verbatim.
    sns.heatmap(conf_matrix, annot=annotations, fmt='', cmap="Blues", cbar=False, square=True,
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])

    plt.title(f'Average Confusion Matrix with SE for epsilon {setting}')
    plt.xlabel('Prediction')
    plt.ylabel('Actual')
    plt.tight_layout()

    # Save plot
    plt.savefig(f"{save_dir}/avg_confusion_matrix.png")
    # Close only this figure; a second bare plt.close() could close an unrelated one.
    plt.close(fig)

In [31]:
def plot_metric_distribution(values, metric_name, filename, noise_vals, index=None):
    """Plot a histogram of a metric with its mean and 95% CI bounds marked, and save it.

    The mean and interval come from ``np_95ci(values)``. The figure is written to
    ``tp_robust_viz/<noise_vals[index]>/<filename>.png`` and then closed.

    Args:
        values: sequence of metric values (one per run).
        metric_name: used for the title and the x-axis label.
        filename: output file name without the ``.png`` extension.
        noise_vals: sequence of labels; the selected entry names the output folder.
        index: position in ``noise_vals`` to use. When omitted, the notebook-level
            loop variable ``idx`` is used (legacy behaviour).

    Raises:
        NameError: if ``index`` is omitted and no global ``idx`` exists.
    """
    # Legacy call sites do not pass `index` and rely on a global loop variable `idx`.
    position = idx if index is None else index

    # Ensure directory exists
    save_dir = f"tp_robust_viz/{noise_vals[position]}"
    os.makedirs(save_dir, exist_ok=True)

    mean, ci_lower, ci_upper = np_95ci(values)

    fig = plt.figure(figsize=(8, 5))
    plt.hist(values, bins=15, color='skyblue', edgecolor='black', alpha=0.7)
    plt.axvline(mean, color='red', linestyle='--', label=f'Mean = {mean:.3f}')
    plt.axvline(ci_lower, color='green', linestyle=':', label=f'95% CI Lower = {ci_lower:.3f}')
    plt.axvline(ci_upper, color='green', linestyle=':', label=f'95% CI Upper = {ci_upper:.3f}')

    plt.title(f'{metric_name} Distribution with 95% CI')
    plt.xlabel(metric_name)
    plt.ylabel('Frequency')
    plt.legend()
    plt.tight_layout()

    # Save plot
    plt.savefig(f"{save_dir}/{filename}.png")
    # Close this figure explicitly rather than "whatever is current".
    plt.close(fig)

STATS FUNCTIONS

In [22]:
def np_95ci(data):
    """Return ``(mean, ci_lower, ci_upper)`` for a normal-approximation 95% interval.

    The interval is ``mean -/+ 1.96 * SE``, where SE is the sample standard
    deviation (``ddof=1``) divided by ``sqrt(len(data))``.
    """
    center = np.mean(data)
    # Standard error of the mean from the sample standard deviation.
    standard_error = np.std(data, ddof=1) / np.sqrt(len(data))
    lower_bound = center - 1.96 * standard_error
    upper_bound = center + 1.96 * standard_error
    return center, lower_bound, upper_bound

In [23]:
def mean_se(values):
    """Return ``(mean, standard_error)`` of ``values``.

    The standard error is the sample standard deviation (``ddof=1``) divided by
    ``sqrt(len(values))``.
    """
    samples = np.array(values)
    spread = np.std(samples, ddof=1)
    standard_error = spread / np.sqrt(len(samples))
    return np.mean(samples), standard_error

In [24]:
def bootstrap(x_train, y_train, rng=None):
    """Draw one bootstrap resample (same size, with replacement) of paired data.

    Args:
        x_train: features; anything ``pd.DataFrame`` accepts.
        y_train: targets with the same number of rows as ``x_train``.
        rng: optional seed or ``numpy.random.Generator`` for a reproducible draw.
            When omitted, the global ``np.random`` state is used, as before.

    Returns:
        ``(x_sample, y_sample)`` as DataFrames selected with the same row
        positions, so each feature row stays paired with its target.

    Raises:
        ValueError: if ``x_train`` and ``y_train`` have different lengths.
    """
    x_frame = pd.DataFrame(x_train)
    y_frame = pd.DataFrame(y_train)
    n_rows = len(x_frame)

    # A length mismatch would otherwise surface as an IndexError or, worse,
    # silently resample only part of the targets.
    if len(y_frame) != n_rows:
        raise ValueError(
            f"x_train has {n_rows} rows but y_train has {len(y_frame)}"
        )

    if rng is None:
        rows = np.random.choice(n_rows, size=n_rows, replace=True)
    else:
        # default_rng returns an existing Generator unchanged and seeds a new one otherwise.
        rows = np.random.default_rng(rng).choice(n_rows, size=n_rows, replace=True)

    return x_frame.iloc[rows], y_frame.iloc[rows]

Preprocessing Functions

In [6]:
def data_gen(data, test_size=.2, random_state=42):
    """Split ``data`` into train/test/validation arrays and standardise the features.

    The 'disease' column is the target and every other column is a feature. The
    test split is carved out first, then the validation split is carved out of the
    remaining rows, both with the same ``test_size`` and ``random_state``. The
    scaler is fitted on the training features only and applied to all three splits.

    Returns:
        A one-element list holding the tuple
        ``(X_train, X_test, y_train, y_test, X_val, y_val)``.
    """
    features = data.drop(columns=['disease']).values
    labels = data['disease'].values

    # First split: hold out the test set.
    X_fit, X_test, y_fit, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    # Second split: hold out the validation set from what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_fit, y_fit, test_size=test_size, random_state=random_state
    )

    # Fit on the training features only, then reuse the same statistics everywhere.
    scaler = StandardScaler().fit(X_train)
    X_train, X_test, X_val = (
        scaler.transform(split) for split in (X_train, X_test, X_val)
    )

    return [(X_train, X_test, y_train, y_test, X_val, y_val)]

Preprocessing

In [7]:
# TODO: adjust this relative path to wherever the dataset lives on your cluster.
data = pd.read_csv('../data/endometriosis_dataset.csv')
# data_gen returns a one-element list holding
# (X_train, X_test, y_train, y_test, X_val, y_val); downstream code indexes datasets[0].
datasets = data_gen(data)

# Training hyper-parameters
batch_size = 32  # also passed to the DP optimizer as num_microbatches
delta = 1e-5  # not referenced by the code visible in this notebook
l2_norm_clip = 1.0  # passed to the DP optimizer as l2_norm_clip
# epsilons supplies the folder names and axis labels for the plots.
epsilons = [.001, .01, .1, 1, 10]
# noise_vals supplies the DP noise multipliers and the log/model folder names.
# Presumably noise_vals[i] is the multiplier calibrated to reach epsilons[i] - TODO confirm.
noise_vals = [100.0, 99.0, 75.03, 8.3, 1.31]

In [8]:
# Two of the values from noise_vals; not referenced by any code visible in this notebook.
noise_vals_2 = [99.0, 8.3]

In [12]:
# Noise multipliers that model_train_2 iterates over (a single value from noise_vals).
noise_vals_3 = [1.31]

Model Training Functions

In [None]:
def model_train(data, noise_multipliers=None, n_runs=50, epochs=50):
    """Train bootstrap replicates of a DP-Adam classifier for each noise multiplier.

    For every noise multiplier, ``n_runs`` fresh models are trained, each on a new
    bootstrap resample of the training split. Each run writes a CSV training log to
    ``tp_robust_log/<noise>/training_log_tp_<run>.csv`` and one model file per epoch
    to ``models_tp_robust/<noise>/model_tp_<run>_<epoch>.keras``.

    Args:
        data: list as returned by ``data_gen``; only ``data[0]`` is used, read as
            ``(X_train, X_test, y_train, y_test, X_val, y_val)``.
        noise_multipliers: iterable of DP noise multipliers. Defaults to the
            notebook-level ``noise_vals``.
        n_runs: number of bootstrap replicates per noise multiplier.
        epochs: training epochs per replicate. Other cells in this notebook look
            for the ``_50`` checkpoint, so change this with care.
    """
    # Default to the notebook-level list so existing `model_train(datasets)` calls behave as before.
    if noise_multipliers is None:
        noise_multipliers = noise_vals

    X_train, y_train = data[0][0], data[0][2]
    X_val, y_val = data[0][4], data[0][5]

    for noise_multiplier in noise_multipliers:
        log_dir = f"tp_robust_log/{noise_multiplier}"
        model_dir = f"models_tp_robust/{noise_multiplier}"
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        for i in range(n_runs):
            # Many models are built in one process; drop the previous replicate's
            # Keras global state so memory does not keep growing.
            tf.keras.backend.clear_session()

            X_boot, y_boot = bootstrap(X_train, y_train)

            DP_model = Sequential([
                Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
                Dropout(0.5),
                Dense(128, activation='relu'),
                Dropout(0.5),
                Dense(1, activation='sigmoid')
            ])

            dp_optimizer = DPKerasAdamOptimizer(
                l2_norm_clip=l2_norm_clip,
                noise_multiplier=noise_multiplier,
                num_microbatches=batch_size,
                learning_rate=1e-3
            )
            # The loss is left unreduced (one value per example); the DP optimizer is
            # expected to consume per-example losses - see the TensorFlow Privacy docs.
            loss_fn = tf.keras.losses.BinaryCrossentropy(
                from_logits=False,
                reduction=tf.keras.losses.Reduction.NONE
            )

            DP_model.compile(
                optimizer=dp_optimizer,
                loss=loss_fn,
                metrics=['accuracy']
            )

            # Every replicate starts from a freshly initialised model, so its log must
            # start empty: appending would mix in rows from an earlier or interrupted run.
            csv_logger = tf.keras.callbacks.CSVLogger(f"{log_dir}/training_log_tp_{i}.csv", append=False)

            checkpoint = tf.keras.callbacks.ModelCheckpoint(
                filepath=f"models_tp_robust/{noise_multiplier}/model_tp_{i}_{{epoch:02d}}.keras",
                save_weights_only=False,
                save_best_only=False,  # Save every epoch
                verbose=1
            )

            # Create tf.data.Dataset and ensure all batches are complete
            # (num_microbatches equals batch_size, so partial batches are dropped).
            train_dataset = tf.data.Dataset.from_tensor_slices((X_boot, y_boot))
            train_dataset = train_dataset.shuffle(buffer_size=1024)  # optional
            train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            val_dataset = val_dataset.batch(batch_size, drop_remainder=True)

            # Compute number of full steps (batches)
            steps_per_epoch = len(X_boot) // batch_size

            # Then fit the model
            DP_model.fit(
                train_dataset,
                epochs=epochs,
                steps_per_epoch=steps_per_epoch,
                validation_data=val_dataset,
                callbacks=[csv_logger, checkpoint]
            )

In [16]:
def model_train_2(data, noise_multipliers=None, n_runs=50, epochs=50):
    """Train bootstrap replicates of a DP-Adam classifier for a subset of noise multipliers.

    Same procedure as ``model_train`` but defaulting to the notebook-level
    ``noise_vals_3``. For every noise multiplier, ``n_runs`` fresh models are
    trained, each on a new bootstrap resample of the training split. Each run writes
    a CSV training log to ``tp_robust_log/<noise>/training_log_tp_<run>.csv`` and one
    model file per epoch to ``models_tp_robust/<noise>/model_tp_<run>_<epoch>.keras``.

    Args:
        data: list as returned by ``data_gen``; only ``data[0]`` is used, read as
            ``(X_train, X_test, y_train, y_test, X_val, y_val)``.
        noise_multipliers: iterable of DP noise multipliers. Defaults to the
            notebook-level ``noise_vals_3``.
        n_runs: number of bootstrap replicates per noise multiplier.
        epochs: training epochs per replicate. Other cells in this notebook look
            for the ``_50`` checkpoint, so change this with care.
    """
    # Default to the notebook-level list so existing `model_train_2(datasets)` calls behave as before.
    if noise_multipliers is None:
        noise_multipliers = noise_vals_3

    X_train, y_train = data[0][0], data[0][2]
    X_val, y_val = data[0][4], data[0][5]

    for noise_multiplier in noise_multipliers:
        log_dir = f"tp_robust_log/{noise_multiplier}"
        model_dir = f"models_tp_robust/{noise_multiplier}"
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        for i in range(n_runs):
            # Many models are built in one process; drop the previous replicate's
            # Keras global state so memory does not keep growing.
            tf.keras.backend.clear_session()

            X_boot, y_boot = bootstrap(X_train, y_train)

            DP_model = Sequential([
                Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
                Dropout(0.5),
                Dense(128, activation='relu'),
                Dropout(0.5),
                Dense(1, activation='sigmoid')
            ])

            dp_optimizer = DPKerasAdamOptimizer(
                l2_norm_clip=l2_norm_clip,
                noise_multiplier=noise_multiplier,
                num_microbatches=batch_size,
                learning_rate=1e-3
            )
            # The loss is left unreduced (one value per example); the DP optimizer is
            # expected to consume per-example losses - see the TensorFlow Privacy docs.
            loss_fn = tf.keras.losses.BinaryCrossentropy(
                from_logits=False,
                reduction=tf.keras.losses.Reduction.NONE
            )

            DP_model.compile(
                optimizer=dp_optimizer,
                loss=loss_fn,
                metrics=['accuracy']
            )

            # Every replicate starts from a freshly initialised model, so its log must
            # start empty: appending would mix in rows from an earlier or interrupted run.
            csv_logger = tf.keras.callbacks.CSVLogger(f"{log_dir}/training_log_tp_{i}.csv", append=False)

            checkpoint = tf.keras.callbacks.ModelCheckpoint(
                filepath=f"models_tp_robust/{noise_multiplier}/model_tp_{i}_{{epoch:02d}}.keras",
                save_weights_only=False,
                save_best_only=False,  # Save every epoch
                verbose=1
            )

            # Create tf.data.Dataset and ensure all batches are complete
            # (num_microbatches equals batch_size, so partial batches are dropped).
            train_dataset = tf.data.Dataset.from_tensor_slices((X_boot, y_boot))
            train_dataset = train_dataset.shuffle(buffer_size=1024)  # optional
            train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            val_dataset = val_dataset.batch(batch_size, drop_remainder=True)

            # Compute number of full steps (batches)
            steps_per_epoch = len(X_boot) // batch_size

            # Then fit the model
            DP_model.fit(
                train_dataset,
                epochs=epochs,
                steps_per_epoch=steps_per_epoch,
                validation_data=val_dataset,
                callbacks=[csv_logger, checkpoint]
            )

Model Training

In [14]:
# Long-running: trains bootstrap replicates for every noise multiplier in noise_vals,
# writing logs under tp_robust_log/ and per-epoch models under models_tp_robust/.
model_train(datasets)


KeyboardInterrupt



In [17]:
# Long-running: same training loop, restricted to the multipliers in noise_vals_3.
model_train_2(datasets)

Epoch 1/50
Epoch 1: saving model to models_tp_robust/1.31/model_tp_0_01.keras
Epoch 2/50
Epoch 2: saving model to models_tp_robust/1.31/model_tp_0_02.keras
Epoch 3/50
Epoch 3: saving model to models_tp_robust/1.31/model_tp_0_03.keras
Epoch 4/50
Epoch 4: saving model to models_tp_robust/1.31/model_tp_0_04.keras
Epoch 5/50
Epoch 5: saving model to models_tp_robust/1.31/model_tp_0_05.keras
Epoch 6/50
Epoch 6: saving model to models_tp_robust/1.31/model_tp_0_06.keras
Epoch 7/50
Epoch 7: saving model to models_tp_robust/1.31/model_tp_0_07.keras
Epoch 8/50
Epoch 8: saving model to models_tp_robust/1.31/model_tp_0_08.keras
Epoch 9/50
Epoch 9: saving model to models_tp_robust/1.31/model_tp_0_09.keras
Epoch 10/50
Epoch 10: saving model to models_tp_robust/1.31/model_tp_0_10.keras
Epoch 11/50
Epoch 11: saving model to models_tp_robust/1.31/model_tp_0_11.keras
Epoch 12/50
Epoch 12: saving model to models_tp_robust/1.31/model_tp_0_12.keras
Epoch 13/50
Epoch 13: saving model to models_tp_robust/1.3

STATS Scripts

In [32]:
# Per-setting summaries, one (mean, SE) pair per entry of noise_vals, in order.
f1_total = []
auc_total = []

# Held-out test split; data_gen's tuple order is
# (X_train, X_test, y_train, y_test, X_val, y_val).
X_test, y_test = datasets[0][1], datasets[0][3]

metrics = ['loss', 'accuracy', 'val_loss', 'val_accuracy']

# NOTE: the loop variable must stay named `idx` - the plotting helpers called below
# read it as a global to pick their label/output folder.
for idx in range(len(noise_vals)):
    # Step 1: Load all CSVs into a list of DataFrames
    log_pattern = f"tp_robust_log/{noise_vals[idx]}/training_log_tp_*.csv"
    csv_files = sorted(glob.glob(log_pattern))
    if not csv_files:
        raise FileNotFoundError(f"no training logs match {log_pattern!r}")
    histories = [pd.read_csv(f) for f in csv_files]

    # Step 2: Stack the metrics for each epoch
    epochs = histories[0]['epoch']  # Assuming all runs have the same epoch range
    epoch_labels = epochs.tolist()
    avg_metrics = {}
    std_metrics = {}
    for metric in metrics:
        # Rows are runs, columns are epochs; reduce over runs.
        per_run = np.array([h.loc[epoch_labels, metric].to_numpy() for h in histories])
        avg_metrics[metric] = per_run.mean(axis=0).tolist()
        std_metrics[metric] = per_run.std(axis=0).tolist()

    # Only the final-epoch checkpoint of each run is evaluated.
    model_pattern = f"models_tp_robust/{noise_vals[idx]}/model_tp_*_50.keras"
    model_paths = sorted(glob.glob(model_pattern))
    if not model_paths:
        # Without this, every summary below would silently become NaN.
        raise FileNotFoundError(f"no trained models match {model_pattern!r}")

    tp_list = []
    fp_list = []
    tn_list = []
    fn_list = []
    f1_scores = []
    auc_scores = []

    for model_path in model_paths:
        model = tf.keras.models.load_model(model_path, compile=False)
        y_pred_prob = model.predict(X_test).ravel()
        y_pred = (y_pred_prob > 0.5).astype(int)

        f1_scores.append(f1_score(y_test, y_pred))
        # Do not bind the score to the name `auc`: that would overwrite the
        # `auc` function imported from sklearn.metrics.
        auc_scores.append(roc_auc_score(y_test, y_pred_prob))

        # labels=[0, 1] guarantees a 2x2 matrix so the four-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

        tp_list.append(tp)
        fp_list.append(fp)
        tn_list.append(tn)
        fn_list.append(fn)

    # Compute mean and SE for each confusion matrix component
    tp_mean, tp_se = mean_se(tp_list)
    fp_mean, fp_se = mean_se(fp_list)
    tn_mean, tn_se = mean_se(tn_list)
    fn_mean, fn_se = mean_se(fn_list)

    # Construct the matrix and annotation array
    conf_matrix = np.array([[tn_mean, fp_mean],
                            [fn_mean, tp_mean]])

    annotations = np.array([[f"{tn_mean:.1f}\n±{tn_se:.1f}", f"{fp_mean:.1f}\n±{fp_se:.1f}"],
                            [f"{fn_mean:.1f}\n±{fn_se:.1f}", f"{tp_mean:.1f}\n±{tp_se:.1f}"]])

    f1_total.append(mean_se(f1_scores))
    auc_total.append(mean_se(auc_scores))

    # Files are read from the noise-multiplier folders, but figures are labelled and
    # stored by the epsilon at the same position.
    plot_metric_distribution(f1_scores, "F1 Score", "f1_score_robust", epsilons)
    plot_metric_distribution(auc_scores, "AUC Score", "AUC_score_robust", epsilons)
    plot_confusion_matrix_with_se(conf_matrix, annotations, epsilons)
    plot_avg_history(epochs, avg_metrics, std_metrics, epsilons)



In [34]:
# Hard-coded (mean, standard error) pairs, unpacked in that order by
# plot_mean_se_with_baseline. Presumably copied from a separate baseline run
# that is not part of this notebook - TODO record provenance.
f1_baseline = (0.9659922307568748, 0.002160392362343558)
auc_baseline = (0.896936507936508, 0.00458851871947309)

In [35]:
# Summary figure: mean F1 (with SE) per epsilon, followed by the baseline point.
plot_mean_se_with_baseline(
    mean_se_values=f1_total,
    x_values=epsilons,
    x_label="epsilons",
    y_label="Average F1 Score",
    filename="avg_f1",
    baseline=f1_baseline,
    baseline_label="baseline",
)

  plt.errorbar(x_ticks, means, yerr=ses, fmt='o-', capsize=5,


In [36]:
# Summary figure: mean AUC (with SE) per epsilon, followed by the baseline point.
plot_mean_se_with_baseline(
    mean_se_values=auc_total,
    x_values=epsilons,
    x_label="epsilons",
    y_label="Average AUC Score",
    filename="avg_auc",
    baseline=auc_baseline,
    baseline_label="baseline",
)

  plt.errorbar(x_ticks, means, yerr=ses, fmt='o-', capsize=5,
