In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, matthews_corrcoef,
    balanced_accuracy_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import Callback

import pandas as pd
import sklearn as sklearn
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, precision_recall_curve, 
    auc, matthews_corrcoef, balanced_accuracy_score
)
from imblearn.metrics import geometric_mean_score
import joblib 
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import json
import os

In [2]:
def create_nn(input_dim, neurons=64, dropout=0.2, activation='relu', optimizer='adam'):

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(neurons, activation=activation),
        Dropout(dropout),
        Dense(neurons, activation=activation),
        Dropout(dropout),
        Dense(neurons, activation=activation),
        Dropout(dropout),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    return model


In [3]:
class BalancedAccuracyCallback(Callback):
    def __init__(self, X_train, y_train, X_val, y_val):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.train_bal_acc_per_epoch = []
        self.val_bal_acc_per_epoch = []

    def on_epoch_end(self, epoch, logs=None):
        y_train_pred = (self.model.predict(self.X_train, verbose=0) > 0.5).astype(int)
        y_val_pred = (self.model.predict(self.X_val, verbose=0) > 0.5).astype(int)
        self.train_bal_acc_per_epoch.append(balanced_accuracy_score(self.y_train, y_train_pred))
        self.val_bal_acc_per_epoch.append(balanced_accuracy_score(self.y_val, y_val_pred))


In [4]:
def run_grid_search(X_train, y_train, X_val, y_val, param_grid, input_dim):

    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights_dict = dict(enumerate(class_weights))

    best_score = -np.inf
    best_model = None
    best_callback = None
    best_params = None

    for optimizer in param_grid['optimizer']:
        for neurons in param_grid['neurons']:
            for dropout in param_grid['dropout']:
                for activation in param_grid['activation']:
                    for batch_size in param_grid['batch_size']:
                        model = create_nn(input_dim, neurons, dropout, activation, optimizer)
                        callback = BalancedAccuracyCallback(X_train, y_train, X_val, y_val)
                        model.fit(
                            X_train, y_train,
                            batch_size=batch_size,
                            epochs=param_grid['epochs'][0],
                            verbose=0,
                            callbacks=[callback],
                            class_weight=class_weights_dict
                        )

                        y_pred = (model.predict(X_val, verbose=0) > 0.5).astype(int)
                        bal_acc = balanced_accuracy_score(y_val, y_pred)

                        if bal_acc > best_score:
                            best_score = bal_acc
                            best_model = model
                            best_callback = callback
                            best_params = {
                                'optimizer': optimizer,
                                'neurons': neurons,
                                'dropout': dropout,
                                'activation': activation,
                                'batch_size': batch_size
                            }

    return best_model, best_callback, best_params


In [5]:
def run_nested_cv(X, y, outer_cv, split_number=1, model_name="NeuralNet", param_grid=None):

    if param_grid is None:
        param_grid = {
            'optimizer': ['adam'],
            'neurons': [128, 256],
            'dropout': [0.2, 0.4],
            'activation': ['relu'],
            'batch_size': [16, 32],
            'epochs': [120]
        }

    model_dir = "saved_nn_models"
    os.makedirs(model_dir, exist_ok=True)
    results = []
    X_np = X.values if isinstance(X, pd.DataFrame) else X
    y_np = y.values.ravel() if isinstance(y, pd.DataFrame) else y

    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv, start=1):
        X_train, X_test = X_np[train_idx], X_np[test_idx]
        y_train, y_test = y_np[train_idx], y_np[test_idx]

        model, callback, best_params = run_grid_search(
            X_train, y_train, X_test, y_test, param_grid, input_dim=X_np.shape[1]
        )

        # Save model
        model_filename = f"nn_split{split_number}_fold{fold_idx}.keras"
        model_path = os.path.join(model_dir, model_filename)
        model.save(model_path)

        # Evaluate
        y_pred = (model.predict(X_test, verbose=0) > 0.5).astype(int)
        y_proba = model.predict(X_test, verbose=0)
        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        pr_auc = auc(recall, precision)

        results.append({
            'split': split_number,
            'fold': fold_idx,
            'model': model_name,
            'best_params': json.dumps(best_params),
            'accuracy': accuracy_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            'mcc': matthews_corrcoef(y_test, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'g_mean': geometric_mean_score(y_test, y_pred),
            'pr_auc': pr_auc
        })

    return pd.DataFrame(results)


In [6]:
# Load dataset
df = pd.read_csv('processed_ctg_results.csv')
X = df[['LTV', 'baseline', 'std_FHR']]
y = df['target']

# Load saved outer splits
all_outer_splits = joblib.load('outer_splits.pkl')

In [7]:
# Run across all saved outer splits
all_results = []

for split_num, split in enumerate(all_outer_splits, start=1):
    print(f"Running split {split_num}...")
    
    def custom_cv(split):
        for train_idx, test_idx in split:
            yield train_idx, test_idx

    df_split = run_nested_cv(X, y, outer_cv=custom_cv(split), split_number=split_num)
    all_results.append(df_split)# Combine and save
    
final_df = pd.concat(all_results, ignore_index=True)

Running split 1...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 2...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 3...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 4...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 5...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 6...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 7...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 8...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 9...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Running split 10...


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [8]:
# Add mean row at the end 
mean_row = final_df.drop(columns=["split", "fold", "model", "best_params"]).mean()
mean_row["split"] = "mean"
mean_row["fold"] = "mean"
mean_row["model"] = "NeuralNet"
mean_row["best_params"] = "N/A"

# Append and save
final_df = pd.concat([final_df, pd.DataFrame([mean_row])], ignore_index=True)
final_df.to_csv("neuralnet_cv_results.csv", index=False)
