<a href="https://colab.research.google.com/github/paaltc/SFCN_transferabiliy/blob/nested-cross-validation-code/Nested_cross_validation_w_hyperparameter_optimization_and_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Split functions

In [None]:
import pandas as pd
import numpy as np
def folds_stratify_split(df,folds,strat_var):
  """Split ``df`` into ``folds`` sub-frames stratified on ``strat_var``.

  The rows are sorted by ``strat_var`` and then dealt out round-robin
  (row ``i`` of the sorted frame goes to fold ``i % folds``), so every fold
  spans the whole range of the stratification variable.

  Args:
    df: pandas DataFrame to split.
    folds: number of folds; must be at least 1.
    strat_var: column label (or list of labels) to sort by.

  Returns:
    A list with ``folds`` DataFrames, each with a fresh 0..n-1 index.

  Raises:
    ValueError: if ``folds`` is smaller than 1.
  """
  if folds < 1:
    raise ValueError(f'folds must be a positive integer, got {folds}')
  # Stable sort: rows with equal strat_var keep their original order, which
  # makes the fold assignment reproducible.
  df_sorted = df.sort_values(by=strat_var, kind='mergesort')
  # Fold id for every *sorted* row position: 0, 1, ..., folds-1, 0, 1, ...
  fold_of_row = np.arange(len(df_sorted)) % folds
  splits =  [] # list of dfs, one for each fold
  for fold in range(folds):
    # Select from the sorted frame; selecting from the unsorted `df` would
    # discard the stratification entirely.
    fold_subset = df_sorted[fold_of_row == fold].reset_index(drop=True)
    print(fold_subset.head())
    splits.append(fold_subset)
    print(len(fold_subset))
  return splits # list containing split dfs

def Cross_val_splits(splits,fold):
  """Build the train/test pair for one cross-validation fold.

  Args:
    splits: list of DataFrames, one per fold.
    fold: position in ``splits`` of the fold that is held out.

  Returns:
    Tuple ``(df_train, df_test)``: the concatenation of all other folds
    (with the index reset) and the held-out fold itself.
  """
  held_out = splits[fold]
  # Everything except the held-out position becomes training data.
  remaining = [part for position, part in enumerate(splits) if position != fold]
  training = pd.concat(remaining).reset_index(drop=True)
  return training, held_out


In [None]:
## create model function

import tensorflow as tf
from sklearn.linear_model import LinearRegression
#from pyment.models.sfcn import RegressionSFCN
#from pyment.utils import load_select_pretrained_weights


def build_model(model_name,config, weights=None):
  """Create the model selected by ``model_name``.

  Args:
    model_name: 'RegressionSFCN' for the compiled Keras model, or 'test'
      for a scikit-learn ``LinearRegression`` used when prototyping.
    config: mapping that must contain the keys 'learning_rate',
      'weight_decay' and 'dropout'.
    weights: forwarded to ``load_select_pretrained_weights``; only used by
      the 'RegressionSFCN' branch.

  Returns:
    The compiled model for 'RegressionSFCN', or an unfitted
    ``LinearRegression`` for 'test'.

  Raises:
    ValueError: if a required hyperparameter is missing from ``config`` or
      ``model_name`` is not one of the supported names.
  """
  # Explicit check instead of `assert`, which is skipped under `python -O`.
  required = {'learning_rate','weight_decay','dropout'}
  missing = required - set(config)
  if missing:
    raise ValueError(f'config is missing hyperparameters: {sorted(missing)}')

  if model_name == 'RegressionSFCN':
    # NOTE(review): RegressionSFCN and load_select_pretrained_weights are not
    # imported in the visible cells (the pyment imports are commented out);
    # they must be in scope before this branch runs.
    MODEL = RegressionSFCN()
    model = load_select_pretrained_weights(MODEL, weights, 'age')
    # NOTE(review): config['dropout'] is required above but is not applied to
    # the model here — confirm how the network should receive it.
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=config['learning_rate'],
        weight_decay = config['weight_decay']
        )

    model.compile(
        optimizer = optimizer,
        loss = 'mse',
        metrics = ['mae']
        # forward_pass=True when debugging
        )
    return model

  if model_name == 'test':
    # scikit-learn estimators have no compile step, so none of the
    # hyperparameters in `config` are applied to this model.
    return LinearRegression()

  # Previously an unknown name fell through to `return model` and failed with
  # an UnboundLocalError; fail with a clear message instead.
  raise ValueError(f"Unknown model_name {model_name!r}; expected 'RegressionSFCN' or 'test'")

In [None]:
# Returns all combinations of hyperparameters from a dictionary of choices
from itertools import product
def create_hyperparameter_configurations(hyperparameters):
  """Expand a dict of candidate values into the full grid of configurations.

  Args:
    hyperparameters: mapping from hyperparameter name to an iterable of
      candidate values.

  Returns:
    List of dicts, one per element of the Cartesian product of the candidate
    values, each mapping every hyperparameter name to one chosen value.
    The number of configurations is printed as a side effect.
  """
  names = list(hyperparameters)
  candidate_lists = [hyperparameters[name] for name in names]
  configurations = []
  for combination in product(*candidate_lists):
    configurations.append(dict(zip(names, combination)))
  print(len(configurations))
  return configurations

In [None]:
def train_configuration(df, weights, configuration, checkpoint_nr,epochs, mode):
  """Train (or only describe) a model for one hyperparameter configuration.

  Args:
    df: training DataFrame. The 'test'/'prototype' mode reads its 'score'
      (feature) and 'age' (target) columns; the 'train' mode hands the whole
      frame to ``create_generator``.
    weights: pretrained weights forwarded to ``build_model`` in 'train' mode.
    configuration: dict of hyperparameters for this run.
    checkpoint_nr: currently unused.
    epochs: number of epochs for the Keras fit in 'train' mode.
    mode: 'print' (only print the configuration), 'test' or 'prototype'
      (fit the scikit-learn stand-in model), or 'train' (fit the Keras model).

  Returns:
    None in 'print' mode, otherwise the fitted model.

  Raises:
    ValueError: if ``mode`` is not one of the supported values.
  """
  if mode =='print':
    print('Hyperparameter configuration:')
    for param_name, param_value in configuration.items():
      print(f'{param_name}= {param_value}')
    return None

  if mode in ('test', 'prototype'):
    print('test started')
    model = build_model('test',configuration)
    # LinearRegression.fit takes a 2-D feature matrix and has no notion of
    # epochs or Keras callbacks, so only X and y are passed.
    model.fit(df[['score']], df['age'])
    print('test completed')
    return model # Return the model

  if mode == 'train':
    model = build_model('RegressionSFCN',configuration, weights) # Pass weights
    # NOTE(review): all three callbacks monitor 'val_loss', but fit() below is
    # given no validation data — confirm a validation set is meant to be added.
    cb_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
    cb_earlystopo = EarlyStopping(patience=10, monitor='val_loss')
    reduce_lr = ReduceLROnPlateau(factor=0.1, patience=5, monitor='val_loss')

    # create_generator is not defined in the visible cells; it must be in
    # scope before this branch runs.
    model.fit(
        create_generator(df),
        epochs = epochs,
        callbacks =[cb_checkpoint, cb_earlystopo, reduce_lr],
    )
    return model

  # An unknown mode used to fall through and silently return None.
  raise ValueError(
      f"Unknown mode {mode!r}; expected 'print', 'test', 'prototype' or 'train'")


In [None]:
import numpy as np
import pandas as pd
import nibabel as nib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, r2_score
#from pyment.models import RegressionSFCN
#from pyment.utils import load_select_pretrained_weights


# Cross validation script:

def run_cross_validation(df,strat_var,target_var, outer_folds, inner_folds,hyperparameters, mode):
  """Run the nested cross-validation loops with a grid search per inner fold.

  For every outer fold the outer-training part is split again into inner
  folds. For every inner fold each hyperparameter configuration is trained on
  the inner-training part and scored by mean absolute error on the
  inner-validation part; the best configuration of that inner fold is printed.
  The outer-test evaluation is not implemented yet (see the TODO at the end).

  Args:
    df: full DataFrame.
    strat_var: column used to stratify both the outer and inner splits.
    target_var: column holding the prediction target used for the MAE.
    outer_folds: number of outer folds.
    inner_folds: number of inner folds.
    hyperparameters: dict mapping hyperparameter name to candidate values.
    mode: 'train' (Keras model), 'test' or 'prototype' (scikit-learn
      stand-in), or 'print' (no training, random dummy predictions).

  Returns:
    None; progress and the best configuration per inner fold are printed.

  Raises:
    ValueError: if ``mode`` is not one of the supported values.
  """
  valid_modes = ('train', 'test', 'prototype', 'print')
  if mode not in valid_modes:
    # Without this check an unknown mode left `mae` undefined further down.
    raise ValueError(f'Unknown mode {mode!r}; expected one of {valid_modes}')

  def _mean_absolute_error(y_true, y_pred):
    # Flatten both sides so that (n, 1) model outputs are compared
    # element-wise with (n,) targets instead of broadcasting to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return float(np.mean(np.abs(y_true - y_pred)))

  configurations = create_hyperparameter_configurations(hyperparameters)
  outer_splits = folds_stratify_split(df,outer_folds,strat_var)

  outer_results= [] # reserved for the outer-test evaluation (TODO below)

  # outer cross-validation loop
  for outer_fold_number in range(len(outer_splits)):
    print (f'Outer fold nr:{outer_fold_number}')
    outer_train, outer_test = Cross_val_splits(outer_splits,outer_fold_number)
    inner_splits = folds_stratify_split(outer_train, inner_folds, strat_var) # Split outer_train
    inner_results=[] # one record per (inner fold, configuration) of this outer fold

    # inner loop
    for inner_fold_idx in range(len(inner_splits)):
      inner_fold_number= [outer_fold_number,inner_fold_idx]
      print (f'Inner fold nr:{inner_fold_number}')
      inner_train, inner_val = Cross_val_splits(inner_splits,inner_fold_idx)

      best_mae = float('inf')
      best_config = None
      # hyperparameter tuning loop, similar to sklearn's GridSearchCV
      for configuration in configurations:
        model = train_configuration(inner_train, configuration=configuration, weights=None, checkpoint_nr=None, epochs=50, mode=mode) # weights and checkpoint_nr are None for now
        if mode =='train':
          # assumes the generator yields one prediction per row of inner_val — TODO confirm
          predictions = model.predict(create_generator(inner_val))
          mae = _mean_absolute_error(inner_val[target_var], predictions)
        elif mode in ('test', 'prototype'):
          # The stand-in model is fitted on the 'score' column in
          # train_configuration, so predict from that feature and not from
          # the target column itself.
          predictions = model.predict(inner_val[['score']])
          mae = _mean_absolute_error(inner_val[target_var], predictions)
        else: # mode == 'print'
          # Placeholder evaluation: random predictions in [0, 100).
          dummy_predictions = np.random.rand(len(inner_val)) * 100
          mae = _mean_absolute_error(inner_val[target_var], dummy_predictions)
        inner_results.append({'config': configuration, 'mae': mae})

        # Track best configuration for this inner fold
        if mae < best_mae:
          best_mae = mae
          best_config = configuration

      # After iterating through all configs for the inner fold, the best config is stored
      print(f"Best config for inner fold {inner_fold_number}: {best_config} with MAE: {best_mae}")
      # TODO: retrain with the best configuration and evaluate on outer_test, e.g.
      # train_final_model(pd.concat([inner_train, inner_val]), best_config)
      # final_predictions = final_model.predict(create_generator(outer_test))
      # outer_test_mae = _mean_absolute_error(outer_test[target_var], final_predictions)
      # outer_results.append({'outer_fold': outer_fold_number, 'best_config': best_config, 'test_mae': outer_test_mae})

In [None]:
# Hyperparameter grid that is expanded into all combinations for the inner loop
hyperparameters ={
'learning_rate': [0.5,0.05,0.005],
'weight_decay' : [0.5,0.05,0.005],
'dropout' : [0.1,0.05,0.005]
}

# simulated data for prototyping
# np.random.seed() returns None, so keep the seed value itself in `seed`
# and seed the global NumPy generator as a separate step.
seed = 42
np.random.seed(seed)
n_rows = 100
df = pd.DataFrame({
    'id': range(1, n_rows + 1),  # Integer column: 1 to 100
    'age': np.random.randint(20, 70, n_rows),  # Integer column: random ages
    'score': np.random.uniform(0, 100, n_rows).round(2),  # Float: test scores
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),  # Categorical
    'status': np.random.choice(['Active', 'Inactive', 'Pending'], n_rows),  # Categorical
    'value': np.random.uniform(0, 1000, n_rows).round(2)  # Float: random values
})

# Display basic info (the stray ')' characters inside the template are removed)
print(f"""Dataset shape: {df.shape}
First few rows:
{df.head()}
Data types:
{df.dtypes}
Basic statistics:
{df.describe()}""")
df_sorted = df.sort_values(by=['score'])
print(df_sorted.head())

Dataset shape: (100, 6))
First few rows:
   id  age  score category   status   value
0   1   58  42.34        D  Pending  542.64
1   2   48  39.49        C   Active  286.54
2   3   34  29.35        D   Active  590.83
3   4   62   1.41        C  Pending   30.50
4   5   27  19.88        B  Pending   37.35
Data types:
id            int64
age           int64
score       float64
category     object
status       object
value       float64
dtype: object
Basic statistics:
               id         age       score       value
count  100.000000  100.000000  100.000000  100.000000
mean    50.500000   44.070000   48.400400  457.780200
std     29.011492   14.447575   27.078183  290.850156
min      1.000000   20.000000    0.050000   14.390000
25%     25.750000   33.000000   25.555000  173.267500
50%     50.500000   43.000000   46.340000  482.895000
75%     75.250000   58.000000   69.497500  686.250000
max    100.000000   69.000000   99.770000  975.850000)
    id  age  score category    status   valu

In [None]:
# Dry run of the nested CV on the simulated data: 5 outer folds, 3 inner folds,
# stratified on and scored against 'age', without training any model.
run_cross_validation(
    df,
    strat_var='age',
    target_var='age',
    outer_folds=5,
    inner_folds=3,
    hyperparameters=hyperparameters,
    mode='print',
)

27
   id  age  score category    status   value
0   1   58  42.34        D   Pending  542.64
1   6   40  71.13        C  Inactive  822.60
2  11   30  91.50        C  Inactive  215.82
3  16   22  66.88        D   Pending  540.64
4  21   49  38.29        C    Active  322.96
20
   id  age  score category    status   value
0   2   48  39.49        C    Active  286.54
1   7   58  79.02        C   Pending  360.19
2  12   43  85.00        D  Inactive  622.89
3  17   41  66.59        C   Pending  637.43
4  22   57  97.17        D   Pending  795.19
20
   id  age  score category    status   value
0   3   34  29.35        D    Active  590.83
1   8   38  60.60        C  Inactive  127.06
2  13   55  44.95        A    Active   85.35
3  18   21  59.13        B  Inactive  726.09
4  23   21  84.89        A    Active  270.83
20
   id  age  score category    status   value
0   4   62   1.41        C   Pending   30.50
1   9   42  92.63        D  Inactive  522.24
2  14   59   9.54        A   Pending   51.6

### Planned changes for the CV script
- store training info like:
    - if early stopping was triggered, at which epoch did training plateau?
    - if the learning rate is adjusted dynamically, what was it set to for the different layers?
- store results
  - possibly performance per configuration
- store checkpoints
- hyperparameter optimization strategy for the inner loop (config for configurations)





## Plan for tomorrow:
- add missing pieces
- run script on actual data
  - epoch = 1
  - no hyperparameters