In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import sem

In [17]:
# Synthetic toy data set. Every draw comes from one dedicated, seeded
# generator so that re-running the notebook reproduces the same arrays and
# the global numpy random state is left untouched.
N = 50
DATA_SEED = 0
data_rng = np.random.RandomState(DATA_SEED)
# binary outcome, one Bernoulli(0.5) draw per sample
y = data_rng.binomial(1, 0.5, N)
# six standard-normal predictors per sample
X = data_rng.randn(N, 6)
# group label per sample: uniform draw on [0, 24] rounded to a whole number,
# stored as an (N, 1) column (downstream code reads it via group[:, 0])
group = np.round(data_rng.uniform(0, 24, N)).reshape(-1, 1)



In [18]:


def RandomGroupKFold_split(groups, n, seed=0):  # noqa: N802
    """
    Randomised analogue of sklearn.model_selection.GroupKFold.split.

    The unique group labels are shuffled with a RandomState seeded by ``seed``
    and cut into ``n`` nearly equal chunks with ``np.array_split``. Each chunk
    defines the test set of one fold, so all samples sharing a group label
    always end up on the same side of a split.

    :param groups: 1-D array-like holding one group label per sample
    :param n: number of folds, between 1 and the number of unique groups
    :param seed: seed of the RandomState used to shuffle the unique groups
    :raises ValueError: if ``n`` is below 1 or exceeds the number of unique
        groups (some fold would otherwise get an empty test set)
    :return: list of (train, test) indices
    """
    groups = pd.Series(groups)
    ix = np.arange(len(groups))
    unique = np.unique(groups)

    # Fail early instead of silently handing back folds without test samples.
    if n < 1 or n > len(unique):
        raise ValueError(
            "Cannot split {} unique group(s) into n={} folds.".format(len(unique), n)
        )

    np.random.RandomState(seed).shuffle(unique)
    result = []
    for split in np.array_split(unique, n):
        # boolean mask of the samples whose group belongs to this fold's chunk
        mask = groups.isin(split).to_numpy()
        result.append((ix[~mask], ix[mask]))

    return result


In [None]:
# Nested, group-aware cross-validation of a random forest, repeated over
# several seeded simulations. The outer folds estimate generalisation
# performance; the inner folds pick the hyperparameters for each outer fold.

# set some initial training parameters
k = 3        # number of outer folds
nest_k = 2   # number of inner (nested) folds
runs = 2     # number of repeated simulations

# set search space for hyperparams
n_estimators = [10, 100, 500]
max_features = [0.7, 0.8]

# set initial seed (it will be incremented on each simulation)
seed = 0

# init storage for scores over all simulations
bal_acc_runs = []

# start simulations
for i in range(runs):
    print('\n===========Simulation: {}============='.format(i))
    # increment seed
    seed += 1
    # shuffle input data with a seeded permutation (every row exactly once);
    # work on per-run copies so X, y and group keep their original order
    indices = np.random.RandomState(seed).permutation(len(X))
    X_run = X[indices]
    y_run = y[indices]
    group_run = group[indices]

    # initialise stores for predictions, ground truths
    all_y_preds = []
    all_y_trues = []
    # set outer fold
    group_kfold = RandomGroupKFold_split(groups=group_run[:, 0], n=k, seed=seed)

    # start outer fold
    for j, (train_index, test_index) in enumerate(group_kfold, start=1):
        print('\n----------Outer Fold: {}----------'.format(j))
        # extract the training and test data for predictors and outcome,
        # plus the training groups needed to build the inner folds
        X_train, X_test = X_run[train_index], X_run[test_index]
        y_train, y_test = y_run[train_index], y_run[test_index]
        g_train = group_run[train_index]

        # set inner group kfold
        nested_group_kfold = RandomGroupKFold_split(groups=g_train[:, 0], n=nest_k, seed=seed)

        # initialise best_params which will store the best parameters
        best_params = None
        # start below any attainable score so the first configuration is
        # always recorded, even if its balanced accuracy is 0.0
        best_acc = -np.inf

        # start grid search
        for n_estimator in n_estimators:
            for max_feature in max_features:
                print('Testing estimators and max features:', n_estimator, max_feature)
                nest_y_preds = []
                nest_y_trues = []
                # start inner nested kfold loop
                for nest_j, (nest_train_index, nest_test_index) in enumerate(nested_group_kfold, start=1):
                    print('Nested Fold:', nest_j)
                    # extract the inner training and validation data
                    X_train_nest, X_test_nest = X_train[nest_train_index], X_train[nest_test_index]
                    y_train_nest, y_test_nest = y_train[nest_train_index], y_train[nest_test_index]
                    model = RandomForestClassifier(n_estimators=n_estimator,
                                                   max_features=max_feature,
                                                   random_state=seed, class_weight='balanced')
                    model.fit(X_train_nest, y_train_nest)
                    nest_y_preds.extend(model.predict(X_test_nest))
                    nest_y_trues.extend(y_test_nest)

                # score this configuration on the pooled inner-fold predictions
                bal_acc_nest = balanced_accuracy_score(nest_y_trues, nest_y_preds)
                if bal_acc_nest > best_acc:
                    best_acc = bal_acc_nest
                    best_params = [n_estimator, max_feature]

        print('Best Params: n_estimators', best_params[0], 'max features', best_params[1])
        # refit the winning configuration on the complete outer training set;
        # the inner-loop estimators only ever saw a subset of it
        best_model = RandomForestClassifier(n_estimators=best_params[0],
                                            max_features=best_params[1],
                                            random_state=seed, class_weight='balanced')
        best_model.fit(X_train, y_train)
        best_model_preds = best_model.predict(X_test)
        all_y_preds.extend(best_model_preds)
        all_y_trues.extend(y_test)

    # one balanced accuracy per simulation, computed on the pooled outer-fold predictions
    bal_acc_runs.append(balanced_accuracy_score(all_y_trues, all_y_preds))


print('Balanced Acc Mean: ', np.mean(bal_acc_runs))
print('Balanced Acc Standard Error: ', sem(bal_acc_runs))



----------Outer Fold: 1----------
Testing estimators and max depth: 10 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 10 0.8
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 100 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 100 0.8
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 500 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 500 0.8
Nested Fold: 1
Nested Fold: 2
Best Params: n_estimators 10 max depth 0.7

----------Outer Fold: 2----------
Testing estimators and max depth: 10 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 10 0.8
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 100 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 100 0.8
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 500 0.7
Nested Fold: 1
Nested Fold: 2
Testing estimators and max depth: 500 0.8
Nested Fold: 1
Nested Fold: 2
Best Params: n_estimators 