In [9]:
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import RandomOverSampler 
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn import datasets

In [50]:

diabetes = datasets.load_diabetes()
X = diabetes.data[:200]
y = np.round(np.random.uniform(0,1, 200))

seed = 42
n_splits = 5
n_splits_nested = 2
n_trials = 5


class oversampled_Kfold():
    def __init__(self, n_splits, n_repeats=1):
        self.n_splits = n_splits
        self.n_repeats = n_repeats

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits*self.n_repeats

    def split(self, X, y, groups=None):
        splits = np.split(np.random.choice(len(X), len(X),replace=False), 5)
        train, test = [], []
        for repeat in range(self.n_repeats):
            for idx in range(len(splits)):
                trainingIdx = np.delete(splits, idx)
                ros = RandomOverSampler()
                Xidx_r, y_r = ros.fit_resample(trainingIdx.reshape(-1,1), y[trainingIdx])
                train.append(Xidx_r.flatten())
                test.append(splits[idx])
        return list(zip(train, test))


# output = cross_validate(clf,x,y, scoring=metrics,cv=rkf)


def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 500, log=True)
    trial_config = {'n_estimators': n_estimators, 'max_depth': max_depth}
    
    
    model = BalancedRandomForestClassifier(**trial_config)

    score = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=n_splits_nested)
    accuracy = score.mean()
    return accuracy


kf = oversampled_Kfold(n_splits=5, n_repeats=2)

kf_ = kf.split(X, y)

f = 0
preds = []
probs = []
GTs = []
for train_index, test_index in kf_:
    print('Running fold number: ', f)
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    if f == 0: 
        print('running optuna in first fold')
        study = optuna.create_study(study_name='test', direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        config = study.best_trial.params
        print('bested optuna params:' , config)
        
    model = BalancedRandomForestClassifier(**config)
    model.fit(X_train, y_train)
    preds.append(model.predict(X_test))
    probs.append(model.predict_proba(X_test))
    GTs.append(y_test)
    f += 1
        
preds = np.concatenate(preds)
probs = np.concatenate(probs)
GTs = np.concatenate(GTs)

[32m[I 2021-08-17 11:35:44,063][0m A new study created in memory with name: test[0m


Running fold number:  0
running optuna in first fold


[32m[I 2021-08-17 11:35:45,132][0m Trial 0 finished with value: 0.5523809523809524 and parameters: {'n_estimators': 643, 'max_depth': 5}. Best is trial 0 with value: 0.5523809523809524.[0m
[32m[I 2021-08-17 11:35:46,175][0m Trial 1 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 626, 'max_depth': 5}. Best is trial 1 with value: 0.5714285714285714.[0m
[32m[I 2021-08-17 11:35:46,970][0m Trial 2 finished with value: 0.5809523809523809 and parameters: {'n_estimators': 440, 'max_depth': 2}. Best is trial 2 with value: 0.5809523809523809.[0m
[32m[I 2021-08-17 11:35:47,311][0m Trial 3 finished with value: 0.5619047619047619 and parameters: {'n_estimators': 245, 'max_depth': 484}. Best is trial 2 with value: 0.5809523809523809.[0m
[32m[I 2021-08-17 11:35:47,621][0m Trial 4 finished with value: 0.5571428571428572 and parameters: {'n_estimators': 218, 'max_depth': 9}. Best is trial 2 with value: 0.5809523809523809.[0m


bested optuna params: {'n_estimators': 440, 'max_depth': 2}
Running fold number:  1
Running fold number:  2
Running fold number:  3
Running fold number:  4
Running fold number:  5
Running fold number:  6
Running fold number:  7
Running fold number:  8
Running fold number:  9
