In [1]:
from rCVrs import *

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

import pandas as pd
import numpy as np

from copy import copy

import datetime

In [2]:
# Base keyword arguments for make_circles; the per-run 'noise' value is
# added to a copy of this dict inside the experiment loop.
data_prod_params_base = dict(
    n_samples=500,
    shuffle=True,
    random_state=1234,
    factor=0.6,
)

In [3]:
# RandomForestClassifier keyword arguments for the four forest variants
# compared in the experiment. The outer key is the name stored with every
# result row.
forest_hyperparams = {
    'forest_hyperparams_0': dict(
        max_depth=7, max_features=1.0, max_leaf_nodes=17,
        min_samples_leaf=11, n_estimators=3, n_jobs=-1,
        class_weight='balanced', min_samples_split=8, criterion='entropy',
    ),
    'forest_hyperparams_1': dict(
        max_depth=8, max_features='sqrt', max_leaf_nodes=7,
        min_samples_leaf=91, n_estimators=9, n_jobs=-1,
        class_weight='balanced', min_samples_split=2, criterion='entropy',
    ),
    'forest_hyperparams_2': dict(
        max_depth=4, max_features=1.0, max_leaf_nodes=9,
        min_samples_leaf=1, n_estimators=27, n_jobs=-1,
        class_weight='balanced', min_samples_split=2, criterion='gini',
    ),
    'forest_hyperparams_3': dict(
        max_depth=9, max_features='log2', max_leaf_nodes=18,
        min_samples_leaf=11, n_estimators=9, n_jobs=-1,
        class_weight='balanced_subsample', min_samples_split=2, criterion='entropy',
    ),
}

In [4]:
# Run 60 repetitions of 5-fold cross-validation for every combination of
# data noise, forest seeding, forest configuration and fold shuffling, and
# collect the per-fold ROC AUC scores of each repetition in `results`.
results = []

for noise in [0.4, 0.5, 0.6]:
    # progress line with a timestamp, one per noise level
    print('noise', noise, datetime.datetime.now())

    # same base dataset settings, differing only in the noise level
    circle_params = copy(data_prod_params_base)
    circle_params['noise'] = noise
    X, y = make_circles(**circle_params)

    for set_seed_forest in [True, False]:
        for forest_name, forest_hyperp in forest_hyperparams.items():
            clf = RandomForestClassifier(**forest_hyperp)
            if set_seed_forest:
                # pin the forest's random_state; in the other branch the
                # classifier keeps whatever the constructor left it with
                clf.set_params(random_state=1234)

            for shuffle in [True, False]:
                for repetition in range(60):
                    # a fresh splitter per repetition; no random_state is
                    # passed to it
                    splitter = StratifiedKFold(n_splits=5, shuffle=shuffle)
                    scores = cross_val_score(
                        clf, X, y,
                        cv=splitter,
                        n_jobs=-1,
                        scoring='roc_auc',
                    )

                    results.append(dict(
                        data_noise=noise,
                        forest_seed=set_seed_forest,
                        forest_name=forest_name,
                        shuffle=shuffle,
                        scores=scores,
                    ))

noise 0.4 2019-10-02 14:01:34.485936
noise 0.5 2019-10-02 14:08:41.137242
noise 0.6 2019-10-02 14:15:47.147821


In [5]:
# One row per cross-validation repetition. 'mean' and 'std' summarise the
# per-fold scores of that repetition (np.std with its default arguments).
res_df = pd.DataFrame(results).assign(
    mean=lambda frame: frame['scores'].map(lambda fold_scores: np.mean(fold_scores)),
    std=lambda frame: frame['scores'].map(lambda fold_scores: np.std(fold_scores)),
)

In [9]:
# Persist the per-repetition table next to the notebook; the row index is
# not written.
res_df.to_csv(path_or_buf='res_raw.csv', index=False)

In [10]:
# Spread of the per-repetition statistics within each experimental setting
# (data noise x forest configuration x forest seeding x fold shuffling).
#
# Only the numeric 'mean' and 'std' columns are aggregated. The 'scores'
# column holds one array of fold scores per row (object dtype); older pandas
# silently dropped it from the aggregation, while recent versions raise
# instead, so it is excluded explicitly.
res_grouped = (
    res_df
    .groupby(by=['data_noise', 'forest_name', 'forest_seed', 'shuffle'])[['mean', 'std']]
    .std()
    .reset_index()
)

# The CSV column names make explicit that both values are standard deviations
# taken across repetitions; res_grouped itself keeps 'mean' / 'std'.
res_grouped.rename(columns={'mean': 'std(mean)', 'std': 'std(std)'}).to_csv('res_grouped.csv', index=False)