In [None]:
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import itertools
import time
import sklearn.datasets
from skopt import BayesSearchCV as bayes_opt
from sklearn.tree import DecisionTreeClassifier as dt


#sample data for sample execution
# X,Y = sklearn.datasets.make_classification(n_samples=1000)

In [None]:
X = pd.read_csv("/home/administrtor/Desktop/TBB/P/docker/rf/data/occupancy_data/datatraining.csv") 
x=X.drop(['date','Occupancy'],axis=1)
y=X['Occupancy']

In [None]:
dt_space = {'splitter':['best','random'],
          'max_depth': (1,100),
          'criterion':['gini','entropy'],
          'min_samples_split':(1,1000)}


scoring = {'AUC': 'roc_auc', 
           'avg_pr': 'average_precision', 
           'bal_accuracy':'balanced_accuracy',
           'accuracy':'accuracy',
           'f1_score':'f1'}

In [None]:
def inner_cv(X, y, n_folds, spaces, n_iter ,seed, scoring ):
    
        """Funtion to do Hyperparameter tuning using Stratified CV. use this function as Inner loop in nested CV. 
        Takes following Arguments.
            1. X: Features Data without labels
            2. Y: Labels of the Data
            3. n_folds: Parameter to specify number of folds in CV
            4. spaces: Usually  a Disctionary object of Hyperparameter space to perform Bayesian Search CV
            5. seed: Random seed to reproduce the results. Give seed as the loop number When using Nested CV
            6. scoring: Dictionary object of multiple evaluation metrics to monitor for BayesSearchCV. 
                        if None provided, average_precision is assumed.
            7. n_iter: Number of iterations of Hyperparameter combination search."""
        
        estimator = dt(random_state = seed)

        cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=seed+1)

        optimization = bayes_opt(cv = cv, estimator = estimator, n_iter = n_iter, n_jobs = -1, 
                                 search_spaces=spaces, n_points=100, random_state = seed+2, 
                                 error_score=-1, return_train_score=True, verbose=0, 
                                 iid = True, refit = True, scoring = scoring)
        
        model = optimization.fit(X=X,y=y)
        print("CV score: ",model.best_score_)
        return(model)

In [6]:
def outer_cv(X,y, n_fold, spaces, seed, scoring):
    cv = StratifiedKFold(n_splits = n_fold, shuffle=False,random_state = seed)
    
    models = []
    test_scores = {'average_precision':[],'roc_auc':[],'balanced_accuracy':[],'f1':[],'accuracy':[]}
    mean_cv_scores = []
    random_state = 1+seed
    for train_index, test_index in cv.split(X,y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index] 
        model = inner_cv(X=X_train, y=y_train, n_folds=4, n_iter=10, spaces = spaces, scoring = scoring, 
                         seed = random_state)
        random_state +=1
        models.append(model)
        mean_cv_scores.append(model.best_score_)
        
        preds = model.predict_proba(X_test)[:,1]
        
        test_score = metrics.average_precision_score(y_true = y_test, y_score=preds)
        test_scores['average_precision'].append(test_score)
        
        test_score = metrics.roc_auc_score(y_true = y_test, y_score=preds)
        test_scores['roc_auc'].append(test_score)
        
        test_score = metrics.balanced_accuracy_score(y_true = y_test, y_pred=preds>0.5)
        test_scores['balanced_accuracy'].append(test_score)
        
        test_score = metrics.f1_score(y_true = y_test, y_pred=preds>0.5)
        test_scores['f1'].append(test_score)
        
        test_score = metrics.accuracy_score(y_true = y_test, y_pred=preds>0.5)
        test_scores['accuracy'].append(test_score)
        
    return(models,test_scores,mean_cv_scores)


test_models,test_model_scores, mean_cvs_scores = outer_cv(X=x, y=y, n_fold = 5, 
                                                          spaces = dt_space, seed = 42, 
                                                          scoring = 'average_precision')

print(test_model_scores)

KeyError: "None of [Int64Index([1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254,\n            ...\n            8133, 8134, 8135, 8136, 8137, 8138, 8139, 8140, 8141, 8142],\n           dtype='int64', length=6514)] are in the [columns]"

In [5]:
for i in test_models:print(i.best_params_)

{'criterion': 'entropy', 'max_depth': 23, 'min_samples_split': 26, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 67, 'min_samples_split': 525, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 72, 'min_samples_split': 183, 'splitter': 'random'}
{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 48, 'splitter': 'best'}
{'criterion': 'entropy', 'max_depth': 59, 'min_samples_split': 135, 'splitter': 'best'}


In [12]:
np.std(mean_cvs_scores)

0.007622392065484313

In [13]:
np.mean(mean_cvs_scores)

0.9307410782586869

In [16]:

type(X)

numpy.ndarray

In [20]:
X.dtype.names

In [21]:
X

array([[ 0.3104474 ,  2.62499275,  0.10777455, ...,  0.03141631,
         0.14086748,  1.03520031],
       [ 0.43498062,  1.01402426,  1.50164207, ..., -0.38271714,
        -0.71683825, -0.81214732],
       [ 0.66190062, -1.57875257, -1.37461602, ..., -0.70859936,
         1.61264049, -1.08397396],
       ...,
       [-0.52681439, -1.2144743 , -0.52374396, ...,  0.95846304,
        -0.72889868,  1.0011608 ],
       [ 0.36123221,  0.43004262,  1.02955141, ...,  2.45343822,
        -0.34193097,  1.48951677],
       [ 0.39086115, -1.86639563,  1.66443737, ...,  1.25522852,
         0.71665287,  0.84072647]])