In [74]:
# The following functions evaluate a model using n-fold cross-validation, without and with stratification
# Function names: 'n_fold' and 'n_fold_strat'
# classifier is the instance of the model for classification
# X is the dataset excluding y
# folds is the number of folds
# metrics are the performance metrics to be returned
# Use: metrics = ['roc_auc','accuracy','precision']
# X is the entire set of independent variables / feature set, y is the dependent variable for model prediction

def n_fold(classifier, X, y, folds, metrics):
    """Cross-validate ``classifier`` and average every requested metric.

    Parameters
    ----------
    classifier : estimator
        Model instance; passed unchanged to ``cross_validate``.
    X : array-like
        Feature set (independent variables), excluding ``y``.
    y : array-like
        Dependent variable the model predicts.
    folds : int or cross-validation splitter
        Forwarded to ``cross_validate`` as ``cv``.
    metrics : str or list of str
        Forwarded to ``cross_validate`` as ``scoring``,
        e.g. ``['roc_auc', 'accuracy', 'precision']``.

    Returns
    -------
    outcomes : dict
        The raw per-fold result returned by ``cross_validate``.
    sum_res : dict
        ``'<train|test>_<metric>'`` -> mean of that metric across the folds.
        The ``roc_auc`` metric is reported under the shorter ``auc`` label
        (``train_auc`` / ``test_auc``).
    """
    from sklearn.model_selection import cross_validate

    outcomes = cross_validate(classifier, X, y, scoring=metrics, cv=folds, return_train_score=True)

    # Discover the scored metrics from the result itself instead of assuming a
    # fixed list, so any value of `metrics` can be summarised.
    metric_names = [key[len('test_'):] for key in outcomes if key.startswith('test_')]

    sum_res = {}
    for name in metric_names:
        # Keep the historical 'auc' label for the 'roc_auc' scorer.
        label = 'auc' if name == 'roc_auc' else name
        for split in ('train', 'test'):
            # .mean() averages over however many folds were actually run, so
            # it stays correct when `folds` is not a plain integer.
            sum_res[f'{split}_{label}'] = outcomes[f'{split}_{name}'].mean()

    return outcomes, sum_res

# --------------

def n_fold_strat(classifier, X, y, folds, metrics):
    """Cross-validate ``classifier`` with stratified folds and average every metric.

    The classifier is wrapped in a pipeline that applies ``StandardScaler``
    before fitting, and the folds come from ``StratifiedKFold(n_splits=folds)``.

    Parameters
    ----------
    classifier : estimator
        Model instance; used as the final step of the pipeline.
    X : array-like
        Feature set (independent variables), excluding ``y``.
    y : array-like
        Dependent variable the model predicts.
    folds : int
        Number of stratified splits.
    metrics : str or list of str
        Forwarded to ``cross_validate`` as ``scoring``,
        e.g. ``['roc_auc', 'accuracy', 'precision']``.

    Returns
    -------
    outcomes : dict
        The raw per-fold result returned by ``cross_validate``.
    sum_res_s : dict
        ``'<train|test>_<metric>'`` -> mean of that metric across the folds.
        The ``roc_auc`` metric is reported under the shorter ``auc`` label
        (``train_auc`` / ``test_auc``).
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Scaling lives inside the pipeline so it is handled by cross_validate
    # together with the classifier for each fold.
    pipeline = make_pipeline(StandardScaler(), classifier)
    stratified_kfold = StratifiedKFold(n_splits=folds)
    outcomes = cross_validate(pipeline, X, y, scoring=metrics, cv=stratified_kfold, return_train_score=True)

    # Discover the scored metrics from the result itself instead of assuming a
    # fixed list, so any value of `metrics` can be summarised.
    metric_names = [key[len('test_'):] for key in outcomes if key.startswith('test_')]

    sum_res_s = {}
    for name in metric_names:
        # Keep the historical 'auc' label for the 'roc_auc' scorer.
        label = 'auc' if name == 'roc_auc' else name
        for split in ('train', 'test'):
            sum_res_s[f'{split}_{label}'] = outcomes[f'{split}_{name}'].mean()

    return outcomes, sum_res_s
    

In [75]:
# Example of using n_fold function
# This cell relies on `classifier`, `X`, `y`, `X_train` and `y_train` already existing in the
# kernel; in this notebook they are created by the setup cells that appear further down.
metrics = ['roc_auc','accuracy','precision']
folds = 10


# NOTE(review): n_fold is run on the training split (X_train, y_train) while n_fold_strat is run
# on the full X, y, so the two summaries are not computed on the same data - confirm this is intended.
det_res, sum_res = n_fold(classifier, X_train, y_train, folds, metrics)
det_res_stat, sum_res_strat = n_fold_strat(classifier, X, y, folds, metrics)

In [76]:
sum_res

{'train_auc': 0.9972052465842935,
 'test_auc': 0.9955460214388786,
 'train_accuracy': 0.9903186956883238,
 'test_accuracy': 0.9886507450500103,
 'train_precision': 1.0,
 'test_precision': 1.0}

In [77]:
sum_res_strat

{'train_auc': 0.9973595015835086,
 'test_auc': 0.9965238095238096,
 'train_accuracy': 0.9923334547326602,
 'test_accuracy': 0.99,
 'train_precision': 0.9966150594273436,
 'test_precision': 0.9966666666666667}

{'train_auc': 0.9955460214388786}

In [1]:
# Setup to run example

import pandas as pd
import numpy as np



# As an example we will utilize Logistic regression

from sklearn.linear_model import LogisticRegression

# Model instance that the example cell passes to n_fold / n_fold_strat.
classifier = LogisticRegression(solver='liblinear')

from sklearn.datasets import make_classification

# Synthetic two-class dataset: 1000 samples, 10 features of which 2 are informative,
# with class weights 0.7 / 0.3. random_state is fixed for reproducible output.
X,y = make_classification(n_samples=1000, n_features=10,
                               n_informative=2, n_redundant=0, n_repeated=0,
                               n_classes=2,
                               n_clusters_per_class=1,
                               weights=(0.7,0.3),
                               class_sep=0.99, random_state=14)


In [4]:

from sklearn.model_selection import train_test_split

# You already know about training and test splits:
# Hold out 30% of the samples as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)