# Stratified K-Fold Cross-Validation

## Import libraries

In [28]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_regression
from sklearn.model_selection import StratifiedKFold

## Create the dataset for regression

+ Samples = 15000
+ Features = 15
+ Target = 1

In [29]:
# Synthetic regression problem: 15,000 rows, 15 features, a single target.
# The seed is pinned so the generated arrays are identical on every run.
X, y = make_regression(
    n_samples=15000,
    n_features=15,
    n_targets=1,
    random_state=42,
)

# Sanity check: feature matrix and target vector shapes.
print(X.shape, y.shape)

(15000, 15) (15000,)


In [30]:
# Wrap the feature matrix in a DataFrame with columns f_1 ... f_n, then
# append the regression target as the last column.
feature_names = [f'f_{idx}' for idx in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

df.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,target
0,-0.236584,1.576358,0.696042,1.512049,0.635228,0.056872,0.985621,0.244609,1.298732,1.318061,0.516612,1.004831,0.001747,0.592848,1.286301,320.86517
1,-0.945243,1.225928,0.406758,-1.124507,0.02685,-1.290316,-0.962637,0.402836,-1.721356,0.108272,0.511595,0.795818,-2.011014,0.362332,-0.003083,-146.96045
2,-1.802891,1.517912,-0.597097,-1.308712,-0.432225,-0.046989,0.728738,-0.031939,-0.345655,-0.037924,0.401426,1.12662,0.120176,0.18731,0.200815,-33.300111
3,-1.101839,0.764034,2.204999,0.314847,1.078584,-1.614898,0.116136,0.565243,0.135688,-2.0704,0.868964,-0.437597,1.268543,-1.197479,-0.011393,137.267936
4,0.170064,0.001013,-0.571043,-0.563529,1.204455,-0.457567,0.369362,-0.392459,0.5536,-0.006358,1.086829,0.603846,-0.732885,0.263391,0.205743,-8.26843


In [31]:
def create_folds(data, n_splits=5, random_state=None):
    """Assign a stratified K-fold index to every row of a regression dataset.

    ``StratifiedKFold`` needs discrete labels, so the continuous ``target``
    column is cut into equal-width bins (bin count from Sturges' rule) and
    those bin ids are used as the stratification labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``target`` column. The frame passed in is
        not modified; a shuffled copy is returned instead.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    random_state : int or None, default None
        Seed for the row shuffle. ``None`` keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh 0..N-1 index and an extra
        integer ``kfold`` column holding each row's validation fold.
    """
    # Shuffle the rows. `sample` returns a new frame, so the caller's object
    # is left untouched; resetting the index makes row labels equal to row
    # positions, which the `.loc` assignment in the loop below relies on.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Placeholder fold id; every row is overwritten in the loop below.
    data['kfold'] = -1

    # Sturges' rule: n_bins = 1 + log2(N), where N is the number of samples.
    n_bins = int(np.floor(1 + np.log2(data.shape[0])))
    print('Number of bins = {}'.format(n_bins))

    # Bin the targets of the *shuffled* frame. Binning any other frame (such
    # as a notebook-level global) would pair each row with the bin of an
    # unrelated row through index alignment and defeat the stratification.
    bins = pd.cut(data['target'], bins=n_bins, labels=False)

    kf = StratifiedKFold(n_splits=n_splits)

    # `split` yields positional indices; only the validation part is needed.
    for fold_, (_, validation_) in enumerate(kf.split(X=data, y=bins)):
        data.loc[validation_, 'kfold'] = fold_

    return data

In [32]:
# Work on a copy so that df itself is left unchanged.
df_2 = create_folds(df.copy())

Number of bins = 14


In [33]:
# Preview the shuffled frame; the fold id is in the last column, 'kfold'.
df_2.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,target,kfold
0,-0.893224,-0.373196,1.417868,-0.549829,-0.737362,-1.969256,-0.167549,1.059193,-1.47584,1.99905,-0.305777,1.483075,-1.56879,0.788723,-1.680943,-250.74213,0
1,-0.879132,0.742245,0.886591,-1.560476,-1.081319,0.980109,-0.160001,0.966173,0.285961,0.545167,1.113768,1.486477,1.434749,-1.045364,-0.155307,-4.03015,0
2,-0.236508,1.277915,1.490967,1.73348,-1.849484,0.780374,-1.32245,0.391116,-2.121277,0.089028,0.267048,0.730495,-0.200168,-1.366043,-0.463169,24.458284,0
3,1.281644,1.66147,-0.616361,0.020886,-0.175854,-0.103255,-0.375196,-1.600904,-0.317715,0.557691,-1.260165,1.489863,-1.643189,0.230701,-0.828497,-200.097603,0
4,0.031042,-0.456549,-0.379796,0.550247,-0.462288,-0.067089,-1.01801,0.50779,-0.059267,0.682241,0.395908,-1.162657,0.471239,-0.661309,0.47769,62.243694,0


In [34]:
# Count rows per fold to check that the folds are evenly sized.
df_2['kfold'].value_counts()

4    3000
3    3000
2    3000
1    3000
0    3000
Name: kfold, dtype: int64