# Example of using cross validation

In [2]:
# packages
import pandas as pd

In [16]:
# load the red-wine quality dataset from the repository's data folder
wine_csv = "../../data/winequality-red.csv"
df = pd.read_csv(wine_csv)

In [19]:
# k-folds: tag every row with the index of the fold in which it is held out

from sklearn import model_selection

# new column; -1 marks rows that have not been assigned to a fold yet
df["kfold"] = -1

# randomise the rows; the fixed random_state makes the shuffle (and therefore
# the fold assignment below) identical on every Restart & Run All.
# reset_index(drop=True) renumbers the rows 0..n-1, so index labels coincide
# with the positional indices that kf.split() yields.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# initiate kfold class from model_selection module
kf = model_selection.KFold(n_splits=5)

# populate new column: each row receives the number of the fold in which it
# belongs to the validation split (the training indices are not needed here)
for fold, (trn_, val_) in enumerate(kf.split(X=df)):
    df.loc[val_, "kfold"] = fold

# sanity check: no row may be left with the -1 placeholder
assert (df["kfold"] >= 0).all(), "some rows were not assigned to a fold"

In [20]:
# preview the first five rows, including the new kfold column
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold
0,7.0,0.655,0.16,2.1,0.074,8.0,25.0,0.99606,3.37,0.55,9.7,5,0
1,7.8,0.5,0.17,1.6,0.082,21.0,102.0,0.996,3.39,0.48,9.5,5,0
2,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6,5,0
3,8.9,0.745,0.18,2.5,0.077,15.0,48.0,0.99739,3.2,0.47,9.7,6,0
4,5.1,0.585,0.0,1.7,0.044,14.0,86.0,0.99264,3.56,0.94,12.9,7,0


In [21]:
# distinct fold labels that were assigned (in order of first appearance)
df["kfold"].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [22]:
# stratified k-folds: tag every row with the fold in which it is held out,
# with the folds built from the target values

# new column; -1 marks rows that have not been assigned to a fold yet
df["skfold"] = -1

# randomise the rows; the fixed random_state makes the shuffle (and therefore
# the fold assignment below) identical on every Restart & Run All.
# reset_index(drop=True) renumbers the rows 0..n-1, so index labels coincide
# with the positional indices that kf.split() yields.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# get target values (taken after the shuffle so they line up with the rows)
y = df.quality.values

# initiate stratified kfold class from model_selection module
# (model_selection is imported in the k-folds cell above)
kf = model_selection.StratifiedKFold(n_splits=5)

# populate new column: each row receives the number of the fold in which it
# belongs to the validation split (the training indices are not needed here)
for fold, (trn_, val_) in enumerate(kf.split(X=df, y=y)):
    df.loc[val_, "skfold"] = fold

# sanity check: no row may be left with the -1 placeholder
assert (df["skfold"] >= 0).all(), "some rows were not assigned to a fold"

In [23]:
# preview the first five rows, now with both the kfold and skfold columns
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold,skfold
0,9.5,0.46,0.49,6.3,0.064,5.0,17.0,0.9988,3.21,0.73,11.0,6,2,0
1,11.0,0.2,0.48,2.0,0.343,6.0,18.0,0.9979,3.3,0.71,10.5,5,2,0
2,7.7,0.43,0.25,2.6,0.073,29.0,63.0,0.99615,3.37,0.58,10.5,6,4,0
3,6.9,0.63,0.01,2.4,0.076,14.0,39.0,0.99522,3.34,0.53,10.8,6,0,0
4,7.1,0.34,0.28,2.0,0.082,31.0,68.0,0.99694,3.45,0.48,9.4,5,1,0


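For this particular dataset, stratified k-fold is the more appropriate cross-validation approach, since the quality of wines (assuming quality is our target) is not evenly distributed: stratification keeps the proportion of each quality score roughly the same in every fold, whereas plain k-fold can leave the rare scores under-represented in, or entirely missing from, some folds.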