# Hold-out based cross-validation

## Import libraries

In [1]:
import pandas as pd
from sklearn.datasets import make_regression

## Create the dataset

+ n_samples  = 1000
+ n_features = 10
+ n_targets  = 1

In [2]:
# Generate a synthetic regression problem; the fixed seed keeps the data
# identical from one run to the next.
features, target = make_regression(
    n_samples=1000,
    n_features=10,
    n_targets=1,
    random_state=42,
)
features.shape, target.shape

((1000, 10), (1000,))

In [3]:
# Wrap the feature matrix in a DataFrame, one column per feature (f_0, f_1, ...),
# then append the regression target as the last column.
df = pd.DataFrame(
    features,
    columns=[f'f_{i}' for i in range(features.shape[1])],
)
df['target'] = target
df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target
0,1.317115,0.546734,-0.999302,-0.607822,-0.022868,1.296995,0.84062,-2.121855,-0.118069,-0.504775,90.396848
1,-0.093636,0.551741,-1.519346,-1.397118,1.038379,-0.583599,-0.451159,-1.287164,1.325797,-2.832156,-276.089545
2,-0.025027,-0.186971,-1.065114,0.55781,-1.311836,0.010353,-0.609512,1.390208,0.817766,-0.305225,-74.400308
3,-1.09862,0.072279,0.595491,0.221558,-1.031955,1.234752,-0.132169,-0.113481,1.420504,-0.589895,24.512224
4,0.175211,-1.130888,0.420094,-0.31353,0.482688,0.921802,2.056544,0.367482,2.985259,0.606851,126.362711


## Divide the dataset into the training set and the validation set

Let the ratio of the training set to the validation set be `7:3`.

In [4]:
# Shuffle the rows before splitting: sample(frac=1) draws every row exactly once
# in random order. The fixed random_state makes the shuffle (and therefore the
# train/validation split) reproducible after a kernel restart.
# reset_index(drop=True) renumbers the shuffled rows 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)
df.head()

(1000, 11)


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target
0,-0.445795,-0.770814,-0.371462,0.243891,-0.392726,-1.192973,-0.980947,0.525937,-0.503722,-1.775982,-318.991749
1,-0.641081,0.910418,-1.216897,0.599578,0.415827,1.618788,-0.605156,1.026895,0.659054,1.363377,209.092608
2,-0.093636,0.551741,-1.519346,-1.397118,1.038379,-0.583599,-0.451159,-1.287164,1.325797,-2.832156,-276.089545
3,-0.916192,0.472002,-0.580053,-0.71576,1.487246,0.682052,0.499685,-0.067178,-0.831822,0.239405,51.430211
4,-0.531455,-1.299216,0.640543,2.511557,-0.032281,-1.840078,-0.113128,0.34671,-1.567859,0.123078,-186.125171


In [5]:
# Sizes of the two partitions for a 7:3 train/validation split.
# Integer arithmetic keeps train_len an exact int: `0.7 * total_len` would be a
# float (700.0 here) and can pick up rounding error for other dataset sizes.
total_len = df.shape[0]
train_len = (total_len * 7) // 10
val_len = total_len - train_len

In [6]:
# Split by position rather than by index label, so the result does not depend
# on the frame carrying a fresh 0..n-1 index or on train_len being an int.
split_at = int(train_len)
train_set = df.iloc[:split_at]
val_set = df.iloc[split_at:]

# The two partitions together must cover every row exactly once.
assert len(train_set) + len(val_set) == len(df)

train_set.shape, val_set.shape

((700, 11), (300, 11))

In [7]:
# The data is now partitioned 7:3 into training and validation sets.
# Show the final rows of the training partition.
train_set.tail(n=5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target
695,0.243657,-1.126054,0.494064,-0.969744,-0.626315,0.110535,-0.856852,-0.98554,-1.090208,-2.854627,-325.953752
696,0.756488,-0.279507,1.320414,-0.803948,-0.247752,0.307346,0.627606,0.659451,-1.91406,-0.874436,-5.861291
697,-0.317103,-0.380695,-0.98644,1.390644,0.103382,0.470014,2.15635,0.571224,-0.803434,0.516825,66.782452
698,-0.74049,0.231607,2.084713,0.298452,0.707032,-0.782599,-0.339759,0.151403,0.864824,-1.007912,-50.939252
699,-0.519396,-0.979721,0.078577,0.077156,-0.165631,-0.593811,0.458168,0.428317,-0.340908,-2.128734,-270.847516


In [8]:
# Show the first rows of the validation partition; its index continues from
# where the training partition's index ends.
val_set.head(n=5)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,target
700,0.537768,-1.170281,1.152648,0.518793,0.171469,0.863528,0.46795,-0.998307,0.306389,-1.217404,-59.040321
701,0.874517,0.181427,-0.769996,-1.042044,-0.351921,-0.487203,-0.456121,-1.203201,-0.649765,-1.296117,-150.423147
702,-1.918771,0.751933,-0.034712,2.463242,0.301547,-0.192361,1.142823,0.06023,-0.026514,-1.168678,-46.073872
703,-0.188202,0.037697,-1.270952,0.970316,0.220584,0.41876,0.898669,-0.213875,-0.103587,-0.37858,-15.960283
704,-1.242247,-0.557131,2.571995,-0.067871,0.739675,0.431632,0.608736,-1.09622,-1.125587,-2.249432,-138.413221
