# Correct way of data preparation - cross validation
### Avoid data leakage

In [14]:
# correct data preparation for model evaluation with k-fold cross-validation
# imports
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [15]:
# Generate the synthetic classification dataset: 1000 rows and 20 features,
# of which 15 are informative and 5 are redundant. random_state is pinned so
# that re-running the cell regenerates the same data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)


_"Data preparation without data leakage when using cross-validation is slightly more challenging.
It requires that the data preparation method is prepared on the training set and applied to the
train and test sets within the cross-validation procedure, e.g. the groups of folds of rows. We
can achieve this by *defining a modeling pipeline that defines a sequence of data preparation
steps to perform* and ending in the model to fit and evaluate."_ — *Data Preparation for Machine Learning*, Jason Brownlee, p. 34


In [17]:
# Define the modeling pipeline: the scaler runs first and the classifier last.
# Because scaling is a pipeline step, cross-validation prepares it on each
# training split rather than on the full dataset, which avoids data leakage.
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)

In [18]:
# Evaluation procedure: stratified 10-fold cross-validation repeated 3 times,
# i.e. 30 train/test splits in total; random_state is pinned for the splits.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [19]:
# Display the configured cross-validation splitter as the cell output.
cv

RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

In [20]:
# Evaluate the whole pipeline (scaler + model) with cross-validation, producing
# one accuracy score per train/test split.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept
# here because later cells read this variable.
eval = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # run the splits in parallel on all available processors
)

In [21]:
# Display the array of per-split accuracy scores as the cell output.
eval

array([0.86, 0.91, 0.87, 0.81, 0.83, 0.84, 0.81, 0.84, 0.88, 0.84, 0.84,
       0.86, 0.85, 0.83, 0.89, 0.88, 0.8 , 0.97, 0.84, 0.84, 0.81, 0.88,
       0.81, 0.85, 0.89, 0.88, 0.87, 0.84, 0.84, 0.87])

In [22]:
# Report performance as "mean (standard deviation)" over all split scores,
# with both values scaled by 100 so they read as percentages.
print(
    'Accuracy: %.2f (%.2f)'
    % (mean(eval) * 100, std(eval) * 100)
)

Accuracy: 85.43 (3.47)


"Running the example normalizes the data correctly within the cross-validation folds of the
evaluation procedure to avoid data leakage."