# Data Leakage and its Avoidance

If our workflow is:
1. Prepare the data set (eliminate redundancy, impute, scale, ...)
2. Split the data set
3. Evaluate the model

then information from the test set leaks into the preparation step and, through it, into the model that is fitted on the training data.

To avoid this the workflow should be:
1. Split the data set
2. **Fit the preparation steps on the training data set only**
3. **Apply the same fitted preparation to both the training and test data sets**
4. Evaluate the model


In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

### A synthetic data set

In [2]:
# Synthetic classification data: 1000 samples with 20 features, of which
# 15 are informative and 5 are redundant. The fixed seed keeps it reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)
X.shape, y.shape

((1000, 20), (1000,))

### Workflow with data leakage

In [3]:
scaler = MinMaxScaler()

# !!! Leak: the scaler is fitted on the full data set BEFORE the split, so the
# rows that later end up in the test set take part in determining the scaling
# applied to the training rows.
# The scaled copy is stored under its own name so that the raw X is not
# overwritten and cells that run afterwards still start from unscaled data.
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.33, random_state=1
)

model = LogisticRegression()
model.fit(X_train, y_train)

yhat = model.predict(X_test)

accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % accuracy)


Accuracy: 0.848


### Correct workflow avoiding data leakage

In [4]:
# Regenerate the raw data so that this cell does not depend on the state left
# behind by earlier cells (X may already have been rescaled there).
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, n_redundant=5,
                           random_state=7)

# split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# fit the scaler on the training rows only, then scale them
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# scale the test data set with the scaler that was fitted on the training rows
X_test = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train, y_train)

yhat = model.predict(X_test)

accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % accuracy)

Accuracy: 0.855


### Data Leakage with k-fold CV

In [5]:
# Start again from freshly generated, unscaled data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# Deliberate leak: the scaler is fitted on every row, including the rows
# that each cross-validation fold later holds out for testing.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# 10 stratified folds, repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model = LogisticRegression()

scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)

print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.853 (0.036)


In [None]:
# Show the individual accuracy scores and how many of them there are
# (the CV object used for them was built with n_splits=10 and n_repeats=3).
scores, len(scores)

(array([0.86, 0.91, 0.88, 0.81, 0.83, 0.84, 0.81, 0.84, 0.88, 0.84, 0.84,
        0.86, 0.85, 0.83, 0.89, 0.87, 0.79, 0.97, 0.84, 0.84, 0.81, 0.88,
        0.8 , 0.85, 0.89, 0.88, 0.87, 0.83, 0.83, 0.87]),
 30)

### Use a Pipeline to avoid data leakage during k-fold CV

In [6]:
from sklearn.pipeline import Pipeline

# Start again from freshly generated, unscaled data.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# Stratified folds keep the class proportions of y roughly the same in the
# training and test part of every split; the 10-fold split is repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Scaler and model are bundled into one estimator, so cross_val_score fits
# the scaler as part of fitting the pipeline on each fold's training rows
# instead of on the whole data set.
pipe = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('model', LogisticRegression()),
    ]
)

scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.854 (0.035)
