# Data Leakage

In [None]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

## Data Preparation With Train and Test Sets

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

**Train-Test Evaluation With Naive Data Preparation**

In [None]:
# Synthetic classification data: 1000 samples, 20 features
# (15 informative, 5 redundant), seeded for reproducibility.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

In [None]:
# Min-max normalize the whole dataset (MinMaxScaler rescales, it does not
# standardize). This is the "naive" preparation: the scaler is fitted on
# every row before the train/test split happens.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Hold out 33% of the (already scaled) rows for testing; fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# Fit a logistic regression classifier on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out test split.
yhat = model.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)

In [None]:
# Report accuracy as a percentage with three decimals. The format spec is
# ` .3f` (space sign, three decimals), the same spec used by the other
# accuracy print in this notebook; the stray zero-pad flag had no effect
# without a width, so the printed text is unchanged.
print(f'{accuracy*100: .3f}%')

**Train-Test Evaluation With Correct Data Preparation**

In [None]:
# Regenerate the same synthetic dataset (same parameters and seed) so this
# section starts from raw, unscaled features.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

In [None]:
# Split first, while the data is still unscaled: 33% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# Create an unfitted min-max scaler; it is fitted in a later cell.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training split only, so the test rows do not
# contribute to the fitted scaling parameters.
scaler.fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit a logistic regression classifier on the scaled training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the scaled test split.
yhat = model.predict(X_test)

In [None]:
# Score the predictions against the true test labels and print the
# accuracy as a percentage with three decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print(f'{accuracy*100: .3f}%')

## Data Preparation With k-fold Cross-Validation

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

**Cross-Validation Evaluation With Naive Data Preparation**

In [None]:
# Synthetic classification data: 1000 samples, 20 features
# (15 informative, 5 redundant), seeded for reproducibility.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

In [None]:
# Create an unfitted min-max scaler.
scaler = MinMaxScaler()

In [None]:
# Naive preparation: fit the scaler on the full dataset and scale it in one
# step, before any cross-validation folds are formed.
X = scaler.fit_transform(X)

In [None]:
# Unfitted logistic regression estimator to be evaluated by cross-validation.
model = LogisticRegression()

In [None]:
# Evaluation procedure: stratified k-fold with 10 splits, repeated 3 times,
# with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [None]:
# Cross-validated accuracy of the bare model on the pre-scaled data; X was
# scaled as a whole earlier, so scaling is not redone per fold.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
)

In [None]:
# Print mean accuracy and its standard deviation across the CV scores, both
# as percentages with three decimals.
print(f'{scores.mean()*100: .3f} ({scores.std()*100: .3f})')

 85.300 ( 3.607)


**Cross-Validation Evaluation With Correct Data Preparation**

In [None]:
# Regenerate the same synthetic dataset (same parameters and seed) so this
# section starts from raw, unscaled features.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

In [None]:
# Pipeline stages as (name, estimator) pairs, in execution order: the
# min-max scaler comes first, followed by the logistic regression model.
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]

In [None]:
# Chain the scaler and the model into a single estimator object.
pipeline = Pipeline(steps=steps)

In [None]:
# Evaluation procedure: stratified k-fold with 10 splits, repeated 3 times,
# with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [None]:
# Cross-validated accuracy of the whole pipeline on the raw data; the
# scaler is part of the estimator being evaluated rather than applied to X
# up front.
scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
)

In [None]:
# Print mean accuracy and its standard deviation across the CV scores, both
# as percentages with three decimals.
print(f'{scores.mean()*100: .3f} ({scores.std()*100:.3f})')

 85.433 (3.471)
