# Correct way of data preparation - train_test_split
### Avoid data leakage

In [12]:
# The correct approach is to split the data first and normalize it only afterwards, before the model is fit and evaluated,
# i.e. split the data; normalize the data; fit the model; and evaluate - in that sequence.

In [13]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
# build a synthetic classification dataset: 1000 samples, 20 features
# (15 informative + 5 redundant = all 20 features); seeded for reproducibility
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)


In [15]:
# show the feature matrix (1000 samples x 20 features; numpy abbreviates the repr)
X

array([[ 0.2929949 , -4.21223056, -1.288332  , ..., -4.43170633,
        -2.82646737,  0.44916808],
       [-0.06839901,  5.51884147, 11.2389773 , ..., -3.08994781,
         1.19029898,  1.62025622],
       [ 0.73161622, -0.68468633, -0.98174194, ...,  5.65429655,
        -0.64659866, -3.15652999],
       ...,
       [ 0.81230832,  0.29333773,  3.55727154, ...,  7.52278375,
        -4.50067701, -1.92525878],
       [ 2.62760166, -1.9607565 , -7.1050466 , ...,  0.02433393,
        -0.77573778,  4.04660465],
       [-0.97292653,  0.76166769,  3.98307684, ...,  0.85864477,
         2.406057  ,  2.33338943]])

In [16]:
# show the target vector: one 0/1 class label per sample
y

array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,

In [17]:
# hold out 20% of the samples as the test set; this split happens BEFORE any
# scaling so that no statistics from the test rows can reach the training data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [18]:
# define the scaler (min-max normalization of each feature)
scaler = MinMaxScaler()


In [19]:
# fit the scaler on the training set only, so the per-feature min/max
# are learned from X_train and nothing from the test set leaks in
scaler.fit(X_train)

In [20]:
# transform the training set - scale data using the fitted scaler
# NOTE: this overwrites X_train in place, so the cell is not idempotent -
# re-running it without re-running the split would scale already-scaled data
X_train = scaler.transform(X_train)

In [21]:
# inspect the scaled training features
X_train

array([[0.66353368, 0.42313244, 0.35968791, ..., 0.46286459, 0.48202531,
        0.66073356],
       [0.24711376, 0.44748852, 0.59397114, ..., 0.49325797, 0.42027169,
        0.63381649],
       [0.29588609, 0.56051868, 0.71327116, ..., 0.53066338, 0.56930581,
        0.44319873],
       ...,
       [0.64664942, 0.24677998, 0.60883541, ..., 0.46637897, 0.31790735,
        0.41494384],
       [0.40547646, 0.61748668, 0.72368188, ..., 0.2837755 , 0.89205399,
        0.40227081],
       [0.45134526, 0.36059728, 0.26342431, ..., 0.31527085, 0.66182204,
        0.4255061 ]])

In [22]:
# scale the test set with the scaler that was fitted on the training set;
# the scaler is deliberately NOT re-fitted here, which is what avoids leakage
# NOTE: this overwrites X_test in place, so re-running the cell alone would scale it twice
X_test = scaler.transform(X_test)

In [23]:
# define the model: logistic regression with its default settings
lr_model = LogisticRegression()
# fit the model on the scaled training features and their labels
lr_model.fit(X_train, y_train)

In [24]:
# evaluate the model
eval = lr_model.predict(X_test)

In [25]:
# score model predictions with accuracy
accuracy = accuracy_score(y_test, eval)
print('Accuracy: %.2f' % (accuracy * 100))


Accuracy: 85.50
