# Standard Scaling
Standard scaling rescales each column of the data to have a mean of 0 and a standard deviation of 1. This is done by subtracting the column mean from each value and then dividing by the column standard deviation.

In [1]:
# The function scale provides a quick and easy way to perform this operation on a single array-like dataset:

from sklearn import preprocessing
import numpy as np

# Toy training matrix: three samples (rows) by three features (columns),
# laid out the way a small dataframe would be displayed.

X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# axis=0 aggregates down the rows, giving one statistic per column.
column_means = X_train.mean(axis=0)
column_stds = X_train.std(axis=0)

print(X_train)
print(column_means)
print(column_stds)

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]
[1.         0.         0.33333333]
[0.81649658 0.81649658 1.24721913]


In [2]:
# Standardize every column in one call, then print the scaled matrix followed
# by its per-column mean and per-column standard deviation as a check.
X_scaled = preprocessing.scale(X_train)
for report in (X_scaled, X_scaled.mean(axis=0), X_scaled.std(axis=0)):
    print(report)

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]
[0. 0. 0.]
[1. 1. 1.]


# StandardScaler()
The preprocessing module further provides a utility class StandardScaler that implements the Transformer API to compute the mean and standard deviation on a training set so as to be able to later reapply the same transformation on the testing set. This class is hence suitable for use in the early steps of a sklearn Pipeline:

In [3]:
# Here we fit the standard scaler to the X_train data using fit(); this is
# where the per-column mean and standard deviation are learned.
scaler = preprocessing.StandardScaler().fit(X_train)

# Here we apply the fitted standard scaler to the X_train data using transform().
# transform() returns the scaled array, so the result is kept in a variable
# instead of being computed and thrown away.
X_train_scaled = scaler.transform(X_train)

# Print the means per column that fit() learned
print(scaler.mean_)

[1.         0.         0.33333333]


In [4]:
# The fitted scaler instance can then be used on new data to transform it the
# same way it transformed the training set.
# Note that the new data is scaled with the statistics learned from the
# training data, not with statistics computed from the new data itself.
#
# It is also possible to disable either centering or scaling by passing
# with_mean=False or with_std=False to the constructor of StandardScaler.

X_test = [[-1., 1., 0.]]
scaler.transform(X_test)  # scale X_test with the fitted scaler before running a model on it

array([[-2.44948974,  1.22474487, -0.26726124]])

In [5]:
# Use transformed data with a model:
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the Boston Housing data as a DataFrame
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(url)

# 'medv' is the target column; every other column is a feature
y = data['medv']
X = data.drop(columns='medv')

# No random_state is passed, so the split (and the score below) changes from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Learn the scaling from the training rows only, then apply it to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training data and score on the scaled test data
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

0.7380965524334823

# MinMaxScaler()
An alternative standardization is scaling features to lie between a given minimum and maximum value, often between zero and one, or so that the maximum absolute value of each feature is scaled to unit size. This can be achieved using MinMaxScaler or MaxAbsScaler, respectively.

The formula for a min-max transformation is as follows:

For each value in a column of X, subtract the minimum value of the column and divide the result by the range of the column (max value minus min value).
`(Xi - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))`


In [6]:
# Example: rescale each column of a small data matrix to the [0, 1] range.

X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# fit_transform() fits the scaler and applies the scaling in a single call
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [7]:
# And again we can then use the fitted scaler to transform new data.
# Note once more that the new data is scaled with the min and max learned from
# the training data, so values that fall outside the training range are mapped
# outside [0, 1] (see the first and last entries of the result).

X_test = np.array([[ -3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [8]:
# Use transformed data with a model:
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the Boston Housing data as a DataFrame
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(url)

# 'medv' is the target column; every other column is a feature
y = data['medv']
X = data.drop(columns='medv')

# No random_state is passed, so the split (and the score below) changes from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Learn each column's min and max from the training rows only, then scale both splits
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training data and score on the scaled test data
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

0.6893994143151715

# MaxAbsScaler()
MaxAbsScaler scales such that the training data lies within the range [-1, 1] by dividing each feature by its largest absolute value. It is meant for data that is already centered at zero.



In [9]:
# Use the preprocessing.MaxAbsScaler() class from sklearn

from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the Boston Housing data as a DataFrame
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(url)

# 'medv' is the target column; every other column is a feature
y = data['medv']
X = data.drop(columns='medv')

# No random_state is passed, so the split (and the score below) changes from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the max-abs scaler on the training rows only, then scale both splits
scaler = preprocessing.MaxAbsScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training data and score on the scaled test data
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

0.7774841640004668

# Using scikit-learn Pipelines

In [10]:
# Here is the code for a ridge model with the standard transformation...

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# A fixed random_state makes this split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Manual workflow: fit the scaler on the training rows, then scale both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training data and score on the scaled test data
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

0.6345884564889055

In [11]:
# Here is the much cleaner pipeline version:

from sklearn.pipeline import make_pipeline

# Scaling and the Ridge model are bundled into one estimator: fit() and score()
# are called on the unscaled X_train / X_test and the pipeline handles the rest.
pipe = make_pipeline(StandardScaler(), Ridge()).fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6345884564889055

# Pipelines and GridSearchCV


In [12]:
# We need to pay attention to names of pipeline steps when we use GridSearchCV

from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor

# Build a two-step pipeline and list its (name, estimator) pairs; the step
# names are what a GridSearchCV parameter grid has to refer to.
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
print(knn_pipe.steps) # step names are the quoted strings (i.e. 'standardscaler' and 'kneighborsregressor')

[('standardscaler', StandardScaler()), ('kneighborsregressor', KNeighborsRegressor())]


In [13]:
from sklearn.model_selection import GridSearchCV

# Fresh pipeline for the search (same construction as in the previous cell)
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Parameter-grid keys take the form '<step name>__<argument name>', i.e. the
# step name, two underscores, then the argument of that step to vary.
param_grid = {'kneighborsregressor__n_neighbors': range(1, 10)}

# 10-fold cross-validated search over n_neighbors = 1..9
grid = GridSearchCV(estimator=knn_pipe, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.score(X_test, y_test))

{'kneighborsregressor__n_neighbors': 7}
0.5999825126971097


# Pipeline Steps, One-Hot Encoding, & Column Transformer

In [14]:
# Load the sample data
import pandas as pd
# Titanic passenger data, read directly from GitHub into a DataFrame (needs network access)
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

In [15]:
# Subset and split the data
from sklearn.model_selection import train_test_split

# Keep only the columns used for modelling; 'Survived' is the label
selected_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Survived']
data = data[selected_columns]

# Features are everything except the label. No random_state is set, so the
# rows that end up in each split differ from run to run.
features = data.drop(columns='Survived')
labels = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(features, labels)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
866,2,female,27.0,13.8583,C
460,1,male,48.0,26.55,S
658,2,male,23.0,13.0,S
49,3,female,18.0,17.8,S
543,2,male,32.0,26.0,S


In [16]:
# Preprocess data using sklearn's Column Transformer approach
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Columns are routed by type: numeric columns go through one pipeline,
# categorical columns through another.
numeric_features = ['Age', 'Fare']
categorical_features = ['Embarked', 'Sex', 'Pclass']

# Numeric branch: fill missing values with the column median, then standardize.
# Each step is a (name, transformer) pair, e.g. 'imputer' is the name of the first step.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# ColumnTransformer applies each branch to its own list of columns.
# It is fitted on the training split right away.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
).fit(X_train)

In [17]:
# Turn it into a function that can transform other data using this fit

def preprocessor(data):
    """Transform ``data`` with the module-level, already-fitted ``preprocess`` object.

    Only ``transform`` is called here, so nothing is re-fitted: whatever
    ``preprocess`` learned when it was fitted is reused for ``data``.

    Returns the output of ``preprocess.transform(data)`` unchanged.
    """
    return preprocess.transform(data)

# You can and should also save your preprocessors just as you'd save your model and data to ensure consistency & reproducibility.