# Data preprocessing

## Binarizer

In [1]:
from sklearn.preprocessing import Binarizer

# Single-feature column whose samples alternate between +1 and -1.
X = [[1.0], [-1.0], [1.0], [-1.0]]

# fit() hands back the estimator itself, so construction and fitting are chained.
t = Binarizer().fit(X)
# With the default settings the positive entries come out as 1 and the negative ones as 0.
t.transform(X)

array([[1.],
       [0.],
       [1.],
       [0.]])

## K-bins discretizer

In [2]:
from sklearn.preprocessing import KBinsDiscretizer

# Two mirrored features: the first rises from -2 to 2 while the second falls from 2 to -2.
X = [
    [-2.0, 2.0],
    [-1.0, 1.0],
    [0.0, 0.0],
    [1.0, -1.0],
    [2.0, -2.0],
]

# Three bins per feature with the 'uniform' strategy, reported as ordinal bin indices.
t = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform').fit(X)
t.transform(X)

array([[0., 2.],
       [0., 2.],
       [1., 1.],
       [2., 0.],
       [2., 0.]])

## Label binarizer

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Labels to encode in a later cell (note they are not in sorted order).
X = [0, 0, 2, 2, 1, 1]

t = LabelBinarizer()
# Fit on the explicit class list [0, 1, 2] rather than on X itself.
t.fit([0, 1, 2])

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [4]:
# One indicator column per fitted class (0, 1, 2); keep the result in `o`
# so it can be mapped back to labels afterwards.
o = t.transform(X)
o

array([[1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0]])

In [5]:
# Round trip: turn the indicator rows back into the original labels.
t.inverse_transform(o)

array([0, 0, 2, 2, 1, 1])

## Label encoder

In [6]:
from sklearn.preprocessing import LabelEncoder

# The three distinct labels the encoder will learn.
X = [99, 100, 200]

t = LabelEncoder()
# Last expression of the cell, so the fitted estimator's repr is displayed.
t.fit(X)

LabelEncoder()

In [7]:
# Encode a longer sequence built from the fitted labels;
# here 99 -> 0, 100 -> 1 and 200 -> 2.
o = t.transform([99, 99, 100, 200, 100, 200])
o

array([0, 0, 1, 2, 1, 2])

In [8]:
# Round trip: map the integer codes back to the original labels.
t.inverse_transform(o)

array([ 99,  99, 100, 200, 100, 200])

## Multi-label binarizer

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each sample carries a variable-length collection of labels.
X = [[3, 4], [5], [6]]

t = MultiLabelBinarizer()
# Last expression of the cell, so the fitted estimator's repr is displayed.
t.fit(X)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [10]:
# One indicator column per distinct label seen during fit (3, 4, 5, 6);
# a row can have several columns set, as the first sample does.
o = t.transform(X)
o

array([[1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

In [11]:
# Round trip: each indicator row comes back as a tuple of its labels.
t.inverse_transform(o)

[(3, 4), (5,), (6,)]

## Maximum absolute scaler

In [12]:
from sklearn.preprocessing import MaxAbsScaler

# Three samples by three features, with mixed signs.
X = [
    [0, -1, 0],
    [1, 0, -1],
    [2, 1, 2],
]

t = MaxAbsScaler()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

MaxAbsScaler(copy=True)

In [13]:
# Each column ends up divided by its largest absolute value (2, 1 and 2 here),
# so every entry lies in [-1, 1].
o = t.transform(X)
o

array([[ 0. , -1. ,  0. ],
       [ 0.5,  0. , -0.5],
       [ 1. ,  1. ,  1. ]])

In [14]:
# Round trip: undo the scaling to recover the original values (as floats).
t.inverse_transform(o)

array([[ 0., -1.,  0.],
       [ 1.,  0., -1.],
       [ 2.,  1.,  2.]])

## Min-max scaler

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Three features with different ranges: 0..2, 0..10 and -2..0.
X = [
    [0, 10, 0],
    [1, 5, -1],
    [2, 0, -2],
]

t = MinMaxScaler()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [16]:
# Every column is rescaled so its minimum maps to 0 and its maximum to 1.
o = t.transform(X)
o

array([[0. , 1. , 1. ],
       [0.5, 0.5, 0.5],
       [1. , 0. , 0. ]])

In [17]:
# Round trip: undo the scaling to recover the original values (as floats).
t.inverse_transform(o)

array([[ 0., 10.,  0.],
       [ 1.,  5., -1.],
       [ 2.,  0., -2.]])

## Normalizer

In [18]:
from sklearn.preprocessing import Normalizer

# Three samples with four features each.
X = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
]

t = Normalizer()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

Normalizer(copy=True, norm='l2')

In [19]:
# Each row (sample) is rescaled on its own; the squares of every output row sum to 1.
t.transform(X)

array([[0.18257419, 0.36514837, 0.54772256, 0.73029674],
       [0.37904902, 0.45485883, 0.53066863, 0.60647843],
       [0.42616235, 0.47351372, 0.5208651 , 0.56821647]])

## One-hot encoder

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Two categorical columns per sample: a string label and an integer code.
X = [['boy', 2], ['girl', 1], ['boy', 3], ['girl', 4]]

# NOTE(review): handle_unknown='ignore' is requested, but the cells shown only
# ever transform X itself, so that option is not exercised in this notebook.
t = OneHotEncoder(handle_unknown='ignore')
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [21]:
# transform() returns a sparse matrix here. Densify it with .toarray() so `o`
# is a plain ndarray: .todense() would produce a numpy.matrix, which recent
# scikit-learn releases refuse as estimator input, and the inverse_transform(o)
# call in the next cell would then raise a TypeError.
o = t.transform(X).toarray()
o

array([[1., 0., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.]])

In [22]:
# Round trip: recover the original (label, number) rows from the one-hot encoding.
t.inverse_transform(o)

array([['boy', 2],
       ['girl', 1],
       ['boy', 3],
       ['girl', 4]], dtype=object)

## Ordinal encoder

In [23]:
from sklearn.preprocessing import OrdinalEncoder

# Two categorical columns per sample: a string label and an integer code.
X = [
    ['boy', 2],
    ['girl', 1],
    ['boy', 3],
    ['girl', 4],
]

t = OrdinalEncoder()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [24]:
# Each column is replaced by one code per category:
# 'boy' -> 0, 'girl' -> 1 in the first column and 1..4 -> 0..3 in the second.
o = t.transform(X)
o

array([[0., 1.],
       [1., 0.],
       [0., 2.],
       [1., 3.]])

In [25]:
# Round trip: map the codes back to the original categories.
t.inverse_transform(o)

array([['boy', 2],
       ['girl', 1],
       ['boy', 3],
       ['girl', 4]], dtype=object)

## Polynomial features

In [26]:
from sklearn.preprocessing import PolynomialFeatures

# Three samples with two features (a, b) each.
X = [[0, 1], [2, 3], [4, 5]]

# Degree-2 expansion; the degree is spelled out by keyword for readability.
t = PolynomialFeatures(degree=2)
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [27]:
# For features (a, b) the output columns are 1, a, b, a^2, a*b, b^2.
t.transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

## Power transformer

In [28]:
from sklearn.preprocessing import PowerTransformer

# Three samples with two evenly spaced features.
X = [[0, 1], [2, 3], [4, 5]]

t = PowerTransformer()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [29]:
# Apply the fitted power transform column by column; keep the result in `o`
# so it can be inverted afterwards.
o = t.transform(X)
o

array([[-1.28608295, -1.26755013],
       [ 0.13363693,  0.09064754],
       [ 1.15244602,  1.17690259]])

In [30]:
# Round trip: invert the power transform to recover the original values.
t.inverse_transform(o)

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

## Robust scaler

In [31]:
from sklearn.preprocessing import RobustScaler

# Three samples by three features, with mixed signs.
X = [
    [1.0, -2.0, 2.0],
    [-2.0, 1.0, 3.0],
    [4.0, 1.0, -2.0],
]

# Centering is switched off, so the data is only rescaled, not shifted.
t = RobustScaler(with_centering=False)
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=False,
             with_scaling=True)

In [32]:
# With centering off, each column is only divided by a per-column scale
# (3, 1.5 and 2.5 here, matching each column's interquartile range).
o = t.transform(X)
o

array([[ 0.33333333, -1.33333333,  0.8       ],
       [-0.66666667,  0.66666667,  1.2       ],
       [ 1.33333333,  0.66666667, -0.8       ]])

In [33]:
# Round trip: undo the scaling to recover the original values.
t.inverse_transform(o)

array([[ 1., -2.,  2.],
       [-2.,  1.,  3.],
       [ 4.,  1., -2.]])

## Standard scaler

In [34]:
from sklearn.preprocessing import StandardScaler

# Two identical features taking the values 0, 0, 1, 1.
X = [
    [0, 0],
    [0, 0],
    [1, 1],
    [1, 1],
]

t = StandardScaler()
# Kept as the final expression so the fitted estimator's repr is shown.
t.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [35]:
# Each column is centred and scaled; the 0s become -1 and the 1s become +1.
o = t.transform(X)
o

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [36]:
# Round trip: undo the standardisation to recover the original values (as floats).
t.inverse_transform(o)

array([[0., 0.],
       [0., 0.],
       [1., 1.],
       [1., 1.]])