# Data preprocessing with sklearn
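Naming convention in `sklearn.preprocessing`: a lower-case name (e.g. `scale`, `normalize`) is a one-shot scaling function, and the corresponding capitalized name (e.g. `StandardScaler`, `Normalizer`) is the estimator class exposing the `fit`/`transform` scaler API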

In [1]:
from sklearn import preprocessing 
import numpy as np

### 1. Standardization 
Scale each feature (column) to zero mean and unit variance

In [4]:
# Toy training matrix: 3 samples (rows) x 3 features (columns), float-valued.
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

In [6]:
# Function form: standardize every column of X_train to zero mean and unit
# variance in one call. Nothing is kept for re-applying the scaling later.
X_scaled = preprocessing.scale(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [7]:
# Estimator form: fit() learns the per-column mean and variance of X_train so
# the identical scaling can be re-applied to other data via transform().
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Per-column means learned from X_train by fit().
scaler.mean_

array([1.        , 0.        , 0.33333333])

In [9]:
# Per-column variances learned from X_train by fit() (population variance,
# i.e. divided by n: column 0 is (0 + 1 + 1) / 3).
scaler.var_

array([0.66666667, 0.66666667, 1.55555556])

In [10]:
# Apply the learned scaling to the training data; this reproduces the result
# of preprocessing.scale(X_train).
scaler.transform(X_train)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [11]:
# New data is scaled with the mean/variance learned from X_train, not with
# its own statistics (e.g. column 0: (-1 - 1) / sqrt(2/3)).
X_test = [[-1., 1., 0.]]
scaler.transform(X_test)

array([[-2.44948974,  1.22474487, -0.26726124]])

In [20]:
# Counter-example: a scaler fitted on the single test row sees each value as
# its own column mean, so the transformed row is all zeros. For real use,
# transform test data with the scaler that was fitted on the training data.
scaler_1 = preprocessing.StandardScaler().fit(X_test)
scaler_1.transform(X_test)

array([[0., 0., 0.]])

### 2. Scaling features to a range
Scale each feature to a fixed range using its minimum and maximum (`MinMaxScaler`), or by its maximum absolute value (`MaxAbsScaler`)

In [13]:
# Rescales each column linearly into feature_range, which defaults to (0, 1).
min_max_scaler = preprocessing.MinMaxScaler()

In [21]:
# NOTE: fit() returns the fitted scaler, not scaled data. Despite its name,
# X_train_minmax_1 is therefore a MinMaxScaler (the same object as
# min_max_scaler); call .transform() on it to obtain the scaled values.
X_train_minmax_1 = min_max_scaler.fit(X_train)
X_train_minmax_1

MinMaxScaler(copy=True, feature_range=(0, 1))

In [22]:
# Map each column onto [0, 1]: the column minimum becomes 0, the maximum 1.
X_train_minmax_1.transform(X_train)

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [23]:
# Scales each column by its largest absolute value; the data is not shifted.
max_abs_scaler = preprocessing.MaxAbsScaler()

In [25]:
# NOTE: fit() returns the fitted scaler, not scaled data. Despite its name,
# X_train_maxabs is a MaxAbsScaler (the same object as max_abs_scaler).
X_train_maxabs = max_abs_scaler.fit(X_train)
X_train_maxabs

MaxAbsScaler(copy=True)

In [26]:
# Divide each column by its largest absolute value, so results lie in [-1, 1]
# and zeros stay zero.
X_train_maxabs.transform(X_train)

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

### 3. Normalization
* Scale each sample (row) individually to unit norm
* Useful when a quadratic form such as the dot product, or any other kernel, is used to quantify the similarity of a pair of samples

In [29]:
# Function form: rescale every row (sample) of X_train to unit L2 norm.
# Normalization works per row, whereas the scalers above work per column.
X_normalized = preprocessing.normalize(X_train, norm = 'l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [30]:
# Estimator form of normalize(); the default norm is 'l2'. Each row is
# normalized independently, so fit() has no statistics to learn from X_train;
# it is called only to follow the usual fit/transform estimator API.
normalizer = preprocessing.Normalizer()
normalizer.fit(X_train)

Normalizer(copy=True, norm='l2')

In [31]:
# Same result as preprocessing.normalize(X_train, norm='l2').
normalizer.transform(X_train)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

### 4. Binarization
Threshold the data into boolean (Bernoulli-style) values according to whether each value exceeds a threshold

In [32]:
# Binarizer with the default threshold of 0.0. fit() is called only to follow
# the estimator API; the threshold is a fixed parameter, not learned.
binarize = preprocessing.Binarizer()
binarize.fit(X_train)

Binarizer(copy=True, threshold=0.0)

In [34]:
# Values strictly greater than the threshold become 1; everything else,
# including values equal to the threshold (0 here), becomes 0.
X_train_binarize = binarize.transform(X_train)
X_train_binarize

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

### 5. Encoding categorical features

#### 5.1. OrdinalEncoder 

In [35]:
# Encodes each categorical column as integer codes (stored as float64 by
# default), one output column per input column.
enco = preprocessing.OrdinalEncoder()

In [41]:
# Toy categorical dataset: one row per sample, three string-valued columns.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
    ["child", "from Asia", "use CocCoc"],
]

In [42]:
# Learn the set of categories present in each column of X.
enco.fit(X)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [43]:
# Each code is the value's position in that column's sorted category list:
# 'female' -> 1, 'from US' -> 2, 'uses Safari' -> 2.
enco.transform([['female', 'from US', 'uses Safari']])

array([[1., 2., 2.]])

In [44]:
# Encode the rows the encoder was fitted on.
enco.transform(X)

array([[2., 2., 2.],
       [1., 1., 1.],
       [0., 0., 0.]])

In [45]:
# Rank labels as a single-column 2-D list (one row per label), which is the
# shape the encoders expect: (n_samples, n_features).
label_lv = [
    ["challanger"],
    ["diamond"],
    ["platium"],
    ["silver"],
]

In [46]:
# NOTE: with the default categories='auto' the codes follow the sorted
# (alphabetical) order of the labels, which is not the order of the ranks.
# To make the codes reflect rank, pass an explicit order, e.g.
# OrdinalEncoder(categories=[[lowest, ..., highest]]).
rank_converter = preprocessing.OrdinalEncoder().fit(label_lv)

In [51]:
# 'diamond' is second in the sorted label list, hence code 1.
rank_converter.transform([['diamond']])

array([[1.]])

#### 5.2. OneHotEncoder 

In [52]:
# One-hot encoder: produces one 0/1 output column per category of each
# input column.
enc = preprocessing.OneHotEncoder()

In [53]:
# Rebind X to a smaller two-row dataset: two categories per column.
X = [
    ["male", "from US", "uses Safari"],
    ["female", "from Europe", "uses Firefox"],
]

In [59]:
# Fit and encode in one step. The encoder returns a sparse matrix, so
# .toarray() converts it to a dense array for display. Output columns are
# [female, male | from Europe, from US | uses Firefox, uses Safari].
enc.fit_transform(X).toarray()

array([[0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])