# Robust Scaling

## Scaling without being affected by outliers
1. Subtract the median from each value.
2. Divide the result by the interquartile range (IQR = Q3 − Q1).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

In [13]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [5]:
# Two features on very different scales, written one feature per row.
feature_rows = [
    [1, 2, 3, 3, 2, 1, 4, 5, 6, 5, 5, 3, 4],
    [100, 200, 400, 200, 250, 350, 600, 350, 700, 650, 400, 470, 500],
]
# Transpose so that each row is one sample and each column one feature.
data = np.array(feature_rows).T
data

array([[  1, 100],
       [  2, 200],
       [  3, 400],
       [  3, 200],
       [  2, 250],
       [  1, 350],
       [  4, 600],
       [  5, 350],
       [  6, 700],
       [  5, 650],
       [  5, 400],
       [  3, 470],
       [  4, 500]])

In [98]:
# Fit scikit-learn's RobustScaler on `data`, then transform that same array.
robust_scaler = RobustScaler()
scaled_info = robust_scaler.fit(data).transform(data)
# Optional check, left disabled: per-column IQR of the scaled values.
#   np.quantile(scaled_info, 3 / 4, axis=0)
#   - np.quantile(scaled_info, 1 / 4, axis=0)
# Display the per-column maximum, the per-column minimum and the scaled array.
np.max(scaled_info, axis=0), np.min(scaled_info, axis=0), scaled_info

(array([1. , 1.2]),
 array([-0.66666667, -1.2       ]),
 array([[-0.66666667, -1.2       ],
        [-0.33333333, -0.8       ],
        [ 0.        ,  0.        ],
        [ 0.        , -0.8       ],
        [-0.33333333, -0.6       ],
        [-0.66666667, -0.2       ],
        [ 0.33333333,  0.8       ],
        [ 0.66666667, -0.2       ],
        [ 1.        ,  1.2       ],
        [ 0.66666667,  1.        ],
        [ 0.66666667,  0.        ],
        [ 0.        ,  0.28      ],
        [ 0.33333333,  0.4       ]]))

In [71]:
# Per-column first quartile, median and third quartile of the raw data,
# computed in a single call and unpacked into one array per statistic.
q_1, med, q_3 = np.quantile(data, [0.25, 0.5, 0.75], axis=0)
q_1, med, q_3

(array([  2., 250.]), array([  3., 400.]), array([  5., 500.]))

In [130]:
# Manual robust scaling, one feature column at a time: centre each column on
# its median, then divide by its interquartile range (Q3 - Q1).
# Uses `data`, `q_1`, `med` and `q_3` computed in the earlier cells.
new_data = np.empty(data.shape)  # float buffer; every column is filled below
for i in range(data.shape[-1]):
    iqr = q_3[i] - q_1[i]
    print(iqr)
    new_data[..., i] = (data[..., i] - med[i]) / iqr
print(new_data)

3.0
250.0
[[-0.66666667 -1.2       ]
 [-0.33333333 -0.8       ]
 [ 0.          0.        ]
 [ 0.         -0.8       ]
 [-0.33333333 -0.6       ]
 [-0.66666667 -0.2       ]
 [ 0.33333333  0.8       ]
 [ 0.66666667 -0.2       ]
 [ 1.          1.2       ]
 [ 0.66666667  1.        ]
 [ 0.66666667  0.        ]
 [ 0.          0.28      ]
 [ 0.33333333  0.4       ]]
