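## Стандартизация: центрируем данные вокруг нуля (среднее равно 0) и масштабируем до единичной дисперсии (стандартное отклонение равно 1). MSE между результатом StandardScaler и расчётом по формуле равна нулю, т.е. оба способа дают одинаковый результат. Нужно для kNN, KMeans и др.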

In [3]:
import numpy as np
# Toy single-feature sample with one extreme value (100), stored as an (11, 1) float64 column.
values = [1, 1, 0, -1, 2, 1, 2, 3, -2, 4, 100]
data = np.array(values, dtype=np.float64).reshape(-1, 1)

In [4]:
data

array([[  1.],
       [  1.],
       [  0.],
       [ -1.],
       [  2.],
       [  1.],
       [  2.],
       [  3.],
       [ -2.],
       [  4.],
       [100.]])

In [6]:
from sklearn.preprocessing import StandardScaler  

In [9]:
# Standardize the column with scikit-learn: fit the scaler on `data` and transform it in one call.
StandardScaler().fit_transform(data)

array([[-0.31922662],
       [-0.31922662],
       [-0.35434155],
       [-0.38945648],
       [-0.28411169],
       [-0.31922662],
       [-0.28411169],
       [-0.24899676],
       [-0.42457141],
       [-0.21388184],
       [ 3.15715128]])

## По формуле $$z = \frac{x - \mu}{\sigma}$$ получим то же самое

In [12]:
# Manual z-score: subtract the mean and divide by the population standard deviation
# (NumPy's default ddof=0). Statistics are taken per column (axis=0), which is how
# StandardScaler treats features, so the formula stays equivalent for multi-column input.
stand = (data - data.mean(axis=0)) / data.std(axis=0)

In [14]:
stand

array([[-0.31922662],
       [-0.31922662],
       [-0.35434155],
       [-0.38945648],
       [-0.28411169],
       [-0.31922662],
       [-0.28411169],
       [-0.24899676],
       [-0.42457141],
       [-0.21388184],
       [ 3.15715128]])

In [41]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [17]:
# MSE between the StandardScaler result and the manual z-score `stand`; 0.0 means they coincide.
mean_squared_error(StandardScaler().fit_transform(data), stand)

0.0

In [18]:
# NOTE: despite the name `std`, this is the RMSE (square root of the MSE) between the
# StandardScaler output and the manual z-score `stand`, not a standard deviation.
std = np.sqrt(mean_squared_error(StandardScaler().fit_transform(data), stand))

In [19]:
std

0.0

In [22]:
stand.mean() # the mean of the standardized data is zero

0.0

# Нормализация 

In [25]:
from scipy.stats import beta
from scipy.stats import shapiro


In [29]:
# Draw 1000 samples from a Beta(1, 10) distribution. No random_state is passed here, so
# unless the global NumPy seed is fixed elsewhere, the sample will differ between runs.
data = beta(1, 10).rvs(1000)


In [30]:
data

array([1.39597740e-01, 2.09419572e-01, 3.12558404e-03, 1.67209282e-01,
       1.35218304e-02, 6.72430742e-02, 1.41136954e-01, 2.73150087e-01,
       2.21746736e-03, 6.15541359e-03, 7.81935907e-03, 1.83929514e-02,
       7.40653270e-03, 1.71462895e-01, 4.27373414e-02, 3.31805726e-02,
       3.64035871e-03, 9.32137139e-03, 6.69313200e-02, 7.87637131e-02,
       1.07199654e-01, 6.29443067e-02, 8.40563781e-04, 1.38813659e-01,
       1.30823918e-02, 9.36473787e-02, 4.12472215e-02, 4.09073865e-02,
       7.87562283e-02, 1.08055635e-03, 4.62740306e-02, 1.36705955e-01,
       8.76382797e-02, 7.56809223e-02, 2.57273741e-01, 2.01233210e-02,
       1.98127797e-01, 1.95477265e-01, 5.48241053e-02, 5.18582897e-02,
       4.01260944e-02, 2.25964662e-02, 2.07145110e-01, 1.87532889e-01,
       2.86592727e-01, 1.18702657e-01, 1.76293784e-02, 1.53009616e-01,
       1.16539702e-02, 3.11719572e-02, 7.94722415e-02, 2.44760371e-02,
       8.63680833e-02, 5.00671609e-02, 3.48022378e-02, 1.33361681e-01,
      

In [31]:
len(data)

1000

In [32]:
# Turn the flat sample into a single column of shape (n, 1) before it is passed to the scalers below.
data = data.reshape(-1,1)
display(data)

array([[1.39597740e-01],
       [2.09419572e-01],
       [3.12558404e-03],
       [1.67209282e-01],
       [1.35218304e-02],
       [6.72430742e-02],
       [1.41136954e-01],
       [2.73150087e-01],
       [2.21746736e-03],
       [6.15541359e-03],
       [7.81935907e-03],
       [1.83929514e-02],
       [7.40653270e-03],
       [1.71462895e-01],
       [4.27373414e-02],
       [3.31805726e-02],
       [3.64035871e-03],
       [9.32137139e-03],
       [6.69313200e-02],
       [7.87637131e-02],
       [1.07199654e-01],
       [6.29443067e-02],
       [8.40563781e-04],
       [1.38813659e-01],
       [1.30823918e-02],
       [9.36473787e-02],
       [4.12472215e-02],
       [4.09073865e-02],
       [7.87562283e-02],
       [1.08055635e-03],
       [4.62740306e-02],
       [1.36705955e-01],
       [8.76382797e-02],
       [7.56809223e-02],
       [2.57273741e-01],
       [2.01233210e-02],
       [1.98127797e-01],
       [1.95477265e-01],
       [5.48241053e-02],
       [5.18582897e-02],


In [33]:
# Shapiro-Wilk test on the raw sample; the null hypothesis is that the data are normally distributed.
shapiro(data)

ShapiroResult(statistic=0.8758051991462708, pvalue=1.7316288426808352e-27)

In [34]:
# Same Shapiro-Wilk test after standardization, to check whether scaling changes the verdict.
shapiro(StandardScaler().fit_transform(data))

ShapiroResult(statistic=0.8758054971694946, pvalue=1.731718783609238e-27)

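# С таким p-value придётся отклонить нулевую гипотезу о нормальности данных. StandardScaler — это линейное преобразование (сдвиг и масштаб): оно не меняет форму распределения (статистика W практически не изменилась) и поэтому не делает его нормальным.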


In [35]:
from sklearn.preprocessing import MinMaxScaler

In [36]:
# Rescale the sample with scikit-learn's MinMaxScaler using its default settings.
MinMaxScaler().fit_transform(data)

array([[3.01843915e-01],
       [4.52857738e-01],
       [6.67575238e-03],
       [3.61563410e-01],
       [2.91612251e-02],
       [1.45351965e-01],
       [3.05172997e-01],
       [5.90696982e-01],
       [4.71163648e-03],
       [1.32288050e-02],
       [1.68276617e-02],
       [3.96967064e-02],
       [1.59347821e-02],
       [3.70763317e-01],
       [9.23498562e-02],
       [7.16800436e-02],
       [7.78913034e-03],
       [2.00762820e-02],
       [1.44677689e-01],
       [1.70269325e-01],
       [2.31771867e-01],
       [1.36054396e-01],
       [1.73360702e-03],
       [3.00148068e-01],
       [2.82107874e-02],
       [2.02460392e-01],
       [8.91269573e-02],
       [8.83919467e-02],
       [1.70253137e-01],
       [2.25267383e-03],
       [9.99991677e-02],
       [2.95589432e-01],
       [1.89463640e-01],
       [1.63601726e-01],
       [5.56358903e-01],
       [4.34392281e-02],
       [4.28435374e-01],
       [4.22702683e-01],
       [1.18491657e-01],
       [1.12077056e-01],


In [39]:
# Manual min-max scaling. The extrema are taken per column (axis=0), the way MinMaxScaler
# handles each feature, so the formula stays equivalent for multi-column input.
(data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))

array([[3.01843915e-01],
       [4.52857738e-01],
       [6.67575238e-03],
       [3.61563410e-01],
       [2.91612251e-02],
       [1.45351965e-01],
       [3.05172997e-01],
       [5.90696982e-01],
       [4.71163648e-03],
       [1.32288050e-02],
       [1.68276617e-02],
       [3.96967064e-02],
       [1.59347821e-02],
       [3.70763317e-01],
       [9.23498562e-02],
       [7.16800436e-02],
       [7.78913034e-03],
       [2.00762820e-02],
       [1.44677689e-01],
       [1.70269325e-01],
       [2.31771867e-01],
       [1.36054396e-01],
       [1.73360702e-03],
       [3.00148068e-01],
       [2.82107874e-02],
       [2.02460392e-01],
       [8.91269573e-02],
       [8.83919467e-02],
       [1.70253137e-01],
       [2.25267383e-03],
       [9.99991677e-02],
       [2.95589432e-01],
       [1.89463640e-01],
       [1.63601726e-01],
       [5.56358903e-01],
       [4.34392281e-02],
       [4.28435374e-01],
       [4.22702683e-01],
       [1.18491657e-01],
       [1.12077056e-01],


In [42]:
# Compare the manual min-max formula with MinMaxScaler via the mean absolute error.
# Per-column extrema (axis=0) mirror MinMaxScaler's per-feature scaling, so the check
# also holds for multi-column input; the minimum is computed once and reused.
col_min = data.min(axis=0)
col_range = data.max(axis=0) - col_min
mean_absolute_error((data - col_min) / col_range, MinMaxScaler().fit_transform(data))

1.1260035872456831e-17

In [43]:
# the difference (~1e-17) is floating-point rounding error, so the two results can be treated as equal