In [1]:
from sklearn import preprocessing
import numpy as np
# Small 3x3 float matrix used by the scaling, normalization and binarization
# examples below (rows are samples, columns are features).
X = np.array([
    [1., -2., 2.],
    [3., 0., 0.],
    [0., 1., -1.],
])

In [2]:
# Standardize the data: rescale each feature (column) to mean 0 and variance 1.
X_scaled = preprocessing.scale(X)
X_scaled

array([[-0.26726124, -1.33630621,  1.33630621],
       [ 1.33630621,  0.26726124, -0.26726124],
       [-1.06904497,  1.06904497, -1.06904497]])

In [3]:
# Per-column means of the standardized data; expected to be 0 up to
# floating-point rounding error.
np.mean(X_scaled, axis=0)

array([7.40148683e-17, 0.00000000e+00, 0.00000000e+00])

In [4]:
# Per-column standard deviations of the standardized data; expected to be 1.
np.std(X_scaled, axis=0)

array([1., 1., 1.])

In [5]:
# Normalization: rescale each sample (row) so that it has unit norm.
X_normalized_l1 = preprocessing.normalize(X, norm='l1') # L1 (Manhattan) norm: absolute values in each row sum to 1
X_normalized_l1

array([[ 0.2, -0.4,  0.4],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  0.5, -0.5]])

In [6]:
X_normalized_l2 = preprocessing.normalize(X, norm='l2') # L2 (Euclidean) norm: each row has Euclidean length 1
X_normalized_l2

array([[ 0.33333333, -0.66666667,  0.66666667],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [7]:
# Rescale each feature (column) into a fixed range; with no arguments
# MinMaxScaler maps the column minimum to 0 and the column maximum to 1.
min_max_scaler = preprocessing.MinMaxScaler()
X_min_max = min_max_scaler.fit_transform(X)
X_min_max

array([[0.33333333, 0.        , 1.        ],
       [1.        , 0.66666667, 0.33333333],
       [0.        , 1.        , 0.        ]])

In [8]:
# Same min-max rescaling, but into the custom range [-10, 10].
# Note: this rebinds min_max_scaler, replacing the default-range scaler.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-10, 10))
X_min_max2 = min_max_scaler.fit_transform(X)
X_min_max2

array([[ -3.33333333, -10.        ,  10.        ],
       [ 10.        ,   3.33333333,  -3.33333333],
       [-10.        ,  10.        , -10.        ]])

In [9]:
# Feature binarization: display the input matrix first so it can be compared
# with the binarized result.
X

array([[ 1., -2.,  2.],
       [ 3.,  0.,  0.],
       [ 0.,  1., -1.]])

In [10]:
# Entries above the 0.5 threshold become 1; all other entries become 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
X_binarized = binarizer.transform(X)
X_binarized

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [11]:
# Handling missing data.
# NOTE: this rebinds X to a new 5x3 matrix containing NaN entries; the 3x3
# matrix used by the earlier cells is no longer reachable under this name.
from numpy import nan
X = np.array([
    [nan, 0., 3.],
    [2., 9., -8.],
    [1., nan, 1.],
    [5., 2., 4.],
    [7., 6., -3.],
])

In [13]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. It is imported under
# the old name so any other cell that still calls Imputer(...) keeps working.
from sklearn.impute import SimpleImputer as Imputer
# Replace each NaN with the mean of the observed values in its column.
imp = Imputer(strategy='mean')
X2 = imp.fit_transform(X)
X2



array([[ 3.75,  0.  ,  3.  ],
       [ 2.  ,  9.  , -8.  ],
       [ 1.  ,  4.25,  1.  ],
       [ 5.  ,  2.  ,  4.  ],
       [ 7.  ,  6.  , -3.  ]])

In [14]:
# Sanity check: the value imputed at X2[0, 0] should equal the mean of the
# observed entries in column 0. nanmean skips NaN wherever it occurs, so the
# check does not depend on the missing value sitting in the first row.
np.nanmean(X[:, 0]), X2[0, 0]

(3.75, 3.75)

In [15]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; import its
# replacement here (under the old name) so this cell runs on current releases
# regardless of what earlier cells imported.
from sklearn.impute import SimpleImputer as Imputer
# Replace each NaN with the median of the observed values in its column.
imp = Imputer(strategy='median')
X3 = imp.fit_transform(X)
X3



array([[ 3.5,  0. ,  3. ],
       [ 2. ,  9. , -8. ],
       [ 1. ,  4. ,  1. ],
       [ 5. ,  2. ,  4. ],
       [ 7. ,  6. , -3. ]])

In [16]:
# Sanity check: the value imputed at X3[0, 0] should equal the median of the
# observed entries in column 0. nanmedian skips NaN wherever it occurs, so the
# check does not depend on the missing value sitting in the first row.
np.nanmedian(X[:, 0]), X3[0, 0]

(3.5, 3.5)