# 数据预处理技术

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same sample data
# (and therefore the same results in every later cell).
SEED = 42
rng = np.random.default_rng(SEED)

# 4x5 float matrix with values drawn uniformly from [0, 2).
data = 2 * rng.random((4, 5))
# 10 integers drawn uniformly from 1..9 (the upper bound 10 is exclusive).
linear = rng.integers(low=1, high=10, size=10)
print("矩阵：{}\n 数组：{}".format(data, linear))

矩阵：[[0.36061753 1.34441427 0.6076037  1.69423223 1.70551281]
 [1.90243471 1.95286415 1.47711609 0.19745964 1.60749523]
 [1.14232471 0.13773371 1.41568391 0.35927796 0.02420742]
 [1.92927371 0.06090296 0.34754033 0.10869699 0.01728238]]
 数组：[3 9 1 2 3 6 3 8 9 1]


## 1. Mean removal 均值移除

In [2]:
# Standardize both samples with preprocessing.scale, then report each result
# together with its mean and standard deviation taken along axis 0.
data_stan = preprocessing.scale(data)
linear_stan = preprocessing.scale(linear)

report_template = "数据：{}\n均值：{}\n标准差：{}"
for standardized in (data_stan, linear_stan):
    column_mean = standardized.mean(axis=0)
    column_std = standardized.std(axis=0)
    print(report_template.format(standardized, column_mean, column_std))

数据：[[-1.50969469  0.58480354 -0.71804018  1.71510757  1.05896687]
 [ 0.88245869  1.3411743   1.04374312 -0.60952333  0.9392313 ]
 [-0.29686382 -0.91523434  0.91927084 -0.35820401 -0.99486936]
 [ 0.92409981 -1.0107435  -1.24497378 -0.74738023 -1.00332881]]
均值：[ 0.00000000e+00  1.11022302e-16 -5.55111512e-17 -1.11022302e-16
 -1.11022302e-16]
标准差：[1. 1. 1. 1. 1.]
数据：[-0.49319696  1.47959089 -1.15079291 -0.82199494 -0.49319696  0.49319696
 -0.49319696  1.15079291  1.47959089 -1.15079291]
均值：0.0
标准差：1.0


## 2. 范围缩放

In [6]:
# Min-max scaling: map the values of `data` linearly into the range [0, 10].
scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))  # a sklearn.preprocessing._data.MinMaxScaler object
# Learn the min/max from `data`, then apply the scaling to the same array.
data_scaled = scaler.fit(data).transform(data)
data_scaled

array([[ 0.        ,  6.78402555,  2.30231011, 10.        , 10.        ],
       [ 9.82890453, 10.        , 10.        ,  0.55982772,  9.41940639],
       [ 4.98329202,  0.40609052,  9.45614818,  1.58041886,  0.04101956],
       [10.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

## 3. 归一化
将每个样本（每一行）的特征向量按 L1 范数归一化，使该行各分量的绝对值之和为 1。

In [7]:
# L1 normalization: rescale each row of `data` so that the absolute values
# in the row sum to 1 (axis=1 is the library default, spelled out here).
data_normalized = preprocessing.normalize(X=data, norm='l1', axis=1)
data_normalized

array([[0.06312912, 0.23535096, 0.10636611, 0.29658952, 0.29856429],
       [0.26654563, 0.27361118, 0.20695524, 0.0276656 , 0.22522235],
       [0.37097767, 0.04472995, 0.45975291, 0.11667795, 0.00786152],
       [0.78308096, 0.02472016, 0.1410646 , 0.04411947, 0.00701482]])

## 4. 二值化

In [11]:
# Binarize `data` with threshold 1: entries above the threshold become 1.0,
# the rest 0.0. The original matrix is displayed alongside for comparison.
binarizer = preprocessing.Binarizer(threshold=1)
data_bin = binarizer.transform(data)
(data_bin, data)

(array([[0., 1., 0., 1., 1.],
        [1., 1., 1., 0., 1.],
        [1., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.]]),
 array([[0.36061753, 1.34441427, 0.6076037 , 1.69423223, 1.70551281],
        [1.90243471, 1.95286415, 1.47711609, 0.19745964, 1.60749523],
        [1.14232471, 0.13773371, 1.41568391, 0.35927796, 0.02420742],
        [1.92927371, 0.06090296, 0.34754033, 0.10869699, 0.01728238]]))

## 5. One-hot编码

In [13]:
# Build a small single-column DataFrame of city names as test input for the
# encoder; 'Hangzhou' appears twice on purpose so one category repeats.
cities = [
    'Beijing',
    'Shanghai',
    'Hangzhou',
    'Ningbo',
    'Hangzhou',
]
dic = dict(city=cities)
city_df = pd.DataFrame(data=dic)
city_df

Unnamed: 0,city
0,Beijing
1,Shanghai
2,Hangzhou
3,Ningbo
4,Hangzhou


In [20]:
# One-hot encode the 'city' column.
# The column is reshaped into a single-column 2-D array before being handed to the encoder.
target_col: np.ndarray = city_df['city'].values.reshape(-1, 1)
encoder = preprocessing.OneHotEncoder().fit(target_col)
# Densify the transform result with .toarray() and store the flags as int8.
encoded_vector = encoder.transform(target_col).toarray().astype('int8')
type(encoded_vector)

numpy.ndarray