In [18]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Sample feature matrix used by every numeric demo below: 3 rows x 4 columns.
data = np.array([
    [3, -1.5, 2, -5.4],
    [0, 4, -0.3, 2.1],
    [1, 3.3, -1.9, -4.3],
])

# Mean removal: standardize every column to zero mean and unit variance.
data_standardized = preprocessing.scale(data)
print("\nMean =", np.mean(data_standardized, axis=0))
print("Std deviation =", np.std(data_standardized, axis=0))

# Min-max scaling: linearly rescale every column into the range [0, 1].
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit(data).transform(data)
print("\nMin max scaled data:\n", data_scaled)

# L1 normalization: divide each row by the sum of the absolute values of its entries.
data_normalized = preprocessing.normalize(data, norm='l1')
print("\nL1 normalized data:\n", data_normalized)

# L2 normalization: divide each row by the square root of the sum of its squared entries.
# NOTE: this rebinds `data_normalized`, so the L1 result above is no longer reachable.
data_normalized = preprocessing.normalize(data, norm='l2')
print("\nL2 normalized data:\n", data_normalized)

# Binarization: values strictly above the threshold become 1, all others become 0.
binarizer = preprocessing.Binarizer(threshold=1.4)
data_binarized = binarizer.transform(data)
print("\nBinarized data:\n", data_binarized)

# One-hot encoding of categorical features; every column is encoded independently.
e_data = np.array([
    [0, 2, 1, 12],
    [1, 3, 5, 3],
    [2, 3, 2, 12],
    [1, 2, 4, 3],
])
encoder = preprocessing.OneHotEncoder().fit(e_data)
print("\nNew Encoded vector:\n", encoder.transform(e_data).toarray())

# Encode one new sample with the categories learned from e_data.
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print("\nEncoded vector:\n", encoded_vector)



Mean = [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]
Std deviation = [1. 1. 1. 1.]

Min max scaled data:
 [[1.         0.         1.         0.        ]
 [0.         1.         0.41025641 1.        ]
 [0.33333333 0.87272727 0.         0.14666667]]

L1 normalized data:
 [[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]

L2 normalized data:
 [[ 0.45017448 -0.22508724  0.30011632 -0.81031406]
 [ 0.          0.88345221 -0.06625892  0.46381241]
 [ 0.17152381  0.56602858 -0.32589524 -0.73755239]]

Binarized data:
 [[1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [0. 1. 0. 0.]]

New Encoded vector:
 [[1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
 [0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.]]

Encoded vector:
 [[0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]


If you previously used a LabelEncoder to convert the categories to integers before passing them to OneHotEncoder, that step is no longer needed: OneHotEncoder can now be applied to the categories directly.


In [19]:
# Learn an integer code for each distinct class name.
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder = preprocessing.LabelEncoder().fit(input_classes)

# Show the learned mapping; a class's code is its position in `classes_`.
print("\nClass mapping:")
for index, class_name in enumerate(label_encoder.classes_):
    print(class_name, '-->', index)

# Forward direction: class names -> integer codes.
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("\nLabels =", labels)
print("Encoded labels =", list(encoded_labels))

# Reverse direction: integer codes -> class names.
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("\nEncoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))


Class mapping:
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3

Labels = ['toyota', 'ford', 'audi']
Encoded labels = [3, 2, 0]

Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']


In [13]:
# Small frame with one integer column and one string (categorical) column.
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
print(demo_df)

# get_dummies encodes string columns automatically and leaves integer columns unchanged.
print(pd.get_dummies(demo_df))
# Passing `columns=` selects exactly which columns to encode; integer columns may be listed too.
print(pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature']))

# Alternative to `columns=`: cast the integer column to str so that get_dummies
# encodes it by default, i.e. demo_df['Integer Feature'].astype(str).

# Ask OneHotEncoder for a dense numpy array instead of a sparse matrix
# (a sparse matrix stores only the non-zero entries as index/value pairs).
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old spelling was removed in 1.4, so try the new name first and fall
# back to the old one on releases that do not know it yet.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)
print(ohe.fit_transform(demo_df))

# Default behaviour: the encoder returns a sparse matrix.
ohe = preprocessing.OneHotEncoder()
print(demo_df)
ohe.fit(demo_df)
print(ohe.transform(demo_df))  # printed as (row, column) index plus value
print(ohe.transform(demo_df).toarray())  # densified; same result as requesting dense output above
print(ohe.fit_transform(demo_df))

   Integer Feature Categorical Feature
0                0               socks
1                1                 fox
2                2               socks
3                1                 box
   Integer Feature  Categorical Feature_box  Categorical Feature_fox  \
0                0                        0                        0   
1                1                        0                        1   
2                2                        0                        0   
3                1                        1                        0   

   Categorical Feature_socks  
0                          1  
1                          0  
2                          1  
3                          0  
   Integer Feature_0  Integer Feature_1  Integer Feature_2  \
0                  1                  0                  0   
1                  0                  1                  0   
2                  0                  0                  1   
3                  0                  1  

In [23]:
# Input with 3 samples and 2 features: [[0, 1], [2, 3], [4, 5]].
X = np.arange(6).reshape(3, 2)

# Full degree-2 expansion.
poly = preprocessing.PolynomialFeatures(degree=2)
print(poly.fit_transform(X))
# Expected output:
# array([[ 1.,  0.,  1.,  0.,  0.,  1.],
#        [ 1.,  2.,  3.,  4.,  6.,  9.],
#        [ 1.,  4.,  5., 16., 20., 25.]])

# interaction_only=True leaves out the features that are a single input
# variable raised to the power n (n > 1); only interaction products remain.
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
print(poly.fit_transform(X))
# Expected output:
# array([[ 1.,  0.,  1.,  0.],
#        [ 1.,  2.,  3.,  6.],
#        [ 1.,  4.,  5., 20.]])

# Additionally setting include_bias=False drops the bias term
# (the leftmost column that is all ones).
poly = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
print(poly.fit_transform(X))
# Expected output:
# array([[ 0.,  1.,  0.],
#        [ 2.,  3.,  6.],
#        [ 4.,  5., 20.]])

[[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]
[[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]
[[ 0.  1.  0.]
 [ 2.  3.  6.]
 [ 4.  5. 20.]]
