# 知识点：pandas 的 factorize 方法——把类别值编码为整数，可作为独热编码（one-hot）的前置步骤

In [1]:
import pandas as pd
import numpy as np

# Load the training and test splits from the CSV files under ./input/.
train, test = (
    pd.read_csv('./input/train.csv'),
    pd.read_csv('./input/test.csv'),
)

In [2]:
# Split the target off the feature frame as a NumPy array.
# NOTE: pop() removes 'SalePrice' from `train` in place, so re-running this
# cell without reloading `train` first raises KeyError.
y = train.pop('SalePrice').values

In [3]:
# Peek at the extracted target values.
y

array([208500, 181500, 223500, ..., 266500, 142125, 147500])

In [4]:
# Frequency of each HouseStyle category in the training set.
train['HouseStyle'].value_counts()

1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64

In [5]:
# Selecting a single column with [] gives a Series, whose values are 1-D.
train['HouseStyle'].values.ndim

1

In [6]:
# Selecting with a list ([['HouseStyle']]) keeps a 2-D, one-column layout;
# copy() gives an array that is independent of `train`.
hs_train = train[['HouseStyle']].values.copy()

In [7]:
# Confirm the 2-D shape: one row per sample, a single column.
hs_train.shape

(1460, 1)

In [8]:
# Distinct category labels; np.unique returns them sorted.
np.unique(hs_train)

array(['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story',
       'SFoyer', 'SLvl'], dtype=object)

In [9]:
# factorize()[0] holds one integer code per row; the distinct codes show
# there is exactly one code per category.
np.unique(train['HouseStyle'].factorize()[0])

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
# factorize()[1] is the Index of unique labels: position i is the label for
# code i. Unlike np.unique, the labels are not sorted (they follow the order
# in which they first appear in the column).
train['HouseStyle'].factorize()[1]

Index(['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf',
       '2.5Fin'],
      dtype='object')

In [16]:
# The full return value is a (codes, uniques) tuple.
train['HouseStyle'].factorize()

(array([0, 1, 0, ..., 0, 1, 1]),
 Index(['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf',
        '2.5Fin'],
       dtype='object'))

参考：  
https://pandas.pydata.org/docs/reference/api/pandas.factorize.html


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Dense output so encoded rows can be inspected directly; with
# handle_unknown='ignore', values not seen during fit do not raise at
# transform time.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 —
# adjust if running on a newer version.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Integer code per row from factorize(); OneHotEncoder needs 2-D input of
# shape (n_samples, 1), hence the reshape.
hs_codes = train['HouseStyle'].factorize()[0]
hs_train_transformed = ohe.fit_transform(hs_codes.reshape(-1, 1))

In [12]:
# One row per sample, one indicator column per HouseStyle category.
hs_train_transformed.shape

(1460, 8)

In [13]:
# Spot check: the original label of row 20 ...
train['HouseStyle'][20]

'2Story'

In [14]:
# ... and its one-hot row: the 1 sits in the column of that label's
# factorize code.
hs_train_transformed[20]

array([1., 0., 0., 0., 0., 0., 0., 0.])

In [65]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# Small toy frame for comparing the different encoders: one string column
# and two integer columns.
testdata = pd.DataFrame(
    {
        'pet': ['cat', 'dog', 'dog', 'fish'],
        'age': [4, 6, 3, 3],
        'salary': [4, 5, 1, 1],
    }
)

In [66]:
# Display the toy frame.
testdata

Unnamed: 0,pet,age,salary
0,cat,4,4
1,dog,6,5
2,dog,3,1
3,fish,3,1


In [75]:
# Dense output for easy inspection.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 —
# adjust if running on a newer version.
ohe = OneHotEncoder(sparse=False)
# OneHotEncoder expects 2-D input, so select with testdata[['age']] (a
# one-column DataFrame). testdata.age is a 1-D Series and is NOT equivalent
# here; it would have to be reshaped to (n_samples, 1) first.
a1 = ohe.fit_transform(testdata[['age']])

In [76]:
# One row per sample; the three columns correspond to the distinct ages
# 3, 4 and 6 in ascending order.
a1

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [79]:
# Input values that actually received an output column.
# NOTE(review): active_features_ was deprecated in scikit-learn 0.20 and
# removed in 0.22; on newer versions inspect ohe.categories_ instead.
ohe.active_features_

array([3, 4, 6])

In [72]:
# pandas-only alternative: one indicator column per distinct age, named
# with the 'age' prefix.
pd.get_dummies(testdata['age'], prefix='age')

Unnamed: 0,age_3,age_4,age_6
0,0,1,0
1,0,0,1
2,1,0,0
3,1,0,0


`LabelEncoder` 把类别值编码为 0 ~ n_classes-1 之间的整数，编码顺序按类别排序后的顺序确定（而 `factorize` 按首次出现的顺序编码）。在 sklearn 中它主要用于编码目标变量 y。

In [85]:
# Learn the pet classes first, then map every pet to its integer code.
le = LabelEncoder().fit(testdata['pet'])
a2 = le.transform(testdata['pet'])

In [89]:
# Learned classes; the position of each class is its integer code.
le.classes_

array(['cat', 'dog', 'fish'], dtype=object)

In [86]:
# Integer-encoded pets.
a2

array([0, 1, 1, 2])

In [84]:
# For comparison: factorize() yields the same codes as LabelEncoder on this
# column, and also returns the unique labels.
testdata['pet'].factorize()

(array([0, 1, 1, 2]), Index(['cat', 'dog', 'fish'], dtype='object'))