# 類別變數編碼

## 測試資料

In [1]:
import pandas as pd

# Sample table: color, size, price and class label for three items.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## LabelEncoder

In [2]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the 'size' strings. The codes follow the sorted order of the
# distinct values (L=0, M=1, XL=2), not the real-world size order.
encoder = LabelEncoder()
encoder.fit(df['size'])
encoder.transform(df['size'])



array([1, 0, 2])

In [3]:
# Map the integer codes back to the original size labels.
encoder.inverse_transform([1, 0, 2])

array(['M', 'L', 'XL'], dtype=object)

## Pandas Map

In [4]:
# Hand-written mapping from each size label to an integer code
# (M=1, L=2, XL=3).
size_mapping = {'M': 1, 'L': 2, 'XL': 3}

# Replace the size labels with their codes; labels missing from the mapping
# would become NaN.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


## OrdinalEncoder

In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Encode every column independently as 0..n-1 codes, assigned in sorted
# category order (Female=0, Male=1; 1=0, 2=1, 3=2).
data = [['Male', 1], ['Female', 3], ['Female', 2]]
encoder = OrdinalEncoder().fit(data)
encoder.transform(data)

array([[1., 0.],
       [0., 2.],
       [0., 1.]])

## One-hot Encoding with Pandas

In [6]:
# Rebuild the sample table (an earlier cell replaced 'size' with integers).
df = pd.DataFrame(
    [['green', 'M', 10.1, 'class1'],
     ['red', 'L', 13.5, 'class2'],
     ['blue', 'XL', 15.3, 'class1']],
    columns=['color', 'size', 'price', 'classlabel'],
)

# One-hot encode 'color' into is_blue / is_green / is_red columns.
# dtype=int pins the 0/1 output: pandas >= 2.0 defaults to bool (True/False).
pd.get_dummies(df, columns=["color"], prefix='is', prefix_sep='_', dtype=int)

Unnamed: 0,size,price,classlabel,is_blue,is_green,is_red
0,M,10.1,class1,0,1,0
1,L,13.5,class2,0,0,1
2,XL,15.3,class1,1,0,0


In [9]:
# pd.from_dummies requires pandas 1.5 or later.
df2 = pd.get_dummies(df, columns=["color"], prefix='is', prefix_sep='_')

# Invert the one-hot columns: the text before "_" becomes the output column
# name ('is') and the text after it becomes the category value.
pd.from_dummies(
    df2.loc[:, ['is_blue', 'is_green', 'is_red']],
    sep="_",
)

Unnamed: 0,is
0,green
1,red
2,blue


## One-hot Encoding with Scikit-learn

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Sample data
X = [['Male', 1], ['Female', 3], ['Female', 2]]

# Fit, then transform. handle_unknown='ignore' means categories not seen
# during fit do not raise an error at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X)
X_new = encoder.transform(X)

# Convert the encoded result to a dense array for display.
X_new.toarray()

array([[0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [14]:
# Categories learned during fit, one array per input column
encoder.categories_

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [15]:
# Recover the original rows from the one-hot encoded matrix
encoder.inverse_transform(X_new)

array([['Male', 1],
       ['Female', 3],
       ['Female', 2]], dtype=object)

In [16]:
# Build the output column names, supplying custom names for the input columns
encoder.get_feature_names_out(['gender', 'group'])

array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
      dtype=object)

## 完整的表格處理程序

In [27]:
# Fresh copy of the sample table for the end-to-end walkthrough.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [28]:
# One-hot encode the 'color' column. The double brackets select a
# one-column DataFrame (2-D) rather than a Series.
encoder = OneHotEncoder(handle_unknown='ignore')
color_new = encoder.fit_transform(df[['color']])

# Output column names (color_blue, color_green, color_red). With no argument
# the input names recorded during fit (feature_names_in_) are used as prefixes.
column_names = encoder.get_feature_names_out()

# Convert the encoded result to a dense array. Reuse df's index so that the
# later pd.concat(axis=1), which aligns rows by index label, cannot misalign
# rows or introduce NaN when df does not carry the default 0..n-1 index.
df_new = pd.DataFrame(
    color_new.toarray(),
    columns=column_names,
    index=df.index,
)
df_new

Unnamed: 0,color_blue,color_green,color_red
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0


In [29]:
# Remove the original 'color' column (mutates df in place)
df.drop(columns=['color'], inplace=True)

# Join the remaining columns and the one-hot columns side by side
df2 = pd.concat([df, df_new], axis='columns')
df2

Unnamed: 0,size,price,classlabel,color_blue,color_green,color_red
0,M,10.1,class1,0.0,1.0,0.0
1,L,13.5,class2,0.0,0.0,1.0
2,XL,15.3,class1,1.0,0.0,0.0


In [30]:
# Persist the fitted encoder to 'color.joblib' so it can be reused later
# (presumably reloaded with joblib.load; no load call is visible here).
import joblib

joblib.dump(encoder, 'color.joblib')

['color.joblib']