In [1]:
import os, sys
import numpy as np
import pandas as pd

In [2]:
# Small demo frame: one categorical column (Country) and two numeric columns.
countries = ['Taiwan', 'Australia', 'Ireland', 'Australia', 'Ireland', 'Taiwan']
ages = [25, 30, 45, 35, 22, 36]
salaries = [20000, 32000, 59000, 60000, 43000, 52000]

# Dict insertion order fixes the column order: Country, Age, Salary.
df_data = pd.DataFrame({'Country': countries, 'Age': ages, 'Salary': salaries})
display(df_data)

Unnamed: 0,Country,Age,Salary
0,Taiwan,25,20000
1,Australia,30,32000
2,Ireland,45,59000
3,Australia,35,60000
4,Ireland,22,43000
5,Taiwan,36,52000


In [3]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy holding only the categorical column so df_data stays untouched.
features = ['Country']
df1 = df_data[features].copy()

# Label encoding: each distinct country is replaced by an integer code.
le = LabelEncoder()
country_codes = le.fit_transform(df1['Country'])
df1['Country_code'] = country_codes
display(df1)

# Mapping label -> code, built from the classes the encoder learned.
country_dict = dict(zip(le.classes_, le.transform(le.classes_)))
display(country_dict)

# Inverse transform: recover the original labels from the integer codes.
country_labels = le.inverse_transform(df1['Country_code'])
df1['Country_inverse'] = country_labels
display(df1)

Unnamed: 0,Country,Country_code
0,Taiwan,2
1,Australia,0
2,Ireland,1
3,Australia,0
4,Ireland,1
5,Taiwan,2


{'Australia': 0, 'Ireland': 1, 'Taiwan': 2}

Unnamed: 0,Country,Country_code,Country_inverse
0,Taiwan,2,Taiwan
1,Australia,0,Australia
2,Ireland,1,Ireland
3,Australia,0,Australia
4,Ireland,1,Ireland
5,Taiwan,2,Taiwan


In [4]:
# One-hot encode the 'Country' column with scikit-learn.
# OneHotEncoder in scikit-learn < 0.20 only accepted integer input, so string
# columns had to be mapped to numeric codes first. Newer releases can encode
# string columns directly; the label-encoding step is kept here because it
# shows which integer code each OneHot_<code> column corresponds to.

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

features = ['Country']
df2 = df_data[features].copy()

# Label encoding the column. The encoder is fitted on df2 so this cell does
# not depend on frames created in other cells.
le = LabelEncoder()
le_trans = le.fit_transform(df2.Country)
print('LabelEncoder classes:', list(le.classes_))

# One Hot Encoding: the encoder expects a 2-D input, hence reshape(-1, 1);
# its sparse result is densified and cast to integer 0/1 indicators.
ohe = OneHotEncoder()
ohe_trans = ohe.fit_transform(le_trans.reshape(-1,1)).toarray().astype(int)

# One column per encoded category, named after its integer label code.
columns = ['OneHot_{}'.format(code) for code in range(ohe_trans.shape[1])]
# Keep the indicators numeric and reuse df2's index so the concat below
# aligns row-for-row whatever index df2 carries.
df_ohe = pd.DataFrame(ohe_trans, columns=columns, index=df2.index)

display(df_ohe)

res = pd.concat([df2, df_ohe], axis=1)
display(res)

LabelEncoder classes: ['Australia', 'Ireland', 'Taiwan']


Unnamed: 0,OneHot_0,OneHot_1,OneHot_2
0,0,0,1
1,1,0,0
2,0,1,0
3,1,0,0
4,0,1,0
5,0,0,1


Unnamed: 0,Country,OneHot_0,OneHot_1,OneHot_2
0,Taiwan,0,0,1
1,Australia,1,0,0
2,Ireland,0,1,0
3,Australia,1,0,0
4,Ireland,0,1,0
5,Taiwan,0,0,1


In [6]:
# get_dummies one-hot encodes every string (object/category) column by default.
# Naming the columns explicitly keeps the result stable if further string
# columns are added to the frame later.

df3 = df_data.copy()
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce booleans).
data_dum = pd.get_dummies(df3, columns=['Country'], dtype=np.uint8)
data_dum

Unnamed: 0,Age,Salary,Country_Australia,Country_Ireland,Country_Taiwan
0,25,20000,0,0,1
1,30,32000,1,0,0
2,45,59000,0,1,0
3,35,60000,1,0,0
4,22,43000,0,1,0
5,36,52000,0,0,1
