# Categorical Data Encoding Methods

In [34]:
import pandas as pd

# Toy dataset with two categorical columns. 'ZZZZZZ' and '000000' are
# deliberately odd values that are left out of the category lists passed to
# the encoders further down.
risk_values = ['low', 'medium', 'ZZZZZZ', 'low', 'low', 'high']
class_values = ['1st', '3rd', '2nd', '1st', '3rd', '000000']

df = pd.DataFrame({'risk': risk_values, 'class': class_values})
df

Unnamed: 0,risk,class
0,low,1st
1,medium,3rd
2,ZZZZZZ,2nd
3,low,1st
4,low,3rd
5,high,000000


## OrdinalEncoder

In [36]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: one inner list per input column, in column order
# (i.e., a list[list[str]]). A value's position in its inner list is the
# integer it is encoded as.
risk_order = ['low', 'medium', 'high']   # categories of the first feature
class_order = ['1st', '2nd', '3rd']      # categories of the second feature
categories = [risk_order, class_order]

# Values missing from the lists above are encoded as `unknown_value` (-1).
# set_output() returns the encoder itself, so it can be chained here to get a
# pandas DataFrame from transform() instead of the default NumPy array.
encoder = OrdinalEncoder(
    categories=categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
).set_output(transform='pandas')

# Because `categories` is given explicitly, the integer codes come from the
# specified order rather than from whatever values appear in the data.
# fit() returns the fitted encoder, so transform() can be chained onto it.
encoded_data = encoder.fit(df).transform(df)
encoded_data

Unnamed: 0,risk,class
0,0.0,0.0
1,1.0,2.0
2,-1.0,1.0
3,0.0,0.0
4,0.0,2.0
5,2.0,-1.0


## OneHotEncoder

In [40]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode both columns, reusing the `categories` lists so the output
# has one indicator column per listed category.
encoder = OneHotEncoder(
    categories=categories,    # fixed category lists, one per input column
    handle_unknown='ignore',  # unlisted values get all zeros for that feature
    sparse_output=False,      # dense output rather than a sparse matrix
).set_output(transform='pandas')  # return a DataFrame from transform()

# Learn the encoding and apply it to the same data in a single call
encoded_data = encoder.fit_transform(df)
encoded_data

Unnamed: 0,risk_low,risk_medium,risk_high,class_1st,class_2nd,class_3rd
0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,1.0,0.0,0.0,0.0
