# What is encoding?

Encoding involves converting categories into numeric representations.

Common techniques are one-hot encoding, ordinal encoding, and binary encoding.
One-hot encoding represents each category with its own 0/1 column, which suits nominal
(unordered) data. Ordinal encoding assigns each category a unique integer that reflects
its rank, which suits ordinal (ordered) data. Binary encoding combines the two ideas: each
category first gets an integer code, and that code is then written in binary with one
column per bit, so n categories need only about log2(n) columns instead of n.

Notebook tested in a Python 3.11.4 virtual environment (venv).

In [12]:
import pandas as pd

# Small example frame with two nominal (unordered) columns
data = dict(
    Color=['Red', 'Green', 'Blue', 'Red', 'Blue'],
    Shape=['Circle', 'Triangle', 'Square', 'Circle', 'Square'],
)
df = pd.DataFrame(data)

# Called without `columns=`, pd.get_dummies replaces each string column with
# one indicator column per category, named "<column>_<category>"
encoded_df = pd.get_dummies(df)

# Show the frame before and after encoding
print("Original DataFrame:", df, sep="\n")
print("\nEncoded DataFrame:", encoded_df, sep="\n")

Original DataFrame:
   Color     Shape
0    Red    Circle
1  Green  Triangle
2   Blue    Square
3    Red    Circle
4   Blue    Square

Encoded DataFrame:
   Color_Blue  Color_Green  Color_Red  Shape_Circle  Shape_Square  \
0       False        False       True          True         False   
1       False         True      False         False         False   
2        True        False      False         False          True   
3       False        False       True          True         False   
4        True        False      False         False          True   

   Shape_Triangle  
0           False  
1            True  
2           False  
3           False  
4           False  


In [14]:
def custom_get_dummies(df):
    """
    Custom implementation of one-hot encoding using Pandas.

    Every column whose dtype is object, string, or category is replaced by one
    0/1 integer column per distinct non-missing value, named
    "<column>_<value>". The new columns are appended after the untouched
    columns, in order of first appearance of each value. Missing values
    (None/NaN) do not get a column of their own; such rows are 0 in every
    indicator column, matching the default behaviour of pd.get_dummies.

    Parameters:
    - df (DataFrame): Input DataFrame with categorical variables.

    Returns:
    - encoded_df (DataFrame): DataFrame with one-hot encoded columns.
      The input DataFrame is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    encoded_df = df.copy()

    for column in df.columns:
        values = df[column]

        # Treat object, dedicated string, and categorical dtypes as categorical
        # data; a bare `dtype == 'object'` check silently skips the latter two.
        is_categorical = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
            or isinstance(values.dtype, pd.CategoricalDtype)
        )
        if not is_categorical:
            continue

        # dropna() keeps missing values from becoming a spurious all-zero
        # "<column>_nan" / "<column>_None" indicator; unique() preserves the
        # order of first appearance.
        for category in values.dropna().unique():
            # fillna(False) covers nullable dtypes, where comparing a missing
            # value yields <NA> rather than False.
            indicator = values.eq(category).fillna(False).astype(int)
            encoded_df[f"{column}_{category}"] = indicator

        # Remove the original categorical column now that it is encoded
        encoded_df = encoded_df.drop(columns=[column])

    return encoded_df

# Run custom_get_dummies on the same two-column sample used for pd.get_dummies
data = dict(
    Color=['Red', 'Green', 'Blue', 'Red', 'Blue'],
    Shape=['Circle', 'Triangle', 'Square', 'Circle', 'Square'],
)
df = pd.DataFrame(data)

# Encode the sample and show the resulting indicator columns
encoded_df = custom_get_dummies(df)
print(encoded_df)


   Color_Red  Color_Green  Color_Blue  Shape_Circle  Shape_Triangle  \
0          1            0           0             1               0   
1          0            1           0             0               1   
2          0            0           1             0               0   
3          1            0           0             1               0   
4          0            0           1             0               0   

   Shape_Square  
0             0  
1             0  
2             1  
3             0  
4             1  


In [13]:
# For one-hot encoding we can also use the scikit-learn library.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook.
%pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [9]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Single-column example frame
data = dict(Color=['Red', 'Green', 'Blue', 'Red', 'Blue'])
df = pd.DataFrame(data)

# Learn the categories of the Color column, then encode it; the result is
# converted to a dense array so it can be wrapped in a DataFrame
encoder = OneHotEncoder().fit(df[['Color']])
encoded_data = encoder.transform(df[['Color']]).toarray()

# Label the columns with the names generated by the encoder
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['Color']),
)

# Show the frame before and after encoding
print("Original DataFrame:", df, sep="\n")
print("\nEncoded DataFrame:", encoded_df, sep="\n")

Original DataFrame:
   Color
0    Red
1  Green
2   Blue
3    Red
4   Blue

Encoded DataFrame:
   Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         0.0          1.0        0.0
2         1.0          0.0        0.0
3         0.0          0.0        1.0
4         1.0          0.0        0.0


# Binary encoder

Install the `category_encoders` library if you haven't already:

In [10]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook.
%pip install category_encoders


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [11]:
import pandas as pd
import category_encoders as ce

# Single-column example frame
data = dict(Color=['Red', 'Green', 'Blue', 'Red', 'Blue'])
df = pd.DataFrame(data)

# Binary-encode the Color column.
# For this data the result is Red -> 01, Green -> 10, Blue -> 11: the
# categories are numbered 1, 2, 3 in order of first appearance, and each
# number is written in binary across the Color_0 / Color_1 columns.
encoder = ce.BinaryEncoder(cols=['Color'])
encoded_data = encoder.fit_transform(df)

# Show the frame before and after encoding
print("Original DataFrame:", df, sep="\n")
print("\nEncoded DataFrame:", encoded_data, sep="\n")

Original DataFrame:
   Color
0    Red
1  Green
2   Blue
3    Red
4   Blue

Encoded DataFrame:
   Color_0  Color_1
0        0        1
1        1        0
2        1        1
3        0        1
4        1        1
