In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# create sample data
df = pd.DataFrame({
    'input1': [1, 2, 3, 4, 5],
    'input2': ['A', 'A', 'B', 'B', 'C'],
    'input3': ['X', 'X', 'X', 'Y', 'Y']
})

df

Unnamed: 0,input1,input2,input3
0,1,A,X
1,2,A,X
2,3,B,X
3,4,B,Y
4,5,C,Y


In [3]:
# add columns to apply one hot encoding into a list
categorical_vars = ['input2', 'input3']

In [4]:
# enstantiate one hot encoder object and apply
one_hot_encoder = OneHotEncoder(sparse = False, drop = 'first') # returns array, drops one of the binary categorical columns to reduce multicollinearity
encoder_vars_array = one_hot_encoder.fit_transform([categorical_vars])

encoder_vars_array

array([], shape=(1, 0), dtype=float64)

In [5]:
# create object to get feature names
encoder_feature_names = one_hot_encoder.get_feature_names_out(categorical_vars)

encoder_feature_names

array([], dtype=object)

In [6]:
# create data frame with encoder variables and feature names
encoder_vars_df = pd.DataFrame(encoder_vars_array, columns = encoder_feature_names)
encoder_vars_df

0


In [7]:
# concat new dataframe to original resetting index (ensures no rows are misaligned)
df_new = pd.concat([df.reset_index (drop = True), encoder_vars_df.reset_index (drop = True)], axis = 1) # axis 1 concats columns and not rows
df_new

Unnamed: 0,input1,input2,input3
0,1,A,X
1,2,A,X
2,3,B,X
3,4,B,Y
4,5,C,Y


In [8]:
# drop original categorical columns
df_new.drop(categorical_vars, axis = 1, inplace = True) # axis 1 drops columns and not rows
df_new

Unnamed: 0,input1
0,1
1,2
2,3
3,4
4,5
