In [52]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder

# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder

In [53]:
# let's load the data set

data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [54]:
# make a list with the categorical variables

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [55]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

 ## One Hot Encoding with pandas

In [56]:
# let's inspect the unique values of A4

X_train['A4'].unique()

array(['u', 'y', 'Missing', 'l'], dtype=object)

In [57]:
# let's one hot encode A4

tmp = pd.get_dummies(X_train['A4'], drop_first=True)

print(tmp.head())

         l      u      y
596  False   True  False
303  False   True  False
204  False  False   True
351  False  False   True
118  False   True  False


In [58]:
# now let's encode all cateogrical variables together: train set

X_train_enc = pd.get_dummies(X_train[vars_categorical], drop_first=True)

print(X_train_enc.head())

      A1_a   A1_b   A4_l   A4_u   A4_y   A5_g  A5_gg   A5_p  A6_aa   A6_c  \
596   True  False  False   True  False   True  False  False  False   True   
303   True  False  False   True  False   True  False  False  False  False   
204  False   True  False  False   True  False  False   True  False  False   
351  False   True  False  False   True  False  False   True  False  False   
118  False   True  False   True  False   True  False  False  False  False   

     ...   A7_j   A7_n   A7_o   A7_v   A7_z   A9_t  A10_t  A12_t  A13_p  A13_s  
596  ...  False  False  False   True  False   True   True   True  False  False  
303  ...  False  False  False   True  False  False  False  False  False  False  
204  ...  False  False  False   True  False   True   True  False  False  False  
351  ...  False  False  False  False  False  False  False  False  False  False  
118  ...  False  False  False   True  False   True   True   True  False  False  

[5 rows x 36 columns]


In [59]:
# and in the test set

X_test_enc = pd.get_dummies(X_test[vars_categorical], drop_first=True)

X_test_enc.head()

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,True,False,False,True,False,True,False,False,False,False,...,False,False,False,True,False,True,True,True,False,False
586,False,True,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,True,True,False,False
140,True,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,True,True,False,False
492,False,True,False,True,False,True,False,False,False,False,...,False,False,False,True,False,True,True,False,False,False
350,True,False,False,True,False,True,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False


 ## One Hot Encoding with Scikit-learn

In [60]:
# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder
# we create and train the encoder

encoder = OneHotEncoder(categories='auto',
                        drop='first',  # to return k-1, use drop=false to return k dummies
                        sparse_output=False)

In [61]:
# fit the encoder to the train set: it will learn the categories to encode

encoder.fit(X_train[vars_categorical])

In [62]:
# transform the train and test sets

X_train_enc = encoder.transform(X_train[vars_categorical])
X_test_enc = encoder.transform(X_test[vars_categorical])

In [63]:
# let's inspect the train set

print(pd.DataFrame(X_train_enc).head())

    0    1    2    3    4    5    6    7    8    9   ...   26   27   28   29  \
0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  1.0   
1  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
2  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
3  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   

    30   31   32   33   34   35  
0  0.0  1.0  1.0  1.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  1.0  1.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  1.0  1.0  1.0  0.0  0.0  

[5 rows x 36 columns]


In [64]:
# let's inspect the test set

pd.DataFrame(X_test_enc).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [65]:
X_test_enc

array([[1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

 ## One hot encoding with Feature-Engine

In [66]:
# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder
# let's create the encoder

ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=True)  # to return k-1, false to return k

In [67]:
# fit the encoder to the train set: it will learn the variables and 
# categories to encode

ohe_enc.fit(X_train)

In [68]:
# we can see which variables the encoder will encode

ohe_enc.variables

In [69]:
# let's transform train and test set

X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [70]:
# let's inspect the encoded train set

X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
596,46.08,3.0,2.375,8,396.0,4159,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
303,15.92,2.875,0.085,0,120.0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
204,36.33,2.125,0.085,1,50.0,1187,0,1,0,1,...,0,0,0,0,0,1,1,0,1,0
351,22.17,0.585,0.0,0,100.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
118,57.83,7.04,14.0,6,360.0,1332,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0


In [71]:
# let's inspect the encoded test set

X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
14,45.83,10.5,5.0,7,0.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
586,64.08,20.0,17.5,9,0.0,1000,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0
140,31.25,3.75,0.625,9,181.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
492,39.25,9.5,6.5,14,240.0,4607,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
350,26.17,2.0,0.0,0,276.0,1,1,0,1,0,...,0,0,1,0,0,0,0,1,1,0


In [72]:
X_test_enc.dtypes

A2            float64
A3            float64
A8            float64
A11             int64
A14           float64
A15             int64
A1_a            int32
A1_b            int32
A4_u            int32
A4_y            int32
A4_Missing      int32
A5_g            int32
A5_p            int32
A5_Missing      int32
A6_c            int32
A6_q            int32
A6_w            int32
A6_ff           int32
A6_m            int32
A6_i            int32
A6_e            int32
A6_cc           int32
A6_x            int32
A6_d            int32
A6_k            int32
A6_j            int32
A6_Missing      int32
A6_aa           int32
A7_v            int32
A7_ff           int32
A7_h            int32
A7_dd           int32
A7_z            int32
A7_bb           int32
A7_j            int32
A7_Missing      int32
A7_n            int32
A9_t            int32
A10_t           int32
A12_t           int32
A13_g           int32
A13_s           int32
dtype: object