In [12]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for encoding using feature-engine
from category_encoders import HashingEncoder

In [13]:
data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [14]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data,  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 16), (207, 16))

In [15]:
X_train['A7'].unique()

array(['v', 'ff', 'h', 'dd', 'z', 'bb', 'j', 'Missing', 'n', 'o'],
      dtype=object)

In [16]:
# create the feature hashing encoder

encoder = HashingEncoder(cols=['A7'], n_components=4)

In [17]:
# fit the transformer to the train set

encoder.fit(X_train)

In [18]:
# let's inspect the hashing method

encoder.hash_method

'md5'

In [19]:
# let's transform train and test sets

X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

In [20]:
print(X_train_enc.head())

     col_0  col_1  col_2  col_3 A1     A2     A3 A4 A5  A6      A8 A9 A10  \
596      0      0      1      0  a  46.08  3.000  u  g   c   2.375  t   t   
303      0      0      1      0  a  15.92  2.875  u  g   q   0.085  f   f   
204      0      0      1      0  b  36.33  2.125  y  p   w   0.085  t   t   
351      0      1      0      0  b  22.17  0.585  y  p  ff   0.000  f   f   
118      0      0      1      0  b  57.83  7.040  u  g   m  14.000  t   t   

     A11 A12 A13    A14   A15  A16  
596    8   t   g  396.0  4159    1  
303    0   f   g  120.0     0    0  
204    1   f   g   50.0  1187    1  
351    0   f   g  100.0     0    0  
118    6   t   g  360.0  1332    1  


In [21]:
X_test_enc.head()

Unnamed: 0,col_0,col_1,col_2,col_3,A1,A2,A3,A4,A5,A6,A8,A9,A10,A11,A12,A13,A14,A15,A16
14,0,0,1,0,a,45.83,10.5,u,g,q,5.0,t,t,7,t,g,0.0,0,1
586,0,1,0,0,b,64.08,20.0,u,g,x,17.5,t,t,9,t,g,0.0,1000,1
140,0,1,0,0,a,31.25,3.75,u,g,cc,0.625,t,t,9,t,g,181.0,0,1
492,0,0,1,0,b,39.25,9.5,u,g,m,6.5,t,t,14,f,g,240.0,4607,1
350,0,1,0,0,a,26.17,2.0,u,g,j,0.0,f,f,0,t,g,276.0,1,0
