<a href="https://colab.research.google.com/github/manjunath-hanmantgad/python-development/blob/master/Encoding_Categorical_Variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. ordinal categorical variables (like grades of student A , B , C )
2. Nominal categorical variables dont follow any order. They are simply variables. like city names , country names etc.
3. Since our algos understand numbers and not strings so we need to convert these variables into numbers -- this is called as "categorical encoding."

**Types of categorical encoding**

1. create binary variables with one hot encoding
2. one hot encoding for frequent categories.
3. replace the categories with count or frequency of obserbvations.
4. replace categories with ordinal numbers (Ordinal)
5. Ordinal encoding based on target value
6. implement target mean encoding ( target median ?)
7. group rare or infrequent categories
8. perform binary encoding

In [46]:
# need category_encoders library as well
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
import random
import pandas as pd
import numpy as np

data = pd.read_csv('/content/crx.data')
data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [48]:
# create variables for columns
varnames = [f"A{s}" for s in range(1,17)]

# add these variables to columns

data.columns = varnames 
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [49]:
# replace question marks with NaN values.
data = data.replace("?", np.nan)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
1,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
2,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
4,b,32.08,4.000,u,g,m,v,2.50,t,f,0,t,g,00360,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
686,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [50]:
# cast variables as float
data["A2"] = data["A2"].astype("float")
data["A14"] = data["A14"].astype("float")

# encode target variable as binary 
data["A16"] = data["A16"].map({"+":1, "-":0})

# rename traget variable 
data.rename(columns={"A16":"target"}, inplace=True)
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,1


In [51]:
# list containing categorical and numerical variables

cat_cols = [
    c for c in data.columns if data[c].dtypes=="O"
]

num_cols = [
    c for c in data.columns if data[c].dtypes!= "O"
]

cat_cols
#num_cols

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [52]:
num_cols

['A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'target']

In [53]:
# fill missing data

data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna("Missing")

# now save the prepared data

data.to_csv("prepared_data.csv", index=False)

In [54]:
# now I think this function will be needed everytime!

### One hot encoding

In [55]:
# a variable with M unique values we can create encoding for M-1 variables.
# meaning if Gender is considered then M = 2 ( male female other)


In [56]:
# read in new prepared data

data = pd.read_csv('/content/prepared_data.csv')

# separate into train and test 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(
    data.drop(labels=["target"], axis=1),
    data["target"],
    test_size=0.25,
    random_state=42
)

print(f"shape of X_train:, {len(X_train)}")

shape of X_train:, 516


In [57]:
# inspect missing values 
X_train["A5"].unique()

array(['g', 'p', 'Missing', 'gg'], dtype=object)

In [58]:
# applying get_dummies to train and test data 

X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_test_enc = pd.get_dummies(X_test, drop_first=True)

X_train_enc.head()


Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
645,20.83,8.5,0.165,0,0.0,351,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
163,60.58,16.5,11.0,0,21.0,10561,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0
380,22.67,7.0,0.165,0,160.0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
602,20.08,0.25,0.125,0,200.0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
431,21.83,1.54,0.085,0,356.0,0,0,1,0,1,...,0,0,0,1,0,0,0,1,0,0


In [59]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop="first", sparse=False) # transformer will return numpy array
# rather than a sparse matrix , but why ?

In [60]:
# list of variable names 

vars_categorical = X_train.select_dtypes(include="O").columns.to_list()

# fit the encoder 
encoder.fit(X_train[vars_categorical])

# inspect the categories 

encoder.categories_

[array(['Missing', 'a', 'b'], dtype=object),
 array(['Missing', 'l', 'u', 'y'], dtype=object),
 array(['Missing', 'g', 'gg', 'p'], dtype=object),
 array(['Missing', 'aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm',
        'q', 'r', 'w', 'x'], dtype=object),
 array(['Missing', 'bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'],
       dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [61]:
X_train_enc = encoder.transform(

    X_train[vars_categorical])

X_test_enc = encoder.transform(

    X_test[vars_categorical])

In [62]:
# extract names of variables 

encoder.get_feature_names_out()

array(['A1_a', 'A1_b', 'A4_l', 'A4_u', 'A4_y', 'A5_g', 'A5_gg', 'A5_p',
       'A6_aa', 'A6_c', 'A6_cc', 'A6_d', 'A6_e', 'A6_ff', 'A6_i', 'A6_j',
       'A6_k', 'A6_m', 'A6_q', 'A6_r', 'A6_w', 'A6_x', 'A7_bb', 'A7_dd',
       'A7_ff', 'A7_h', 'A7_j', 'A7_n', 'A7_o', 'A7_v', 'A7_z', 'A9_t',
       'A10_t', 'A12_t', 'A13_p', 'A13_s'], dtype=object)

### One hot encoding for frequent categories

In [63]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,1


In [64]:
!pip install feature_engine
from feature_engine.encoding import OneHotEncoder 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [65]:
# The most frequent categories need to be determined in the train set. This is to avoid data leakage.

X_train["A7"].unique() 

array(['v', 'h', 'ff', 'z', 'bb', 'Missing', 'dd', 'j', 'n', 'o'],
      dtype=object)

In [66]:
# count number of observations, sort them and list out top 5

X_train["A7"].value_counts().sort_values(ascending=False).head(5)

v          294
h          107
ff          43
bb          40
Missing      9
Name: A7, dtype: int64

In [67]:
top_5 = [
    x for x in X_train["A7"].value_counts().sort_values(ascending=False).head(5).index
]

# add binary variable per top category to train and test sets

for label in top_5:
  X_train[f"A7_{label}"] = np.where(X_train["A7"]==label,1,0)
  X_test[f"A7_{label}"] = np.where(X_test["A7"]==label,1,0)

# display top 10 rows of original and encoded variable A7 in training set

X_train[["A7"] + [f"A7_{label}" for label in top_5]].head(50)

Unnamed: 0,A7,A7_v,A7_h,A7_ff,A7_bb,A7_Missing
645,v,1,0,0,0,0
163,v,1,0,0,0,0
380,v,1,0,0,0,0
602,v,1,0,0,0,0
431,v,1,0,0,0,0
340,v,1,0,0,0,0
275,h,0,1,0,0,0
429,ff,0,0,1,0,0
104,z,0,0,0,0,0
114,v,1,0,0,0,0


In [68]:
# set up one hot encoder to encode top 5 most frequent categories 
onehot_enc = OneHotEncoder(top_categories=5,variables=["A7","A8"]
                           )
onehot_enc_1 = OneHotEncoder(top_categories=5,variables=["A7"]
                           )

In [69]:
# fit the encoder

#onehot_enc.fit(X_train)
onehot_enc_1.fit(X_train) # since A7 is only encoded

# encode train and test sets 

# X_train_enc = onehot_enc.transform(X_train)
# X_test_enc = onehot_enc.transform(X_test)

X_train_enc = onehot_enc_1.transform(X_train)
X_test_enc = onehot_enc_1.transform(X_test)

X_train.head()
print("*****************************")
X_test.head()

*****************************


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A7_v,A7_h,A7_ff,A7_bb,A7_Missing
286,b,29.5,0.58,u,g,w,v,0.29,f,t,1,f,g,340.0,2803,1,0,0,0,0
655,b,25.67,3.25,u,g,c,h,2.29,f,t,1,t,g,416.0,21,0,1,0,0,0
257,a,20.75,9.54,u,g,i,v,0.04,f,f,0,f,g,200.0,1000,1,0,0,0,0
336,a,34.83,1.25,y,p,i,h,0.5,f,f,0,t,g,160.0,0,0,1,0,0,0
318,b,36.75,0.125,y,p,c,v,1.5,f,f,0,t,g,232.0,113,1,0,0,0,0


### replace the categories with count or frequency of obserbvations.

In [70]:
# if 50 out of 100 observations show gender as female then we will replace female 
# with 50 or 0.5 (for frequency encoding)

# count of A7 to dict 

counts=X_train["A7"].value_counts().to_dict()
counts

{'v': 294,
 'h': 107,
 'ff': 43,
 'bb': 40,
 'Missing': 9,
 'j': 8,
 'z': 6,
 'dd': 4,
 'n': 3,
 'o': 2}

In [71]:
# repalce categories with their counts 

X_train["A7"] = X_train["A7"].map(counts)
X_test["A7"] = X_test["A7"].map(counts)

X_train.head(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A7_v,A7_h,A7_ff,A7_bb,A7_Missing
645,a,20.83,8.5,u,g,c,294,0.165,f,f,0,f,g,0.0,351,1,0,0,0,0
163,a,60.58,16.5,u,g,q,294,11.0,t,f,0,t,g,21.0,10561,1,0,0,0,0
380,a,22.67,7.0,u,g,c,294,0.165,f,f,0,f,g,160.0,0,1,0,0,0,0
602,b,20.08,0.25,u,g,q,294,0.125,f,f,0,f,g,200.0,0,1,0,0,0,0
431,b,21.83,1.54,u,g,k,294,0.085,f,f,0,t,g,356.0,0,1,0,0,0,0


In [72]:
X_test.head(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A7_v,A7_h,A7_ff,A7_bb,A7_Missing
286,b,29.5,0.58,u,g,w,294,0.29,f,t,1,f,g,340.0,2803,1,0,0,0,0
655,b,25.67,3.25,u,g,c,107,2.29,f,t,1,t,g,416.0,21,0,1,0,0,0
257,a,20.75,9.54,u,g,i,294,0.04,f,f,0,f,g,200.0,1000,1,0,0,0,0
336,a,34.83,1.25,y,p,i,107,0.5,f,f,0,t,g,160.0,0,0,1,0,0,0
318,b,36.75,0.125,y,p,c,294,1.5,f,f,0,t,g,232.0,113,1,0,0,0,0


### replace categories with ordinal numbers (Ordinal)

In [73]:
# encode A5 variable 

ordinal_mapping = {k: i for i,k in enumerate(
    X_train["A5"].unique(),0
)}
ordinal_mapping

{'g': 0, 'p': 1, 'Missing': 2, 'gg': 3}

In [74]:
# replace categories with numbers

X_train["A5"] = X_train["A5"].map(ordinal_mapping)
X_test["A5"] = X_test["A5"].map(ordinal_mapping)

In [75]:
# perform ordinal encoding 

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# initialse the encoder 

enc = OrdinalEncoder()

# but ordinal encoder will encode complete dataset
# so we use columntransofmer to encode only selection of variables 

vars_categorical = X_train.select_dtypes(include="O").columns.to_list()

# list containing remaining variables 

vars_remainder = X_train.select_dtypes(exclude="O").columns.to_list()

# columntransformer to encode categorical variables

ct = ColumnTransformer([("encoder",enc,vars_categorical)],
                       remainder="passthrough")

# fit the encoder to train data 

ct.fit(X_train)

# encode categorical variable in train and test sets

X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [76]:
#X_train_enc.head() # so this is ndarray
# and we need to convert to df to get more info

X_train_enc = pd.DataFrame(X_train_enc, columns=vars_categorical+vars_remainder)
X_test_enc = pd.DataFrame(X_test_enc, columns=vars_categorical+vars_remainder)

X_train_enc.head(10)

Unnamed: 0,A1,A4,A6,A9,A10,A12,A13,A2,A3,A5,A7,A8,A11,A14,A15,A7_v,A7_h,A7_ff,A7_bb,A7_Missing
0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,20.83,8.5,0.0,294.0,0.165,0.0,0.0,351.0,1.0,0.0,0.0,0.0,0.0
1,1.0,2.0,11.0,1.0,0.0,1.0,0.0,60.58,16.5,0.0,294.0,11.0,0.0,21.0,10561.0,1.0,0.0,0.0,0.0,0.0
2,1.0,2.0,2.0,0.0,0.0,0.0,0.0,22.67,7.0,0.0,294.0,0.165,0.0,160.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.0,2.0,11.0,0.0,0.0,0.0,0.0,20.08,0.25,0.0,294.0,0.125,0.0,200.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2.0,2.0,9.0,0.0,0.0,1.0,0.0,21.83,1.54,0.0,294.0,0.085,0.0,356.0,0.0,1.0,0.0,0.0,0.0,0.0
5,2.0,2.0,1.0,0.0,0.0,0.0,0.0,42.75,4.085,0.0,294.0,0.04,0.0,108.0,100.0,1.0,0.0,0.0,0.0,0.0
6,1.0,2.0,7.0,0.0,0.0,0.0,0.0,19.17,5.415,0.0,107.0,0.29,0.0,80.0,484.0,0.0,1.0,0.0,0.0,0.0
7,2.0,3.0,6.0,0.0,0.0,0.0,0.0,51.83,3.0,1.0,43.0,1.5,0.0,180.0,4.0,0.0,0.0,1.0,0.0,0.0
8,2.0,2.0,5.0,1.0,1.0,0.0,0.0,54.83,15.5,0.0,6.0,0.0,20.0,152.0,130.0,0.0,0.0,0.0,0.0,0.0
9,1.0,2.0,11.0,1.0,1.0,0.0,0.0,25.42,1.125,0.0,294.0,1.29,2.0,200.0,0.0,1.0,0.0,0.0,0.0,0.0


### Target mean encoding

In [77]:
# find mean target value per category of a variable and store in dict

mapping = y_train.groupby(X_train["A10"]).mean().to_dict()
mapping

{'f': 0.25, 't': 0.7122641509433962}

In [78]:
# replace categories with mean mapping
X_train["A10"] = X_train["A10"].map(mapping)
X_test["A10"] = X_test["A10"].map(mapping)

In [79]:
# perform target encoding

from category_encoders import TargetEncoder
encoder = TargetEncoder()

In [80]:
encoder.fit(X_train,y_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

print(X_train_enc)

           A1     A2      A3        A4  A5        A6   A7      A8        A9  \
645  0.464968  20.83   8.500  0.498701   0  0.392170  294   0.165  0.074803   
163  0.464968  60.58  16.500  0.498701   0  0.672224  294  11.000  0.793893   
380  0.464968  22.67   7.000  0.498701   0  0.392170  294   0.165  0.074803   
602  0.436782  20.08   0.250  0.498701   0  0.672224  294   0.125  0.074803   
431  0.436782  21.83   1.540  0.498701   0  0.300346  294   0.085  0.074803   
..        ...    ...     ...       ...  ..       ...  ...     ...       ...   
71   0.464968  38.58   5.000  0.498701   0  0.614259  294  13.500  0.793893   
106  0.464968  25.00  11.000  0.241942   1  0.382739  294   4.500  0.793893   
270  0.436782  32.33   2.500  0.498701   0  0.392170  294   1.250  0.074803   
435  0.436782  19.58   0.585  0.498701   0  0.156641   43   0.000  0.074803   
102  0.436782  25.00  12.000  0.498701   0  0.300346  294   2.250  0.793893   

          A10  A11       A12       A13    A14    A1

In [81]:
# adding smoothing
encoder_1 = TargetEncoder(min_samples_leaf=25,smoothing=1.0)
encoder_1.fit(X_train,y_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

print(X_train_enc)

           A1     A2      A3        A4  A5        A6   A7      A8        A9  \
645  0.464968  20.83   8.500  0.498701   0  0.392170  294   0.165  0.074803   
163  0.464968  60.58  16.500  0.498701   0  0.672224  294  11.000  0.793893   
380  0.464968  22.67   7.000  0.498701   0  0.392170  294   0.165  0.074803   
602  0.436782  20.08   0.250  0.498701   0  0.672224  294   0.125  0.074803   
431  0.436782  21.83   1.540  0.498701   0  0.300346  294   0.085  0.074803   
..        ...    ...     ...       ...  ..       ...  ...     ...       ...   
71   0.464968  38.58   5.000  0.498701   0  0.614259  294  13.500  0.793893   
106  0.464968  25.00  11.000  0.241942   1  0.382739  294   4.500  0.793893   
270  0.436782  32.33   2.500  0.498701   0  0.392170  294   1.250  0.074803   
435  0.436782  19.58   0.585  0.498701   0  0.156641   43   0.000  0.074803   
102  0.436782  25.00  12.000  0.498701   0  0.300346  294   2.250  0.793893   

          A10  A11       A12       A13    A14    A1