## Encoding Categorical columns with lots of unique values for regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
import category_encoders as ce
from category_encoders.helmert import HelmertEncoder

### Function which performs Binary Encoding
This function takes the data and the column number (the zero-based position of the column) as input. If the column is a categorical variable (pandas dtype `object`), the function returns a DataFrame containing the binary encoding of that column. Otherwise, the function returns an empty DataFrame.

In [160]:
def dataEncoderBinary(data, i):
    """Binary-encode the column at position ``i`` of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode. It is not modified.
    i : int
        Zero-based position of the column within ``data.columns``.

    Returns
    -------
    pandas.DataFrame
        When the selected column has dtype ``object``: the result of
        ``ce.BinaryEncoder(cols=[name]).fit_transform`` applied to that
        column alone. Otherwise: an empty frame whose only column is
        ``"Empty"``.
    """
    name = data.columns[i]
    # Select by position rather than by label: with duplicated column labels
    # ``data[name]`` is a DataFrame and the dtype test below would raise an
    # ambiguous-truth-value ValueError.
    column = data.iloc[:, i]

    if column.dtype == "object":
        print("Using Binary Encoding")
        print("Total no of unique column elements", column.nunique())
        # count() reports non-null entries only.
        print("Total no of column elements", column.count())
        encoder = ce.BinaryEncoder(cols=[name])
        encoded = encoder.fit_transform(column)
        # Only the encoding is returned, so no merged copy of ``data`` is
        # built here (that would duplicate the whole frame for nothing).
        print("Returning Encoding DataFrame")
        return encoded

    print("Not a Categorical Variable")
    print("Returning Empty DataFrame")
    return pd.DataFrame(columns=["Empty"])

### Function that performs Base N encoding
This function takes the data, the base and the column number (the zero-based position of the column) as input. If the column is a categorical variable (pandas dtype `object`), the function returns a DataFrame containing the Base N encoding of that column. Otherwise, the function returns an empty DataFrame.

In [161]:
def dataEncoderBaseN(data, n, i):
    """Base-N encode the column at position ``i`` of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode. It is not modified.
    n : int
        Base handed to ``ce.BaseNEncoder`` as its ``base`` argument.
    i : int
        Zero-based position of the column within ``data.columns``.

    Returns
    -------
    pandas.DataFrame
        When the selected column has dtype ``object``: the result of
        ``ce.BaseNEncoder(cols=[name], base=n).fit_transform`` applied to
        that column alone. Otherwise: an empty frame whose only column is
        ``"Empty"``.
    """
    name = data.columns[i]
    # Select by position rather than by label: with duplicated column labels
    # ``data[name]`` is a DataFrame and the dtype test below would raise an
    # ambiguous-truth-value ValueError.
    column = data.iloc[:, i]

    if column.dtype == "object":
        print("Using Base N Encoding")
        print("The Base is", n)
        print("Total no of unique column elements", column.nunique())
        # count() reports non-null entries only.
        print("Total no of column elements", column.count())
        encoder = ce.BaseNEncoder(cols=[name], base=n)
        encoded = encoder.fit_transform(column)
        # Only the encoding is returned, so no merged copy of ``data`` is
        # built here (that would duplicate the whole frame for nothing).
        print("Returning Encoding DataFrame")
        return encoded

    print("Not a Categorical Variable")
    print("Returning Empty DataFrame")
    return pd.DataFrame(columns=["Empty"])

### Testing the Functions
The sample dataset contains 5 features, of which 2 are categorical variables and 3 are non-categorical variables.

In [151]:
# Read the sample CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="mi-to-provinces-2013-11-01.csv")
# `data` is a new DataFrame object built from `df`; the cells below use `data`.
data = pd.DataFrame(data=df)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,datetime,CellID,provinceName,cell2Province,Province2cell
0,2013-11-01 00:00:00,1,MILANO,0.1894,0.0541
1,2013-11-01 00:00:00,1,PAVIA,0.0273,
2,2013-11-01 00:00:00,1,TRENTO,0.0261,
3,2013-11-01 00:00:00,2,MILANO,0.1922,0.0556
4,2013-11-01 00:00:00,2,PAVIA,0.0273,


In [152]:
# Show each column's dtype; the encoder functions in this notebook only
# encode columns whose dtype is `object`.
data.dtypes

datetime          object
CellID             int64
provinceName      object
cell2Province    float64
Province2cell    float64
dtype: object

In [153]:
# Number of missing values in each column.
print(data.isna().sum())

datetime              0
CellID                0
provinceName          0
cell2Province    899375
Province2cell    859676
dtype: int64


In [165]:
# Run the binary encoder on every column position of `data` and preview the
# first three rows of each result. The printed column number is 1-based,
# while the encoder receives the 0-based position.
for j in range(data.shape[1]):
    print("For the column number", j + 1)
    newData = dataEncoderBinary(data, j)
    print(newData.head(n=3))
    print("-------------------------------------------------------------------------------------")

For the column number 1
Using Binary Encoding
Total no of unique column elements 24
Total no of column elements 2307306
Returning Encoding DataFrame
   datetime_0  datetime_1  datetime_2  datetime_3  datetime_4  datetime_5
0           0           0           0           0           0           1
1           0           0           0           0           0           1
2           0           0           0           0           0           1
-------------------------------------------------------------------------------------
For the column number 2
Not a Categorical Variable
Returning Empty DataFrame
Empty DataFrame
Columns: [Empty]
Index: []
-------------------------------------------------------------------------------------
For the column number 3
Using Binary Encoding
Total no of unique column elements 110
Total no of column elements 2307306
Returning Encoding DataFrame
   provinceName_0  provinceName_1  provinceName_2  provinceName_3  \
0               0               0           

In [166]:
# Run the Base-N encoder (base 12) on every column position of `data` and
# preview the first three rows of each result. The printed column number is
# 1-based, while the encoder receives the 0-based position.
for j in range(data.shape[1]):
    print("For the column number", j + 1)
    newData = dataEncoderBaseN(data, 12, j)
    print(newData.head(n=3))
    print("-------------------------------------------------------------------------------------")

For the column number 1
Using Base N Encoding
The Base is 12
Total no of unique column elements 24
Total no of column elements 2307306
Returning Encoding DataFrame
   datetime_0  datetime_1  datetime_2
0           0           0           1
1           0           0           1
2           0           0           1
-------------------------------------------------------------------------------------
For the column number 2
Not a Categorical Variable
Returning Empty DataFrame
Empty DataFrame
Columns: [Empty]
Index: []
-------------------------------------------------------------------------------------
For the column number 3
Using Base N Encoding
The Base is 12
Total no of unique column elements 110
Total no of column elements 2307306
Returning Encoding DataFrame
   provinceName_0  provinceName_1  provinceName_2
0               0               0               1
1               0               0               2
2               0               0               3
---------------------------

### End of File