# OneHotEncodingTransformer
This notebook demonstrates the `OneHotEncodingTransformer` class, which implements one-hot encoding. It uses the scikit-learn `OneHotEncoder` and gives the output one-hot columns readable names. <br>

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.datasets import load_diabetes

In [2]:
import tubular
from tubular.nominal import OneHotEncodingTransformer

In [3]:
# Show the installed tubular version so readers know which release produced the outputs below.
tubular.__version__

'0.3.0'

## Load diabetes dataset from sklearn
We also create a categorical column from `bmi` and treat it as unordered for demonstration purposes in this notebook.

In [4]:
# Keep only the feature DataFrame from the loaded dataset; the target is not used in this notebook.
diabetes = load_diabetes(as_frame=True).data

In [5]:
# Bin `bmi` into four intervals to obtain a categorical column that can be one-hot encoded.
diabetes = diabetes.assign(bmi_cut=pd.cut(diabetes["bmi"], bins=4))

In [6]:
# Preview the first five rows, including the binned `bmi_cut` column.
diabetes.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,bmi_cut
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,"(0.0401, 0.105]"
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,"(-0.0905, -0.0251]"
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,"(0.0401, 0.105]"
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,"(-0.0251, 0.0401]"
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,"(-0.0905, -0.0251]"


In [7]:
# Fraction of rows falling into each `bmi_cut` level (missing values would be counted as their own level).
diabetes["bmi_cut"].value_counts(dropna=False).div(len(diabetes))

(-0.0251, 0.0401]     0.459276
(-0.0905, -0.0251]    0.332579
(0.0401, 0.105]       0.185520
(0.105, 0.171]        0.022624
Name: bmi_cut, dtype: float64

## Simple usage

### Initialising OneHotEncodingTransformer

In [8]:
# With verbose=True the transformer prints a message on init, fit and transform (see cell outputs).
ohe_1 = OneHotEncodingTransformer(
    columns="bmi_cut",
    copy=True,
    verbose=True,
)

BaseTransformer.__init__() called


### OneHotEncodingTransformer fit
The `fit` method identifies the levels that will be used to create the dummy variables in `transform`; it must be run before the `transform` method.

In [9]:
# Learn the levels of `bmi_cut`; the value returned by fit is left as the last expression so its repr is displayed.
ohe_1.fit(diabetes)

BaseTransformer.fit() called


OneHotEncodingTransformer(columns=['bmi_cut'], verbose=True)

### OneHotEncodingTransformer transform
By default the `OneHotEncodingTransformer` adds the dummy columns onto the original data. <br>
The default naming of dummy columns is `[categorical feature][separator][category level]`, for example `bmi_cut_(0.0401, 0.105]` in the output below. The separator is set with the `separator` argument when initialising the `OneHotEncodingTransformer` object. <br>

In [10]:
# Apply the fitted encoder: the result keeps `bmi_cut` and gains one dummy column per level.
diabetes_2 = ohe_1.transform(diabetes)

BaseTransformer.transform() called


In [11]:
# Preview the first five rows; the dummy columns appear after the original columns.
diabetes_2.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,bmi_cut,"bmi_cut_(-0.0905, -0.0251]","bmi_cut_(-0.0251, 0.0401]","bmi_cut_(0.0401, 0.105]","bmi_cut_(0.105, 0.171]"
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,"(0.0401, 0.105]",0.0,0.0,1.0,0.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,"(-0.0905, -0.0251]",1.0,0.0,0.0,0.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,"(0.0401, 0.105]",0.0,0.0,1.0,0.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,"(-0.0251, 0.0401]",0.0,1.0,0.0,0.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,"(-0.0905, -0.0251]",1.0,0.0,0.0,0.0


## Dropping the original one hot encoded columns
The columns specified to be one hot encoded can be dropped by using the `drop_original` argument when initialising the `OneHotEncodingTransformer` object.

In [12]:
# Same encoder configuration as before, except that the source column is removed from the output.
ohe_2 = OneHotEncodingTransformer(
    columns="bmi_cut",
    drop_original=True,
    copy=True,
    verbose=True,
)

BaseTransformer.__init__() called


In [13]:
# Fit the second encoder on the same data; the displayed repr shows drop_original=True.
ohe_2.fit(diabetes)

BaseTransformer.fit() called


OneHotEncodingTransformer(columns=['bmi_cut'], drop_original=True, verbose=True)

In [14]:
# Transform with drop_original=True: the output has the dummy columns but no `bmi_cut` column.
diabetes_3 = ohe_2.transform(diabetes)

BaseTransformer.transform() called


In [15]:
# Preview the first five rows; `bmi_cut` itself is absent, only its dummy columns remain.
diabetes_3.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,"bmi_cut_(-0.0905, -0.0251]","bmi_cut_(-0.0251, 0.0401]","bmi_cut_(0.0401, 0.105]","bmi_cut_(0.105, 0.171]"
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,0.0,0.0,1.0,0.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,1.0,0.0,0.0,0.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,0.0,0.0,1.0,0.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,0.0,1.0,0.0,0.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,1.0,0.0,0.0,0.0
