# Feature Engineering
## Categorical

In [191]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

### One Hot Encoding

In [2]:
inputs = [
    {'city': 'New York', 'temperature': 72.3},
    {'city': 'London', 'temperature': 56.3},
    {'city': 'Barcelona', 'temperature': 78.2},
    {'city': 'Austin', 'temperature': 102.3}
]

In [3]:
vec = DictVectorizer()

vec.fit_transform(inputs).toarray()

array([[  0. ,   0. ,   0. ,   1. ,  72.3],
       [  0. ,   0. ,   1. ,   0. ,  56.3],
       [  0. ,   1. ,   0. ,   0. ,  78.2],
       [  1. ,   0. ,   0. ,   0. , 102.3]])

In [4]:
vec.get_feature_names_out()

array(['city=Austin', 'city=Barcelona', 'city=London', 'city=New York',
       'temperature'], dtype=object)

In [5]:
# Add another row for Austin
vec.transform([{'city':'Austin', 'temperature': 104}]).toarray()

array([[  1.,   0.,   0.,   0., 104.]])

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
inputs_df = pd.DataFrame(inputs)

In [8]:
# It's a good idea to have a list of categories if possible:
categories = [('city',['Austin','Barcelona','London','New York', 'Tokyo'])]

In [9]:
ohe_columns = [x[0] for x in categories]
ohe_categories = [x[1] for x in categories]

In [10]:
ohe = OneHotEncoder(categories=ohe_categories, handle_unknown='ignore')

In [11]:
# In order to apply this to a dataframe we need some tooling from sklearn
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [12]:
col_transformer = make_column_transformer((ohe, ohe_columns), remainder='passthrough')


In [13]:
transformed=col_transformer.fit_transform(inputs_df)

In [14]:
transformed

array([[  0. ,   0. ,   0. ,   1. ,   0. ,  72.3],
       [  0. ,   0. ,   1. ,   0. ,   0. ,  56.3],
       [  0. ,   1. ,   0. ,   0. ,   0. ,  78.2],
       [  1. ,   0. ,   0. ,   0. ,   0. , 102.3]])

Adding an observation for a city that's there (Austin) and an unseen datapoint (SantaFe)

In [15]:
new_inputs = pd.concat((inputs_df, pd.DataFrame([{'city':'Austin', 'temperature':105}, {'city':'SantaFe', 'temperature':115}]))).reset_index(drop=True)

In [16]:
col_transformer.fit_transform(new_inputs)

array([[  0. ,   0. ,   0. ,   1. ,   0. ,  72.3],
       [  0. ,   0. ,   1. ,   0. ,   0. ,  56.3],
       [  0. ,   1. ,   0. ,   0. ,   0. ,  78.2],
       [  1. ,   0. ,   0. ,   0. ,   0. , 102.3],
       [  1. ,   0. ,   0. ,   0. ,   0. , 105. ],
       [  0. ,   0. ,   0. ,   0. ,   0. , 115. ]])

In [17]:
## You could create the inputs list:
x = [[city] for city in inputs_df.city]

In [18]:
y = OneHotEncoder().fit_transform(x).toarray()


In [19]:
y

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

#### Another Approach, Label Binarizer

In [20]:
from sklearn.preprocessing import LabelBinarizer


In [21]:
y = LabelBinarizer().fit_transform(inputs_df.city)


In [22]:
y

array([[0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0]])

Now you need to do some manual work...

#### In Pandas

In [23]:
pd.get_dummies(inputs_df, prefix='city')

Unnamed: 0,temperature,city_Austin,city_Barcelona,city_London,city_New York
0,72.3,False,False,False,True
1,56.3,False,False,True,False
2,78.2,False,True,False,False
3,102.3,True,False,False,False


In [25]:
inputs_df['city'] = inputs_df['city'].astype('category')

In [26]:
inputs_df

Unnamed: 0,city,temperature
0,New York,72.3
1,London,56.3
2,Barcelona,78.2
3,Austin,102.3


In [27]:
for column_name, category_name in categories:
    inputs_df[column_name] = inputs_df[column_name].cat.set_categories(category_name)

In [29]:
pd.get_dummies(inputs_df, prefix='city')

Unnamed: 0,temperature,city_Austin,city_Barcelona,city_London,city_New York,city_Tokyo
0,72.3,False,False,False,True,False
1,56.3,False,False,True,False,False
2,78.2,False,True,False,False,False
3,102.3,True,False,False,False,False


## Label Encoder

In [178]:
def inputs():
    inputs = [
    {'city': 'New York', 'temperature': 72.3},
    {'city': 'London', 'temperature': 56.3},
    {'city': 'Barcelona', 'temperature': 78.2},
    {'city': 'Austin', 'temperature': 102.3}
    ]
    inputs_df = pd.DataFrame(inputs)
    return inputs_df

In [192]:
from sklearn.preprocessing import LabelEncoder

In [193]:
le = LabelEncoder()

In [194]:
inputs_df = inputs()
inputs_df.city = le.fit_transform(inputs_df.city)

In [195]:
inputs_df

Unnamed: 0,city,temperature
0,3,72.3
1,2,56.3
2,1,78.2
3,0,102.3


In [196]:
inputs_df = inputs()
new_inputs = pd.concat((inputs_df, pd.DataFrame([{'city':'Austin', 'temperature':105}]))).reset_index(drop=True)

In [197]:
new_inputs.city = le.transform(new_inputs.city)

In [198]:
new_inputs

Unnamed: 0,city,temperature
0,3,72.3
1,2,56.3
2,1,78.2
3,0,102.3
4,0,105.0


Let's see how it handles unseen data:

In [199]:
inputs_df = inputs()
new_inputs = pd.concat((inputs_df, pd.DataFrame([{'city':'Austin', 'temperature':105}, {'city':'SantaFe', 'temperature':115}]))).reset_index(drop=True)

In [200]:
le.transform(new_inputs.city)

ValueError: y contains previously unseen labels: 'SantaFe'

We will write a little utility to add an "other"

In [201]:
le.classes_

array(['Austin', 'Barcelona', 'London', 'New York'], dtype=object)

In [209]:
def add_other_to_label_encoder(le:LabelEncoder)->LabelEncoder:
    import bisect
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, 'other')
    le.classes_ = np.array(le_classes)
    return le

def replace_cities_not_in_label_encoder(data:pd.Series, le:LabelEncoder)->pd.DataFrame:
    data = pd.Series(['other' if d not in le.classes_ else d for d in data])
    return data

In [203]:
le = add_other_to_label_encoder(le)

In [211]:
new_inputs.city = replace_cities_not_in_label_encoder(new_inputs.city, le)

In [220]:
new_inputs.city = le.transform(new_inputs.city)

In [221]:
new_inputs

Unnamed: 0,city,temperature
0,3,72.3
1,2,56.3
2,1,78.2
3,0,102.3
4,0,105.0
5,4,115.0


In [222]:
le.inverse_transform(new_inputs.city)

array(['New York', 'London', 'Barcelona', 'Austin', 'Austin', 'other'],
      dtype='<U9')

## Hashing Encoder

The hashing trick passes the input values through a hash to reduce the cardinality

In [288]:
from category_encoders.hashing import HashingEncoder # (pip install category_encoders)
ce_hash = HashingEncoder(cols = ['city'])

Note that here since we are using N=4, this is essentially equivalent to one hot encoding

In [289]:
inputs_df = inputs()
df2 = ce_hash.hashing_trick(inputs_df,N=4)


In [290]:
df2

Unnamed: 0,col_0,col_1,col_2,col_3
0,0,0,1,1
1,1,0,0,1
2,0,1,0,1
3,0,0,0,2


In [291]:
inputs_df = inputs()
ce_hash = HashingEncoder(cols = ['city'], n_components=8, return_df=True)
df2 = ce_hash.fit_transform(inputs_df)


  elif pd.api.types.is_categorical_dtype(cols):


In [292]:
df2

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,temperature
0,0,0,0,1,0,0,0,0,72.3
1,0,0,0,1,0,0,0,0,56.3
2,0,0,0,0,0,0,0,1,78.2
3,0,0,0,0,0,0,0,1,102.3


In [293]:
inputs_df = inputs()
new_inputs = pd.concat((inputs_df, pd.DataFrame([{'city':'Austin', 'temperature':105}, {'city':'SantaFe', 'temperature':115}]))).reset_index(drop=True)

In [294]:
ce_hash.transform(new_inputs)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,temperature
0,0,0,0,1,0,0,0,0,72.3
1,0,0,0,1,0,0,0,0,56.3
2,0,0,0,0,0,0,0,1,78.2
3,0,0,0,0,0,0,0,1,102.3
4,0,0,0,0,0,0,0,1,105.0
5,0,0,0,0,0,1,0,0,115.0


More resources:
https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087