# Ordinal encoding 

**Definition**
  * Replacing categories with integers from 1 to n (or 0 to n-1), where n is the number of distinct categories of the variable
  * The numbers are assigned arbitrarily
  * Good for quick benchmarking of machine learning models
  * ![](images/ordinal_encoding_example.png)

**Advantages**
  * Straightforward to implement 
  * Does not expand the feature space 
  * Can work well enough with tree based algorithms 

**Limitations**
  * Does not add extra information while encoding 
  * Not suitable for linear models 
  * Does not handle new categories in test set automatically 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 
from collections import defaultdict

In [2]:
# Load only the three categorical predictors and the target column.
df = pd.read_csv(
    '../data/house_price/train.csv',
    usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'],
)

In [3]:
# Preview the loaded frame (pandas abbreviates long frames to the first and last rows).
df

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000
...,...,...,...,...
1455,Gilbert,VinylSd,VinylSd,175000
1456,NWAmes,Plywood,Plywood,210000
1457,Crawfor,CemntBd,CmentBd,266500
1458,NAmes,MetalSd,MetalSd,142125


# Explore cardinality

In [4]:
# Categorical columns that will be encoded later on.
cols = ['Neighborhood', 'Exterior1st', 'Exterior2nd']

# Report how many distinct labels each of those columns holds.
for col in cols:
    n_labels = len(df[col].unique())
    print(f"{col}: {n_labels} labels")

Neighborhood: 25 labels
Exterior1st: 15 labels
Exterior2nd: 16 labels


In [5]:
# List the distinct Neighborhood labels in order of first appearance.
df['Neighborhood'].unique()

array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
       'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
       'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
       'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
       'Blueste'], dtype=object)

In [6]:
# List the distinct Exterior1st labels.
df['Exterior1st'].unique()

array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
       'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
       'Stone', 'ImStucc', 'CBlock'], dtype=object)

In [7]:
# List the distinct Exterior2nd labels. Some spellings differ from Exterior1st
# (e.g. 'CmentBd' vs 'CemntBd', 'Wd Shng' vs 'WdShing'), so an encoder fitted on
# one of the two columns would not recognise every label of the other.
df['Exterior2nd'].unique()

array(['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng',
       'CmentBd', 'BrkFace', 'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc',
       'AsphShn', 'Stone', 'Other', 'CBlock'], dtype=object)

In [8]:
# Row count per neighborhood, most frequent first. Rare labels (e.g. 'Blueste' with
# 2 rows) may fall entirely into one side of a train/test split.
df['Neighborhood'].value_counts()

NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64

# Encode one variable

## Divide Data

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[cols],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [10]:
# Confirm the split sizes as (rows, columns) for train and test.
X_train.shape, X_test.shape

((1022, 3), (438, 3))

In [11]:
# Peek at the training rows before encoding; the shuffled split keeps the original row index.
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,CollgCr,VinylSd,VinylSd
682,ClearCr,Wd Sdng,Wd Sdng
960,BrkSide,Wd Sdng,Plywood
1384,Edwards,WdShing,Wd Shng
1100,SWISU,Wd Sdng,Wd Sdng


## Fit encoder and apply it

In [12]:
# Learn the label -> integer mapping from the training split only, so nothing
# from the test split influences the encoding.
# NOTE(review): scikit-learn documents LabelEncoder as a transformer for target
# values (y); OrdinalEncoder is its counterpart for feature columns — consider switching.
le = LabelEncoder()
le.fit(X_train['Neighborhood'])

LabelEncoder()

In [13]:
# Labels seen during fit; a label's position in this array is the code it is mapped to.
le.classes_

array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
       'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
       'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown',
       'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
       'Veenker'], dtype=object)

In [14]:
# Replace the string labels with the integer codes learned from the training split.
# .assign builds new frames instead of writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
# Note: le.transform raises a ValueError if X_test holds a label unseen during fit.
X_train = X_train.assign(Neighborhood=le.transform(X_train['Neighborhood']))
X_test = X_test.assign(Neighborhood=le.transform(X_test['Neighborhood']))

In [15]:
# Neighborhood now holds integer codes; the other two columns are still strings.
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,5,VinylSd,VinylSd
682,4,Wd Sdng,Wd Sdng
960,3,Wd Sdng,Plywood
1384,7,WdShing,Wd Shng
1100,18,Wd Sdng,Wd Sdng


# Encode multiple variables

## Divide Data

In [16]:
# Split again from the untouched `df` so that all three columns hold their
# original string labels; same 30% hold-out and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    df[cols],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [17]:
# Confirm the split sizes as (rows, columns) for train and test.
X_train.shape, X_test.shape

((1022, 3), (438, 3))

## Fit encoder and apply it to train data

In [18]:
# Map each column name to its own LabelEncoder. A defaultdict builds a fresh,
# unfitted LabelEncoder the first time a missing key is accessed.
d = defaultdict(LabelEncoder)  # LabelEncoder is the default factory, not an instance
d

defaultdict(sklearn.preprocessing._label.LabelEncoder, {})

In [19]:
# For every column, look up its encoder in `d` (created on first access because the
# key does not exist yet), fit it on that column and return the integer codes.
# X_train itself is left untouched; the encoded copy lives in train_transformed.
train_transformed = X_train.apply(lambda column: d[column.name].fit_transform(column))

In [20]:
# Inspect the encoded training frame: every column now holds integer codes.
train_transformed

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,5,12,13
682,4,13,14
960,3,13,10
1384,7,14,15
1100,18,13,14
...,...,...,...
763,15,12,13
835,19,12,6
1216,19,12,13
559,0,12,13


In [21]:
# After the fit_transform pass the dictionary holds one encoder per column name.
d

defaultdict(sklearn.preprocessing._label.LabelEncoder,
            {'Neighborhood': LabelEncoder(),
             'Exterior1st': LabelEncoder(),
             'Exterior2nd': LabelEncoder()})

## Apply the encoder to test data

In [22]:
# Reuse the encoders fitted on the training columns: d[x.name] now returns the
# already-fitted LabelEncoder for that column, and .transform(x) maps its labels to codes.
# The result is kept in test_transformed (mirroring train_transformed) so the encoded
# test set can be used later instead of only being displayed.
# Note: .transform raises a ValueError for labels that were not seen during fit.
test_transformed = X_test.apply(lambda x: d[x.name].transform(x))
test_transformed

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
529,6,13,11
491,12,13,14
459,3,8,8
279,4,9,10
655,2,6,7
...,...,...,...
271,4,9,10
445,7,13,14
654,15,8,8
1280,5,12,13
