# Count encoding or frequency encoding 

**Definition**
  * Categories are replaced by the count or percentage of observations that show that category in the dataset 
  * Captures the representation of each label in a dataset
  * Very popular encoding method in Kaggle competitions 
  * Assumption: the number of observations in each category is predictive of the target 
  * Count encoding
    * ![](images/count_encoding_example.png)
  * Frequency encoding
    * ![](images/frequency_encoding_example.png)

**Advantages**
  * Straightforward to implement 
  * Does not expand the feature space 
  * Can work well enough with tree-based algorithms 

**Limitations**
  * Not suitable for linear models 
  * Does not handle new categories in test set automatically 
  * If two different categories appear the same number of times in the dataset, they will be replaced by the same number, so the distinction between them (potentially valuable information) is lost 

# Import data

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
# Read only the three categorical features plus the SalePrice target.
df = pd.read_csv(
    '../data/house_price/train.csv',
    usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'],
)

In [3]:
# Preview the loaded frame; the rendered output shows only its first and last rows.
df

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000
...,...,...,...,...
1455,Gilbert,VinylSd,VinylSd,175000
1456,NWAmes,Plywood,Plywood,210000
1457,Crawfor,CemntBd,CmentBd,266500
1458,NAmes,MetalSd,MetalSd,142125


# Explore cardinality

In [4]:
# Report how many distinct labels each categorical column contains.
for feature in ('Neighborhood', 'Exterior1st', 'Exterior2nd'):
    n_labels = len(df[feature].unique())
    print(f"{feature}: {n_labels} labels")

Neighborhood: 25 labels
Exterior1st: 15 labels
Exterior2nd: 16 labels


In [5]:
# Side-by-side view of the absolute count and the relative frequency
# (count divided by the total number of rows) of each neighborhood.
pd.concat(
    {
        'count': df['Neighborhood'].value_counts(),
        'frequency': df['Neighborhood'].value_counts() / len(df),
    },
    axis=1,
)

Unnamed: 0,count,frequency
NAmes,225,0.15411
CollgCr,150,0.10274
OldTown,113,0.077397
Edwards,100,0.068493
Somerst,86,0.058904
Gilbert,79,0.05411
NridgHt,77,0.05274
Sawyer,74,0.050685
NWAmes,73,0.05
SawyerW,59,0.040411


# Split data

In [6]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Neighborhood', 'Exterior1st', 'Exterior2nd']],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [7]:
# Peek at the first training rows; the labels are still raw strings at this point.
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,CollgCr,VinylSd,VinylSd
682,ClearCr,Wd Sdng,Wd Sdng
960,BrkSide,Wd Sdng,Plywood
1384,Edwards,WdShing,Wd Shng
1100,SWISU,Wd Sdng,Wd Sdng


# Count encoding

In [8]:
# Recompute count and frequency on the training split only: these are the
# values the encoders below are built from.
pd.concat(
    {
        'count': X_train['Neighborhood'].value_counts(),
        'frequency': X_train['Neighborhood'].value_counts() / len(X_train),
    },
    axis=1,
)

Unnamed: 0,count,frequency
NAmes,151,0.14775
CollgCr,105,0.10274
OldTown,73,0.071429
Edwards,71,0.069472
Sawyer,61,0.059687
Somerst,56,0.054795
Gilbert,55,0.053816
NWAmes,51,0.049902
NridgHt,51,0.049902
SawyerW,45,0.044031


## Create count mapper 

In [9]:
# Build the encoding from the training split: label -> number of training rows.
count_mapper = (
    X_train['Neighborhood']
    .value_counts()
    .to_dict()
)
count_mapper

{'NAmes': 151,
 'CollgCr': 105,
 'OldTown': 73,
 'Edwards': 71,
 'Sawyer': 61,
 'Somerst': 56,
 'Gilbert': 55,
 'NWAmes': 51,
 'NridgHt': 51,
 'SawyerW': 45,
 'BrkSide': 41,
 'Mitchel': 36,
 'Crawfor': 35,
 'Timber': 30,
 'NoRidge': 30,
 'IDOTRR': 24,
 'ClearCr': 24,
 'SWISU': 18,
 'StoneBr': 16,
 'MeadowV': 12,
 'Blmngtn': 12,
 'BrDale': 10,
 'NPkVill': 7,
 'Veenker': 6,
 'Blueste': 2}

## Apply count mapper 

In [10]:
# Replace each neighborhood label with its training-set count.
# assign() builds a new frame instead of writing a column into the object
# returned by train_test_split, which on some pandas/scikit-learn versions
# raises SettingWithCopyWarning.
X_train = X_train.assign(Neighborhood=X_train['Neighborhood'].map(count_mapper))
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,105,VinylSd,VinylSd
682,24,Wd Sdng,Wd Sdng
960,41,Wd Sdng,Plywood
1384,71,WdShing,Wd Shng
1100,18,Wd Sdng,Wd Sdng


## Multiple columns

In [11]:
# Split again with the same seed so X_train holds the raw string labels
# rather than the counts written in the previous section.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Neighborhood', 'Exterior1st', 'Exterior2nd']],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,CollgCr,VinylSd,VinylSd
682,ClearCr,Wd Sdng,Wd Sdng
960,BrkSide,Wd Sdng,Plywood
1384,Edwards,WdShing,Wd Shng
1100,SWISU,Wd Sdng,Wd Sdng


In [12]:
# One count dictionary per column: {column name: {label: training count}}.
count_mapper = {
    col: X_train[col].value_counts().to_dict()
    for col in ['Neighborhood', 'Exterior1st', 'Exterior2nd']
}

In [13]:
# Inspect the nested mapping: column name -> {label: count in the training split}.
count_mapper

{'Neighborhood': {'NAmes': 151,
  'CollgCr': 105,
  'OldTown': 73,
  'Edwards': 71,
  'Sawyer': 61,
  'Somerst': 56,
  'Gilbert': 55,
  'NWAmes': 51,
  'NridgHt': 51,
  'SawyerW': 45,
  'BrkSide': 41,
  'Mitchel': 36,
  'Crawfor': 35,
  'Timber': 30,
  'NoRidge': 30,
  'IDOTRR': 24,
  'ClearCr': 24,
  'SWISU': 18,
  'StoneBr': 16,
  'MeadowV': 12,
  'Blmngtn': 12,
  'BrDale': 10,
  'NPkVill': 7,
  'Veenker': 6,
  'Blueste': 2},
 'Exterior1st': {'VinylSd': 364,
  'HdBoard': 153,
  'Wd Sdng': 148,
  'MetalSd': 138,
  'Plywood': 86,
  'CemntBd': 39,
  'BrkFace': 35,
  'WdShing': 21,
  'Stucco': 17,
  'AsbShng': 15,
  'Stone': 2,
  'BrkComm': 1,
  'CBlock': 1,
  'ImStucc': 1,
  'AsphShn': 1},
 'Exterior2nd': {'VinylSd': 353,
  'Wd Sdng': 142,
  'HdBoard': 141,
  'MetalSd': 136,
  'Plywood': 112,
  'CmentBd': 39,
  'Wd Shng': 29,
  'BrkFace': 18,
  'AsbShng': 17,
  'Stucco': 16,
  'ImStucc': 8,
  'Stone': 4,
  'Brk Cmn': 4,
  'CBlock': 1,
  'Other': 1,
  'AsphShn': 1}}

In [14]:
# Encode every categorical column with its own count dictionary.
# A single assign() returns a new frame instead of writing columns one by one
# into the object returned by train_test_split, which on some
# pandas/scikit-learn versions raises SettingWithCopyWarning.
X_train = X_train.assign(**{
    col: X_train[col].map(count_mapper[col])
    for col in ['Neighborhood', 'Exterior1st', 'Exterior2nd']
})

X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,105,364,353
682,24,148,142
960,41,148,112
1384,71,21,29
1100,18,148,142


# Frequency encoding

In [15]:
# Split again with the same seed to get back the raw string labels before
# demonstrating frequency encoding.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Neighborhood', 'Exterior1st', 'Exterior2nd']],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,CollgCr,VinylSd,VinylSd
682,ClearCr,Wd Sdng,Wd Sdng
960,BrkSide,Wd Sdng,Plywood
1384,Edwards,WdShing,Wd Shng
1100,SWISU,Wd Sdng,Wd Sdng


In [16]:
# label -> share of training rows carrying that label (count / number of rows).
neighborhood_counts = X_train['Neighborhood'].value_counts()
freq_mapper = (neighborhood_counts / len(X_train)).to_dict()
freq_mapper

{'NAmes': 0.14774951076320939,
 'CollgCr': 0.10273972602739725,
 'OldTown': 0.07142857142857142,
 'Edwards': 0.06947162426614481,
 'Sawyer': 0.05968688845401174,
 'Somerst': 0.0547945205479452,
 'Gilbert': 0.053816046966731895,
 'NWAmes': 0.049902152641878667,
 'NridgHt': 0.049902152641878667,
 'SawyerW': 0.04403131115459882,
 'BrkSide': 0.040117416829745595,
 'Mitchel': 0.03522504892367906,
 'Crawfor': 0.03424657534246575,
 'Timber': 0.029354207436399216,
 'NoRidge': 0.029354207436399216,
 'IDOTRR': 0.023483365949119372,
 'ClearCr': 0.023483365949119372,
 'SWISU': 0.01761252446183953,
 'StoneBr': 0.015655577299412915,
 'MeadowV': 0.011741682974559686,
 'Blmngtn': 0.011741682974559686,
 'BrDale': 0.009784735812133072,
 'NPkVill': 0.00684931506849315,
 'Veenker': 0.005870841487279843,
 'Blueste': 0.0019569471624266144}

In [17]:
# Replace each neighborhood label with its training-set frequency.
# assign() builds a new frame instead of writing a column into the object
# returned by train_test_split, which on some pandas/scikit-learn versions
# raises SettingWithCopyWarning.
X_train = X_train.assign(Neighborhood=X_train['Neighborhood'].map(freq_mapper))

X_train['Neighborhood'].head()

64      0.102740
682     0.023483
960     0.040117
1384    0.069472
1100    0.017613
Name: Neighborhood, dtype: float64

# Unseen categories in the test set

In [18]:
# Split again with the same seed to get back the raw string labels for the
# unseen-category demonstration.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Neighborhood', 'Exterior1st', 'Exterior2nd']],
    df['SalePrice'],
    test_size=0.3,
    random_state=0,
)
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,CollgCr,VinylSd,VinylSd
682,ClearCr,Wd Sdng,Wd Sdng
960,BrkSide,Wd Sdng,Plywood
1384,Edwards,WdShing,Wd Shng
1100,SWISU,Wd Sdng,Wd Sdng


In [19]:
# Frequency encoding built from the training split only:
# label -> share of training rows carrying that label.
neighborhood_counts = X_train['Neighborhood'].value_counts()
freq_mapper = (neighborhood_counts / len(X_train)).to_dict()

In [20]:
# Build one artificial row whose labels never occur in the training data and
# add it to the end of the test set.
unseen_labels = pd.DataFrame({'Neighborhood': ['unknwon'], 'Exterior1st': ['unknwon'], 'Exterior2nd': ['unknwon']})
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True renumbers the rows 0..n-1 as append did.
X_test = pd.concat([X_test, unseen_labels], ignore_index=True, sort=False)

In [21]:
# Show the last rows of X_test, including the artificial 'unknwon' row at the end.
X_test.tail()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
434,Edwards,Wd Sdng,Wd Sdng
435,NoRidge,MetalSd,MetalSd
436,CollgCr,VinylSd,VinylSd
437,NridgHt,VinylSd,VinylSd
438,unknwon,unknwon,unknwon


In [22]:
# Series.map yields NaN for any label that is not a key of freq_mapper, so the
# 'unknwon' row (absent from the training data) is encoded as NaN.
X_test['Neighborhood'].map(freq_mapper)

0      0.034247
1      0.147750
2      0.040117
3      0.023483
4      0.009785
         ...   
434    0.069472
435    0.029354
436    0.102740
437    0.049902
438         NaN
Name: Neighborhood, Length: 439, dtype: float64