# **Feature Encoding**

In [239]:
import pandas as pd
import numpy as np

## **Encoding Ordinal Data**

In [240]:
df=pd.read_csv('./data/customer.csv')
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [241]:
df = df.iloc[:,2:]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [242]:
X=df.drop('purchased',axis=1)
y=df['purchased']

In [243]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

## 1. Ordinal Encoding
- It is mainly used for input features.
- We specify the order of the classes based on their inherent ranking or order.

In [244]:
from sklearn.preprocessing import OrdinalEncoder

In [245]:
oe=OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])

In [246]:
oe.fit(X_train)

In [247]:
oe_X_train_encoded=oe.transform(X_train)
oe_X_test_encoded=oe.transform(X_test)

### `Before`

In [248]:
X_train.head()

Unnamed: 0,review,education
6,Good,School
41,Good,PG
46,Poor,PG
47,Good,PG
15,Poor,UG


### `After`

In [249]:
oe_X_train_encoded

array([[2., 0.],
       [2., 2.],
       [0., 2.],
       [2., 2.],
       [0., 1.],
       [2., 1.],
       [0., 1.],
       [1., 2.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [1., 1.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [1., 1.],
       [2., 1.],
       [2., 1.],
       [0., 1.],
       [1., 2.],
       [2., 2.],
       [0., 2.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [2., 1.],
       [0., 2.],
       [2., 0.],
       [2., 1.],
       [1., 0.],
       [0., 0.],
       [2., 2.],
       [0., 2.],
       [0., 0.],
       [2., 0.]])

In [250]:
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

## 2. Label Encoding
- It is mainly used for output (target) variables.
- It assigns a unique integer to each category, but the encoding is not random; the integers are assigned based on the alphabetical order of the categories.

In [251]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [252]:
le.fit(y_train)

In [253]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [254]:
le_y_train_encoded=le.transform(y_train)
le_y_test_encoded=le.transform(y_test)

### `Before`

In [255]:
y_train.head()

6      No
41    Yes
46     No
47    Yes
15     No
Name: purchased, dtype: object

### `After`

In [256]:
le_y_train_encoded

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0])

## **Encoding Nominal Data**

In [257]:
df=pd.read_csv("./data/cars.csv")
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [258]:
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

## 1. One-Hot Encoding

### (II) **Using Pandas**

In [282]:
pd.get_dummies(df,columns=['fuel','owner'],dtype='int32')

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### (II) **K-1 OneHotEncoding**(*dummy variable trap*)

In [283]:
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True,dtype='int32')

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### (III) **Using Scikit_Learn**

In [261]:
from sklearn.preprocessing import OneHotEncoder
X=df.drop('selling_price',axis=1)
y=df['selling_price']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [273]:
ohe=OneHotEncoder(drop='first',sparse_output=False,dtype='int32')

In [274]:
ohe_X_train_encoded=ohe.fit_transform(X_train[['fuel','owner']])
ohe_X_test_encoded=ohe.transform(X_test[['fuel','owner']])

In [275]:
ohe_X_train_encoded

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]])

In [276]:
X_train[['brand','km_driven']].values

array([['Maruti', 120000],
       ['Toyota', 100000],
       ['BMW', 39000],
       ...,
       ['Hyundai', 35000],
       ['Maruti', 27000],
       ['Maruti', 70000]], dtype=object)

In [277]:
np.hstack((X_train[['brand','km_driven']].values,ohe_X_train_encoded))

array([['Maruti', 120000, 0, ..., 0, 0, 1],
       ['Toyota', 100000, 1, ..., 0, 0, 0],
       ['BMW', 39000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

### (IV) **OneHotEncoding with Top Categories**

In [278]:
counts=df['brand'].value_counts()

In [279]:
df['brand'].nunique()
threshold=100

In [280]:
repl = counts[counts <= threshold].index

In [288]:
pd.get_dummies(df['brand'].replace(repl,"uncommon"),dtype='int32').sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
7070,0,0,0,0,0,0,1,0,0,0,0,0,0
7345,0,0,0,0,0,1,0,0,0,0,0,0,0
5978,0,0,0,0,0,0,1,0,0,0,0,0,0
7298,0,0,0,0,0,0,1,0,0,0,0,0,0
5614,0,0,0,0,0,0,0,0,0,0,0,0,1
