# Feature Encoding
Feature encoding is the process of converting categorical data into numerical data.

There are many types of encoding.

- Label Encoding
- One Hot Encoding
- Ordinal Encoding
- Mean Encoding
- Binary Encoding

In [2]:
# import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [3]:
# Load seaborn's bundled "tips" example dataset and preview its first five rows
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Count how many rows fall into each category of the 'time' column
df.loc[:, 'time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

# Label Encoding

In [5]:
# Label-encode the 'time' column with scikit-learn's LabelEncoder
# (the encoder classes are imported together with the other libraries at the top).
#
# LabelEncoder assigns integer codes to the sorted class labels, so here
# 'Dinner' -> 0 and 'Lunch' -> 1.
# NOTE(review): the scikit-learn docs describe LabelEncoder as intended for
# target values (y); for input features OrdinalEncoder is the recommended tool.
le = LabelEncoder()
df['encoded_time'] = le.fit_transform(df['time'])

# Check how many rows received each integer code
df['encoded_time'].value_counts()


encoded_time
0    176
1     68
Name: count, dtype: int64

In [6]:
# Show 5 randomly chosen rows to compare 'time' with 'encoded_time'.
# No random_state is passed, so the selected rows differ from run to run.
df.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
83,32.68,5.0,Male,Yes,Thur,Lunch,2,1
206,26.59,3.41,Male,Yes,Sat,Dinner,3,0
71,17.07,3.0,Female,No,Sat,Dinner,3,0
107,25.21,4.29,Male,Yes,Sat,Dinner,2,0
21,20.29,2.75,Female,No,Sat,Dinner,2,0


# Ordinal Encoding of the `day` column using a specific order


In [7]:
# Count how many rows fall into each category of the 'day' column
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [8]:
# Ordinal-encode 'day' using an explicit category order:
# the position of each day in this list becomes its code (Thur=0 ... Sun=3).
day_order = ['Thur', 'Fri', 'Sat', 'Sun']
oe = OrdinalEncoder(categories=[day_order])

# The encoder expects 2-D input, hence the double brackets around 'day'
df['encoded_day'] = oe.fit_transform(df[['day']])

# Check how many rows received each code
df['encoded_day'].value_counts()

encoded_day
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

In [9]:
# Preview the first rows, now including the 'encoded_time' and 'encoded_day' columns
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0


# One Hot Encoding

In [10]:
# One-hot encode the 'day' column with scikit-learn's OneHotEncoder
ohe = OneHotEncoder()

# Fit the encoder once and reuse the result (the sparse output is densified
# with .toarray()). The frame gets df's index explicitly so that the
# concatenation below aligns row-for-row even if df's index is not 0..n-1.
# Its integer column labels 0..3 follow the order of ohe.categories_[0].
day_onehot = pd.DataFrame(
    ohe.fit_transform(df[['day']]).toarray(),
    index=df.index,
)

# Append the one-hot columns to the original dataframe and preview the result
pd.concat([df, day_onehot], axis=1).head()



Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day,0,1,2,3
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0,0.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0,0.0,0.0,1.0,0.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0,0.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0,0.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0,0.0,0.0,1.0,0.0


# Binary Encoding

In [11]:
# Install category_encoders (uncomment and run once if the package is missing)
# %pip install category_encoders

In [12]:
# Reload the tips dataset so the frame no longer carries the encoded_* columns
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
# Count the rows per 'day' category before binary-encoding the column
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [14]:
# Binary-encode the 'day' column with category_encoders' BinaryEncoder
from category_encoders import BinaryEncoder

binary_encoder = BinaryEncoder()

# Pass a one-column dataframe; the result holds the 0/1 columns day_0, day_1, ...
df_binary = binary_encoder.fit_transform(df[['day']])
df_binary.head()

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [15]:
# Count the rows for each distinct combination of the binary columns;
# each combination corresponds to one 'day' category
df_binary.value_counts()

day_0  day_1  day_2
0      1      0        87
       0      1        76
       1      1        62
1      0      0        19
Name: count, dtype: int64

In [16]:
# Feature encoding with pandas: start again from a fresh copy of the tips dataset
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [18]:
# One-hot encode only the 'day' column with pandas.get_dummies:
# the result has one indicator column per day category
pd.get_dummies(df['day'])

Unnamed: 0,Thur,Fri,Sat,Sun
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
...,...,...,...,...
239,False,False,True,False
240,False,False,True,False
241,False,False,True,False
242,False,False,True,False


In [19]:
# Load a fresh copy of the tips dataset before encoding the whole frame
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
# One-hot encode 'day' directly on the dataframe: the 'day' column is replaced
# by day_<category> indicator columns while all other columns are kept
get_dumies = pd.get_dummies(df, columns=['day'])

# Inspect 5 random rows of the encoded frame (unseeded, so rows vary per run)
get_dumies.sample(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Thur,day_Fri,day_Sat,day_Sun
62,11.02,1.98,Male,Yes,Dinner,2,False,False,True,False
156,48.17,5.0,Male,No,Dinner,6,False,False,False,True
138,16.0,2.0,Male,Yes,Lunch,2,True,False,False,False
8,15.04,1.96,Male,No,Dinner,2,False,False,False,True
233,10.77,1.47,Male,No,Dinner,2,False,False,True,False
