# One Hot Encoding in ML

In [2]:
"""
One-Hot Encoding is another popular technique for
treating categorical variables.

Many machine learning algorithms cannot work with
categorical data directly.

This data (categorical data) must be converted into
numbers.

One-hot encoding is required for both input and output
variables that are categorical.

It helps the ML algorithm produce better results.
"""

'\nOne-Hot Encoding is another popular technique for\ntreating categorical variables.\n\nMany machine learning algorithms cannot work with\ncategorical data directly.\n\nThis data (categorical data) must be converted into\nnumbers.\n\nOne-hot encoding is required for both input and output\nvariables that are categorical.\n\nIt helps the ML algorithm produce better results.\n'

In [3]:
"""
It simply creates additional features based on the number
of unique values in the categorical feature.

Every unique value in the category will be added as a
feature.

One-Hot Encoding is the process of creating dummy
variables.
"""

'\nIt simply creates additional features based on the number\nof unique values in the categorical feature.\n\nEvery unique value in the category will be added as a\nfeature.\n\nOne-Hot Encoding is the process of creating dummy\nvariables.\n'

In [6]:
import os

import pandas as pd

# Location of the tips dataset. The default is the original author's local
# Windows path; set the TIPS_CSV environment variable to point at your own copy
# so the notebook can run on another machine without editing this cell.
TIPS_CSV = os.environ.get("TIPS_CSV", "C:\\Users\\Felix ITs 01\\Desktop\\tips.csv")

# Load the raw data; every later cell reads from `df`.
df = pd.read_csv(TIPS_CSV)

df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [7]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(244, 7)

In [8]:
# Boolean mask of the same shape as `df`: True wherever a value is missing.
df.isna()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
239,False,False,False,False,False,False,False
240,False,False,False,False,False,False,False
241,False,False,False,False,False,False,False
242,False,False,False,False,False,False,False


In [9]:
# Complement of the missing-value mask: True wherever a value is present.
df.notna()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...
239,True,True,True,True,True,True,True
240,True,True,True,True,True,True,True
241,True,True,True,True,True,True,True
242,True,True,True,True,True,True,True


In [11]:
# Dummy Variables

# One-hot encode the text/categorical columns of `df`; numeric columns pass
# through unchanged. The dtype is pinned to uint8 so the indicator columns are
# 0/1 on every pandas version (pandas 2.0 changed the default dtype to bool).
dummy_df = pd.get_dummies(df, dtype="uint8")

dummy_df

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.50,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,1,0,0,1,0,0,1,0
240,27.18,2.00,2,1,0,0,1,0,1,0,0,1,0
241,22.67,2.00,2,0,1,0,1,0,1,0,0,1,0
242,17.82,1.75,2,0,1,1,0,0,1,0,0,1,0


In [12]:
"""
get_dummies() method

pandas.get_dummies() is used for data manipulation.

It converts categorical data into dummy (indicator)
variables.

"""

'\nget_dummies() method\n\npandas.get_dummies() is used for data manipulation.\n\nIt converts categorical data into dummy (indicator)\nvariables.\n\n'

In [13]:
# drop_first
# drop_first : bool, default False
# Whether to get k-1 dummies out of k categorical levels by removing the first level.

In [15]:
# Keep k-1 indicator columns for each k-level categorical column by dropping
# the first level; the dropped level is implied when all the others are 0.
# dtype is pinned to uint8 so the values are 0/1 on every pandas version
# (pandas 2.0 changed the default dtype to bool).
pd.get_dummies(df, drop_first=True, dtype="uint8")

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,2,0,0,0,1,0,0
1,10.34,1.66,3,1,0,0,1,0,0
2,21.01,3.50,3,1,0,0,1,0,0
3,23.68,3.31,2,1,0,0,1,0,0
4,24.59,3.61,4,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,1,0,0,0
240,27.18,2.00,2,0,1,1,0,0,0
241,22.67,2.00,2,1,1,1,0,0,0
242,17.82,1.75,2,1,0,1,0,0,0


# One-Hot Encoding with Scikit-learn Module in Python for ML Model

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns of the tips data to one-hot encode.
categorical_cols = ['sex', 'smoker', 'day', 'time']

# Ask for a dense NumPy array rather than a SciPy sparse matrix. The keyword
# was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# name was removed in 1.4, so try the new spelling first and fall back to the
# old one on earlier releases (which reject the unknown keyword with TypeError).
try:
    ohenc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohenc = OneHotEncoder(sparse=False)

# Learn the categories of each column and encode them in one step.
oh_enc_arr = ohenc.fit_transform(df[categorical_cols])

oh_enc_arr

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [19]:
# Column labels of the pandas-encoded frame (the same Index that keys() returns).

dummy_df.columns

Index(['total_bill', 'tip', 'size', 'sex_Female', 'sex_Male', 'smoker_No',
       'smoker_Yes', 'day_Fri', 'day_Sat', 'day_Sun', 'day_Thur',
       'time_Dinner', 'time_Lunch'],
      dtype='object')

In [20]:
# Take the column labels from the fitted encoder instead of typing them by
# hand, so they always follow the encoder's own category order.
# get_feature_names_out exists from scikit-learn 1.0; older releases only have
# get_feature_names, so fall back to it when the newer method is absent.
feature_namer = getattr(ohenc, "get_feature_names_out", None) or ohenc.get_feature_names
oh_enc_cols = feature_namer(['sex', 'smoker', 'day', 'time'])

# Wrap the encoded array in a labelled DataFrame.
oh_enc_df = pd.DataFrame(oh_enc_arr, columns=oh_enc_cols)

oh_enc_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
