## Nominal / One Hot Encoding

In [1]:
# One-hot encoding with scikit-learn's OneHotEncoder

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Toy frame with a single nominal (unordered) categorical column to encode.
df = pd.DataFrame(
    data={'color': ['red', 'blue', 'green', 'red', 'blue']},
)

In [4]:
# Display the frame to confirm the five color values before encoding.
df

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue


In [5]:
# Create a OneHotEncoder with its default arguments; nothing is fitted yet.
encoder = OneHotEncoder()

In [6]:
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
# With the default settings the result is a SciPy sparse matrix, so only its summary is shown.
encoder.fit_transform(df[['color']])

<5x3 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [7]:
# Convert the sparse result to a dense NumPy array so the 0/1 values are visible.
# The output columns follow the categories in alphabetical order: blue, green, red.
encoder.fit_transform(df[['color']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [8]:
# Fit the encoder on the `color` column and keep the encoded (sparse) result
# so it can be reused below without calling fit_transform again.
encoded = encoder.fit_transform(df[['color']])

In [9]:
# Preview the dense encoding as a labelled table: one column per category.
pd.DataFrame(
    data=encoded.toarray(),
    columns=encoder.get_feature_names_out(),
)

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


In [10]:
# Names of the generated columns, in output order, formatted as <input column>_<category>.
encoder.get_feature_names_out()

array(['color_blue', 'color_green', 'color_red'], dtype=object)

In [11]:
# Wrap the dense one-hot matrix in a DataFrame with readable column names.
# Reuse df's index: the column-wise pd.concat below aligns rows by index label,
# so a fresh 0..n-1 index would only line up while df keeps its default index
# (after filtering or sorting df it would silently produce NaN rows).
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [12]:
# Display the encoded frame: one 0/1 column per color category.
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


In [13]:
# Append the encoded columns to the original DataFrame

In [14]:
# Put the one-hot columns next to the original `color` column.
# Column-wise concat matches rows by index label, not by position.
pd.concat(objs=[df, encoded_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,red,0.0,0.0,1.0
4,blue,1.0,0.0,0.0


In [15]:
# The same steps work on any dataset with a categorical column

In [16]:
import seaborn as sns
# Load seaborn's 'tips' example dataset.
# NOTE: this rebinds `df`; from here on it no longer refers to the color frame built earlier.
df=sns.load_dataset('tips') # homework

In [17]:
# Peek at the first rows to see which columns are categorical (sex, smoker, day, time).
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [18]:
# Peek at the last rows as well; the final index label shows the frame has 244 rows (0-243).
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [19]:
# Apply nominal (one-hot) encoding to the `day` column

In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Start from a fresh, unfitted encoder for the tips data (rebinds `encoder`).
encoder = OneHotEncoder()

In [22]:
# Fit on the single `day` column (double brackets keep the input 2-D).
# The result is a sparse matrix with one column per distinct day.
encoder.fit_transform(df[['day']])

<244x4 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [23]:
# Densify the sparse result to inspect the raw 0/1 rows.
# NOTE(review): this prints all 244 rows; consider slicing (e.g. `[:5]`) to keep the output short.
encoder.fit_transform(df[['day']]).toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [24]:
# Generated column names, in output order: day_Fri, day_Sat, day_Sun, day_Thur.
encoder.get_feature_names_out()

array(['day_Fri', 'day_Sat', 'day_Sun', 'day_Thur'], dtype=object)

In [25]:
# Build a labelled DataFrame from the dense one-hot encoding of `day`.
# fit_transform is evaluated before get_feature_names_out, so the names
# always come from the encoder state fitted on this column.
# Reuse df's index: the column-wise pd.concat that follows aligns rows by
# index label, so a fresh 0..n-1 index would only line up while df keeps its
# default index (after filtering or sorting df it would yield NaN rows).
encoded_data = pd.DataFrame(
    encoder.fit_transform(df[['day']]).toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [26]:
# Display the encoded frame: one 0/1 column per day category.
encoded_data 

Unnamed: 0,day_Fri,day_Sat,day_Sun,day_Thur
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


In [27]:
# Attach the one-hot `day` columns to the right of the original tips columns.
# Column-wise concat matches rows by index label, not by position.
final_df = pd.concat(
    objs=[df, encoded_data],
    axis='columns',
)

In [28]:
# Display the combined frame; the original `day` column is still present next to its encoded columns.
final_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_Fri,day_Sat,day_Sun,day_Thur
0,16.99,1.01,Female,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,0.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.0,1.0,0.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,0.0,0.0
