# Label Encoding - Big Mart Dataset
#### - Mohan Bakshi

In [1]:
# Importing libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [2]:
# Reading the cleaned bigmart dataset
data = pd.read_csv('Datasets/cleaned_bigmart.csv')

# Printing first 5 rows
data.head(5)

Unnamed: 0.1,Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
# verifying data 
data.isnull().sum()

Unnamed: 0                   0
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
Item_Fat_Content_enc         0
Item_Type_enc                0
Outlet_Size_enc              0
Outlet_Type_enc              0
Outlet_Size_fenc             0
dtype: int64

### Label Encoding

In [4]:
le = LabelEncoder()

data['Item_Fat_Content_enc'] = le.fit_transform(data['Item_Fat_Content'])
data['Item_Type_enc'] = le.fit_transform(data['Item_Type'])
data['Outlet_Size_enc'] = le.fit_transform(data['Outlet_Size'])
data['Outlet_Type_enc'] = le.fit_transform(data['Outlet_Type'])

data.head()

Unnamed: 0.1,Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Fat_Content_enc,Item_Type_enc,Outlet_Size_enc,Outlet_Type_enc
0,0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,0,4,1,1
1,1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,1,14,1,2
2,2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,0,10,1,1
3,3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,1,6,1,0
4,4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,0,9,0,1


### Feature Mapping

In order to illustrate Feature Mapping, we use Outlet_Size as it has 3 unique values i.e HIGH, SMALL, and MEDIUM.

In [7]:
data['Outlet_Size'].value_counts()

Outlet_Size
Medium    5141
Small     2316
High       932
Name: count, dtype: int64

In [8]:
data['Outlet_Size_fenc'] = data['Outlet_Size'].replace(('High', 'Medium', 'Small'),(3,2,1))

data['Outlet_Size_fenc'].value_counts()

  data['Outlet_Size_fenc'] = data['Outlet_Size'].replace(('High', 'Medium', 'Small'),(3,2,1))


Outlet_Size_fenc
2    5141
1    2316
3     932
Name: count, dtype: int64

### One Hot Encoding

In [10]:
x = pd.get_dummies(data['Outlet_Type'])

In [11]:
x

Unnamed: 0,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,False,True,False,False
1,False,False,True,False
2,False,True,False,False
3,True,False,False,False
4,False,True,False,False
...,...,...,...,...
8384,False,True,False,False
8385,False,True,False,False
8386,False,True,False,False
8387,False,False,True,False


In [12]:
data = pd.concat([data,x], axis=1)

In [13]:
data.head()

Unnamed: 0.1,Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,...,Item_Outlet_Sales,Item_Fat_Content_enc,Item_Type_enc,Outlet_Size_enc,Outlet_Type_enc,Outlet_Size_fenc,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,...,3735.138,0,4,1,1,2,False,True,False,False
1,1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,...,443.4228,1,14,1,2,2,False,False,True,False
2,2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,...,2097.27,0,10,1,1,2,False,True,False,False
3,3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,...,732.38,1,6,1,0,2,True,False,False,False
4,4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,...,994.7052,0,9,0,1,3,False,True,False,False


In [15]:
data = data.drop('Outlet_Type', axis=1)


#### For better understanding, How Data Encoding Techniques works; Check out this article: https://medium.com/@mohan_bakshi/data-encoding-simplified-techniques-for-every-beginner-279e37f0c9dc