## Encoding techniques

### Label Encoding

In [1]:
## Each category is assigned a value 1 through N

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('insurance.csv')

In [3]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
data.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [6]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [10]:
data['sex'].unique()

array(['female', 'male'], dtype=object)

In [11]:
data['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [12]:
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

## fit -- for fitting
## transform -- for transforming
## fit_transform -- for both

data['sex']=le.fit_transform(data['sex'])

In [16]:
data['sex']

0       0
1       1
2       1
3       1
4       1
       ..
1333    1
1334    0
1335    0
1336    0
1337    0
Name: sex, Length: 1338, dtype: int32

In [17]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,yes,southwest,16884.92
1,18,1,33.8,1,no,southeast,1725.55
2,28,1,33.0,3,no,southeast,4449.46
3,33,1,22.7,0,no,northwest,21984.47
4,32,1,28.9,0,no,northwest,3866.86


In [18]:
data['smoker']=le.fit_transform(data['smoker'])

In [19]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,southwest,16884.92
1,18,1,33.8,1,0,southeast,1725.55
2,28,1,33.0,3,0,southeast,4449.46
3,33,1,22.7,0,0,northwest,21984.47
4,32,1,28.9,0,0,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,northwest,10600.55
1334,18,0,31.9,0,0,northeast,2205.98
1335,18,0,36.9,0,0,southeast,1629.83
1336,21,0,25.8,0,0,southwest,2007.95


In [20]:
data['region']=le.fit_transform(data['region'])

In [21]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,3,16884.92
1,18,1,33.8,1,0,2,1725.55
2,28,1,33.0,3,0,2,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,1,10600.55
1334,18,0,31.9,0,0,0,2205.98
1335,18,0,36.9,0,0,2,1629.83
1336,21,0,25.8,0,0,3,2007.95


## Feature Mapping

In [25]:
## LabelEncoding will not rank

# for ordinal data -- for ranked column

data1 = pd.read_csv('bigmart.csv')
print(data1['Outlet_Size'].value_counts())
data1['Outlet_Size'].fillna(data1['Outlet_Size'].mode()[0], inplace = True)
data1['Outlet_Size'] = data1['Outlet_Size'].replace(('High','Medium','Small'),(3,2,1))
data1.head()


Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,2,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,2,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,2,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,2,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,3,Tier 3,Supermarket Type1,994.7052


## One Hot Encoding

In [None]:
## maps 0 or 1
## Creates table with o and 1

In [26]:
x = pd.get_dummies(data['region'])
x.head()

Unnamed: 0,0,1,2,3
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [28]:
data_new = pd.concat([data, x], axis=1)   ## Merge it

In [29]:
data_new.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses,0,1,2,3
0,19,0,27.9,0,1,3,16884.92,0,0,0,1
1,18,1,33.8,1,0,2,1725.55,0,0,1,0
2,28,1,33.0,3,0,2,4449.46,0,0,1,0
3,33,1,22.7,0,0,1,21984.47,0,1,0,0
4,32,1,28.9,0,0,1,3866.86,0,1,0,0


In [30]:
data_new = data_new.drop(columns = ['region'])

In [31]:
data_new

Unnamed: 0,age,sex,bmi,children,smoker,expenses,0,1,2,3
0,19,0,27.9,0,1,16884.92,0,0,0,1
1,18,1,33.8,1,0,1725.55,0,0,1,0
2,28,1,33.0,3,0,4449.46,0,0,1,0
3,33,1,22.7,0,0,21984.47,0,1,0,0
4,32,1,28.9,0,0,3866.86,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,10600.55,0,1,0,0
1334,18,0,31.9,0,0,2205.98,1,0,0,0
1335,18,0,36.9,0,0,1629.83,0,0,1,0
1336,21,0,25.8,0,0,2007.95,0,0,0,1


### Binary Encoding 

In [32]:
## One hot encoding produces multiple columns
## It converts into binary bits

# for 8 columns -- only 3 columns
# for upto 128 -- only 6 columns

## baseN encoding