# Categorical Data Handling

## Encoding Nominal Categorical Features

In [1]:
import numpy as np

In [2]:
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer

In [3]:
# One nominal feature (city) for five observations, stored as a 5x1 column
# of strings so that each inner list is a single observation.
feature = np.array([
    [city] for city in ('Delhi', 'Mumbai', 'Delhi', 'Kolkata', 'Delhi')
])

In [4]:
feature

array([['Delhi'],
       ['Mumbai'],
       ['Delhi'],
       ['Kolkata'],
       ['Delhi']], dtype='<U7')

In [5]:
# Binarizer used below to one-hot encode the single nominal `feature` column.
encoding = LabelBinarizer()

In [6]:
# Learn the distinct cities and return one 0/1 indicator column per city;
# each row has a single 1 marking that observation's city.
encoding.fit_transform(feature)

array([[1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [8]:
# Inspect the learned class labels; their order corresponds to the columns
# of the indicator matrix returned by fit_transform.
encoding.classes_

array(['Delhi', 'Kolkata', 'Mumbai'], dtype='<U7')

In [9]:
import pandas as pd

In [12]:
# One-hot encode the city column with pandas instead of scikit-learn.
# `dtype=int` keeps the 0/1 indicator values shown below on pandas >= 2.0,
# where get_dummies otherwise defaults to boolean (True/False) columns.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,Delhi,Kolkata,Mumbai
0,1,0,0
1,0,0,1
2,1,0,0
3,0,1,0
4,1,0,0


In [13]:
# Each observation carries several labels at once (a tuple of cities),
# which is the input shape MultiLabelBinarizer is used with below.
multiclass_feature = [
    ("Delhi", "Mumbai"),
    ("Chennai", "Agra"),
    ("Delhi", "Mumbai"),
    ("Bhubaneshwar", "Mumbai"),
    ("Delhi", "Agra"),
]

In [14]:
multiclass_feature

[('Delhi', 'Mumbai'),
 ('Chennai', 'Agra'),
 ('Delhi', 'Mumbai'),
 ('Bhubaneshwar', 'Mumbai'),
 ('Delhi', 'Agra')]

In [15]:
# Binarizer for observations that hold more than one label each.
encoding_multiclass = MultiLabelBinarizer()

In [16]:
# Learn every distinct label and return one 0/1 column per label; a row has
# a 1 for each label present in that observation's tuple
# (e.g. the first row marks both Delhi and Mumbai).
encoding_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 0, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0]])

In [17]:
# Learned labels; their order corresponds to the columns of the matrix above.
encoding_multiclass.classes_

array(['Agra', 'Bhubaneshwar', 'Chennai', 'Delhi', 'Mumbai'], dtype=object)

## Encoding Ordinal Categorical Features

In [18]:
# Single ordinal feature whose categories have a natural order
# (Low < Medium < High).
dataframe = pd.DataFrame(
    data={
        "score": ["Low", "Low", "Medium", "Medium", "High"],
    }
)

In [19]:
dataframe

Unnamed: 0,score
0,Low
1,Low
2,Medium
3,Medium
4,High


In [20]:
# Category label -> integer rank; larger numbers mean a higher score.
scale_mapper = dict(Low=1, Medium=2, High=3)

In [21]:
# Translate each ordinal label to its integer rank. `Series.map` is the
# idiomatic value-lookup call and avoids the downcasting FutureWarning that
# recent pandas emits when `replace` turns an object column into integers.
# Note: a label missing from `scale_mapper` becomes NaN rather than being
# left as the original string, which makes unexpected categories visible.
dataframe["score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: score, dtype: int64

## Encoding a Feature Dictionary

In [22]:
from sklearn.feature_extraction import DictVectorizer

In [23]:
# One dict per observation: feature name -> numeric value. Observations do
# not all share the same keys.
data_dict = [
    {"Wine": 2, "Vodka": 4},
    {"Wine": 4, "Vodka": 3},
    {"Wine": 1, "Beer": 2},
    {"Wine": 2, "Beer": 2},
]

In [24]:
data_dict

[{'Wine': 2, 'Vodka': 4},
 {'Wine': 4, 'Vodka': 3},
 {'Wine': 1, 'Beer': 2},
 {'Wine': 2, 'Beer': 2}]

In [25]:
# sparse=False: fit_transform below returns a dense NumPy array.
dictvectorizer = DictVectorizer(sparse=False)

In [26]:
# Build the feature matrix: one column per distinct key across all dicts;
# a key absent from an observation shows up as 0 in that row.
features = dictvectorizer.fit_transform(data_dict)

In [27]:
features

array([[0., 4., 2.],
       [0., 3., 4.],
       [2., 0., 1.],
       [2., 0., 2.]])

In [28]:
# Column names of the matrix, in column order. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2; its replacement
# `get_feature_names_out()` returns an ndarray, so convert it to a plain
# list to keep the same type this cell produced before.
feature_names = dictvectorizer.get_feature_names_out().tolist()

In [29]:
feature_names

['Beer', 'Vodka', 'Wine']

In [30]:
# Label the dense feature matrix with the vectorizer's column names.
# NOTE(review): this rebinds `dataframe`, replacing the score frame created
# in the ordinal-encoding section; re-running that section's cells after
# this one would operate on the wrong frame.
dataframe = pd.DataFrame(features,columns=feature_names)

In [31]:
dataframe

Unnamed: 0,Beer,Vodka,Wine
0,0.0,4.0,2.0
1,0.0,3.0,4.0
2,2.0,0.0,1.0
3,2.0,0.0,2.0


In [32]:
# Same vectorizer, but producing a sparse matrix that stores only the
# non-zero entries.
sparse_dictvectorizer = DictVectorizer(sparse=True)

In [33]:
# NOTE(review): this rebinds `features`, overwriting the dense array built
# earlier with the sparse result.
features = sparse_dictvectorizer.fit_transform(data_dict)

In [34]:
# Printing the sparse matrix lists each stored entry as (row, column) value.
print(features)

  (0, 1)	4.0
  (0, 2)	2.0
  (1, 1)	3.0
  (1, 2)	4.0
  (2, 0)	2.0
  (2, 2)	1.0
  (3, 0)	2.0
  (3, 2)	2.0
