In [1]:
import pandas as pd
import numpy as np

# Encoding Nominal Categorical Features

In [2]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

## Nominal feature: one state name per observation, shaped as a single column
X = np.array(['Texas', 'California', 'Texas', 'Delaware', 'Texas']).reshape(-1, 1)

## Binarizer that emits one indicator column per distinct label
one_hot = LabelBinarizer()

## Learn the labels and encode in one step
## (the displayed result has columns California, Delaware, Texas)
one_hot.fit_transform(X)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [3]:
## pandas can produce the same one-hot encoding directly.
## dtype=int pins the indicator columns to integer 0/1: pandas >= 2.0 returns
## boolean (True/False) columns by default, which would not match the 0/1
## table shown for this cell.
pd.get_dummies(X[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [4]:
## Creating a multi-label feature: every observation carries two state labels
## ('Delaware' was previously misspelled 'Delware', which produced a class name
## inconsistent with the single-label example above)
Y = [
    ('Texas', 'Florida'),
    ('California', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delaware', 'Florida'),
    ('Texas', 'Alabama'),
]

## Creating multi-label encoder
one_hot_multi = MultiLabelBinarizer()

## Encoding features: a row has a 1 in the column of each label it carries
one_hot_multi.fit_transform(Y)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

# Encoding Ordinal Categorical Features

In [5]:
## Creating a data-frame with one ordinal feature
data = pd.DataFrame({'Score': ['Low', 'Low', 'High', 'Medium', 'Low', 'High']})

## Creating the mapper: each label is assigned its rank (Low < Medium < High)
mapper = {'Low': 0, 'Medium': 1, 'High': 2}

## Encoding labels with the mapper.
## Series.map is used instead of Series.replace: replace() on a string column
## relies on implicit object -> int downcasting, which pandas 2.2 deprecates,
## and it silently leaves unknown labels as strings. map() yields an integer
## Series here and turns any label missing from the mapper into NaN, so bad
## values are visible. The original data frame is not modified.
score_encoded = data['Score'].map(mapper)
score_encoded

0    0
1    0
2    2
3    1
4    0
5    2
Name: Score, dtype: int64

# Encoding Dictionary of Features

In [6]:
from sklearn.feature_extraction import DictVectorizer

## One dictionary per observation, mapping a feature name to its value
data_dict = [
    {'Red': 2, 'Blue': 4},
    {'Red': 4, 'Blue': 3},
    {'Red': 1, 'Yellow': 2},
    {'Red': 2, 'Yellow': 2},
]

## Dictionary vectorizer configured to return a dense array
dictvectorizer = DictVectorizer(sparse=False)

## Fit on the dictionaries and build the feature matrix
## (a key absent from an observation shows up as 0 in the displayed result)
features = dictvectorizer.fit_transform(data_dict)
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [7]:
## Retrieving the feature names, in the column order of the feature matrix.
## get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
## get_feature_names_out() is its replacement. It returns an ndarray, so
## tolist() keeps feature_names a plain list as before.
feature_names = dictvectorizer.get_feature_names_out().tolist()

## Creating data-frame with labelled columns
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0
