# Ordinal Encoding

In [80]:
# imports for the ordinal / label encoding examples below
import pandas as pd
from pandas import read_csv
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder

## E.x 1 (OrdinalEncoder)

In [1]:
# example of an ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

# one categorical feature (a single column) holding three colour labels
data = asarray([[colour] for colour in ('red', 'green', 'blue')])
# fit the encoder and map every label to its ordinal code in one step
encoder = OrdinalEncoder()
result = encoder.fit_transform(data)
# show the raw labels first, then their encoded values
print(data, result, sep='\n')

[['red']
 ['green']
 ['blue']]
[[2.]
 [1.]
 [0.]]


## E.x 2 (OrdinalEncoder)

In [109]:
# Load the tab-separated sample table used by the remaining examples.
# `read_csv` is the name imported at the top of the notebook (`from pandas import read_csv`);
# the previous `pd.read_csv` relied on a `pd` alias that was not imported at that point.
# NOTE(review): the path is absolute (looks like a Colab upload location) - adjust it when
# running the notebook elsewhere.
df = read_csv('/content/test.txt', sep='\t')
df

Unnamed: 0,No,Name,Sex,Blood,Grade,Height,Study,Success,MultiLabel
0,1,Tom,M,O,56,160,Math,yes,High
1,2,Harry,M,A,76,192,Math,no,Low
2,3,John,M,A,45,178,English,no,Medium
3,4,Nancy,F,B,78,157,Biology,yes,High
4,5,Mike,M,O,79,167,Math,no,Low
5,6,Kate,F,AB,66,156,English,yes,Medium
6,7,Mary,F,O,99,166,Science,yes,High


In [56]:
# Ordinal-encode the two categorical input columns.
ordinal_encoder = OrdinalEncoder()
X = ordinal_encoder.fit_transform(df[['Sex', 'Blood']])
# Print, separated by blank lines: the encoded matrix, the categories learned
# for each column, and the labels recovered by inverting the transform.
print(
    X,
    ordinal_encoder.categories_,
    ordinal_encoder.inverse_transform(X),
    sep='\n\n',
)

[[1. 3.]
 [1. 0.]
 [1. 0.]
 [0. 2.]
 [1. 3.]
 [0. 1.]
 [0. 3.]]

[array(['F', 'M'], dtype=object), array(['A', 'AB', 'B', 'O'], dtype=object)]

[['M' 'O']
 ['M' 'A']
 ['M' 'A']
 ['F' 'B']
 ['M' 'O']
 ['F' 'AB']
 ['F' 'O']]


We can see that `Sex` is encoded as 0/1 (F=0, M=1) and `Blood` as 0–3 (A=0, AB=1, B=2, O=3), following the sorted order shown in `categories_`.



## E.x 3 (LabelEncoder)

In [78]:
# LabelEncoder encodes target labels with values between 0 and n_classes-1.
# Here it is applied to the target column `Success`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Success'])
# One line each: encoded target, learned classes, labels recovered from the codes.
print(y, label_encoder.classes_, label_encoder.inverse_transform(y), sep='\n')

[1 0 0 1 0 1 1]
['no' 'yes']
['yes' 'no' 'no' 'yes' 'no' 'yes' 'yes']


## E.x 4 (LabelBinarizer) One vs All

In [83]:
# Binarize the three-class `MultiLabel` column: one indicator column per class.
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(df['MultiLabel'])
# Print, separated by blank lines: the indicator matrix, the learned classes,
# and the labels recovered by inverting the transform.
print(
    y,
    label_binarizer.classes_,
    label_binarizer.inverse_transform(y),
    sep='\n\n',
)

[[1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]]

['High' 'Low' 'Medium']

['High' 'Low' 'Medium' 'High' 'Low' 'Medium' 'High']


## E.x 5 (MultiLabelBinarizer) All vs All

In [101]:
import numpy as np

In [113]:
multilabel_binarizer = MultiLabelBinarizer()
# Each row of Y is one sample's set of labels: its `Success` value and its `MultiLabel` value.
# The columns are selected by name rather than by position (`iloc[:, -2:]`), so the cell keeps
# picking the intended labels even if columns are added to or reordered in `df`.
Y = df[['Success', 'MultiLabel']].to_numpy()
y = multilabel_binarizer.fit_transform(Y)
print(y)
print()
# classes_ lists every distinct label seen across both columns
print(multilabel_binarizer.classes_)
print()
# recover one tuple of labels per sample from the indicator matrix
print(multilabel_binarizer.inverse_transform(y))

[[1 0 0 0 1]
 [0 1 0 1 0]
 [0 0 1 1 0]
 [1 0 0 0 1]
 [0 1 0 1 0]
 [0 0 1 0 1]
 [1 0 0 0 1]]

['High' 'Low' 'Medium' 'no' 'yes']

[('High', 'yes'), ('Low', 'no'), ('Medium', 'no'), ('High', 'yes'), ('Low', 'no'), ('Medium', 'yes'), ('High', 'yes')]


# One-Hot Encoding

In [61]:
from sklearn.preprocessing import OneHotEncoder

## E.x 1

In [2]:
from numpy import asarray
# define data
data = asarray([['red'], ['green'], ['blue']])
print(data)
# define one hot encoding
# NOTE: the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Keeping the default (sparse) output and densifying it with `.toarray()`
# gives the same dense array on every scikit-learn version.
encoder = OneHotEncoder()
# transform data
onehot = encoder.fit_transform(data).toarray()
print(onehot)

[['red']
 ['green']
 ['blue']]
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


## E.x 2

In [62]:
# Re-display the sample DataFrame whose `Blood` column is one-hot encoded in the next cell.
df

Unnamed: 0,No,Name,Sex,Blood,Grade,Height,Study,Success
0,1,Tom,M,O,56,160,Math,yes
1,2,Harry,M,A,76,192,Math,no
2,3,John,M,A,45,178,English,no
3,4,Nancy,F,B,78,157,Biology,yes
4,5,Mike,M,O,79,167,Math,no
5,6,Kate,F,AB,66,156,English,yes
6,7,Mary,F,O,99,166,Science,yes


In [66]:
# One-hot encode the `Blood` column: one indicator column per blood type.
# NOTE: the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Keeping the default (sparse) output and densifying it with `.toarray()`
# gives the same dense array on every scikit-learn version.
encoder = OneHotEncoder()
X = encoder.fit_transform(df[['Blood']]).toarray()
print(X)
print()
# categories learned for the column, in the order of the indicator columns
print(encoder.categories_)
print()
# map the indicator rows back to the original labels
print(encoder.inverse_transform(X))

[[0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]

[array(['A', 'AB', 'B', 'O'], dtype=object)]

[['O']
 ['A']
 ['A']
 ['B']
 ['O']
 ['AB']
 ['O']]


## E.x 3

In [73]:
# pandas equivalent of the one-hot encoding above: one indicator column per blood type
pd.get_dummies(df['Blood'])

Unnamed: 0,A,AB,B,O
0,0,0,0,1
1,1,0,0,0
2,1,0,0,0
3,0,0,1,0
4,0,0,0,1
5,0,1,0,0
6,0,0,0,1


# Dummy variable encoding

In [67]:
from sklearn.preprocessing import OneHotEncoder

## E.x 1

In [3]:
# example of a dummy variable encoding
from numpy import asarray
# define data
data = asarray([['red'], ['green'], ['blue']])
print(data)
# define one hot encoding; drop='first' removes the first category's column, so that
# category is represented by an all-zero row
# NOTE: the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Keeping the default (sparse) output and densifying it with `.toarray()`
# gives the same dense array on every scikit-learn version.
encoder = OneHotEncoder(drop='first')
# transform data
onehot = encoder.fit_transform(data).toarray()
print(onehot)

[['red']
 ['green']
 ['blue']]
[[0. 1.]
 [1. 0.]
 [0. 0.]]


## E.x 2

In [68]:
# Re-display the sample DataFrame whose `Blood` column is dummy encoded in the next cell.
df

Unnamed: 0,No,Name,Sex,Blood,Grade,Height,Study,Success
0,1,Tom,M,O,56,160,Math,yes
1,2,Harry,M,A,76,192,Math,no
2,3,John,M,A,45,178,English,no
3,4,Nancy,F,B,78,157,Biology,yes
4,5,Mike,M,O,79,167,Math,no
5,6,Kate,F,AB,66,156,English,yes
6,7,Mary,F,O,99,166,Science,yes


In [70]:
# Dummy-encode the `Blood` column: like one-hot encoding, but the first category's
# indicator column is dropped, so that category becomes an all-zero row.
# NOTE: the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Keeping the default (sparse) output and densifying it with `.toarray()`
# gives the same dense array on every scikit-learn version.
encoder = OneHotEncoder(drop='first')
X = encoder.fit_transform(df[['Blood']]).toarray()
print(X)
print()
# categories_ still lists every category, including the dropped first one
print(encoder.categories_)
print()
# map the indicator rows back to the original labels
print(encoder.inverse_transform(X))

[[0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]

[array(['A', 'AB', 'B', 'O'], dtype=object)]

[['O']
 ['A']
 ['A']
 ['B']
 ['O']
 ['AB']
 ['O']]


## E.x 3

In [72]:
# pandas equivalent of the dummy encoding above: the first category's column is dropped
pd.get_dummies(data=df['Blood'], drop_first=True)

Unnamed: 0,AB,B,O
0,0,0,1
1,0,0,0
2,0,0,0
3,0,1,0
4,0,0,1
5,1,0,0
6,0,0,1


## E.x 4

In [None]:
def add_dummy_columns(df, cat_cols, dummy_na=True):
    """Replace categorical columns with drop-first dummy (indicator) columns.

    For every name in ``cat_cols`` that is a column of ``df``, the column is
    removed and its dummy columns (named ``<col>_<category>``) are appended on
    the right. The first category of each column is dropped
    (``drop_first=True``), so it is represented by all-zero indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cat_cols : iterable of str
        Names of the columns to encode. Names that are not present in the
        frame (including a name repeated after it was already encoded) are
        skipped.
    dummy_na : bool, default True
        Add a column to indicate NaNs; if False, NaNs are ignored.

    Returns
    -------
    pandas.DataFrame
        The frame with the listed columns replaced by their dummy columns.
    """
    encoded = df
    for col in cat_cols:
        if col not in encoded.columns:
            # best-effort: skip unknown names explicitly instead of hiding
            # every possible error behind a bare `except`
            continue
        dummies = pd.get_dummies(encoded[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=dummy_na)
        # drop the original column and append its indicator columns
        encoded = pd.concat([encoded.drop(col, axis=1), dummies], axis=1)
    return encoded