In [2]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
import numpy as np
import pandas as pd

#### Multi labels

In [14]:
y = [('Texas', 'Florida'), 
    ('California', 'Alabama'), 
    ('Texas', 'Florida'), 
    ('Delware', 'Florida'), 
    ('Texas', 'Alabama')]

one_hot = MultiLabelBinarizer()

# One-hot encode data
print(one_hot.fit_transform(y))

one_hot.classes_

[[0 0 0 1 1]
 [1 1 0 0 0]
 [0 0 0 1 1]
 [0 0 1 1 0]
 [1 0 0 0 1]]


array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

#### One hot

In [15]:
x = np.array([['Texas'], 
              ['California'], 
              ['Texas'], 
              ['Delaware'], 
              ['Texas']])
one_hot = LabelBinarizer()

# One-hot encode data
one_hot.fit_transform(x)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [16]:
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [17]:
x[:, 0]

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [18]:
pd.get_dummies(x[:,0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


### Label Encoder

In [18]:
x = np.array([['Texas'], 
              ['California'], 
              ['Texas'], 
              ['Delaware'], 
              ['Texas']])

In [19]:
dataset = pd.DataFrame({'City': x[:, 0]})

In [20]:
dataset.head(10)

Unnamed: 0,City
0,Texas
1,California
2,Texas
3,Delaware
4,Texas


In [23]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit_transform(x)

array([2, 0, 2, 1, 2])

In [24]:
label_encoder.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

### DictVectorizer

Transform lists of feature-value mappings to vectors

In [26]:
from sklearn.feature_extraction import DictVectorizer

encoder = DictVectorizer(sparse=False)
dict = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
encoder.fit_transform(dict)

array([[2., 0., 1.],
       [0., 1., 3.]])

In [27]:
encoder.feature_names_

['bar', 'baz', 'foo']

In [28]:
encoder.vocabulary_

{'foo': 2, 'bar': 0, 'baz': 1}

### Scipy Category Encoder

- [article](https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159)
- [scipy category encoder package](http://contrib.scikit-learn.org/categorical-encoding/index.html)

In [32]:
x = [
        ["Not Happy"], 
        ["Moderately Happy"],
        ["Happy"],
        ["Very Happy"],
]

In [33]:
import category_encoders as ce

In [34]:
binary_encoder = ce.BinaryEncoder()

In [35]:
binary_encoder.fit_transform(x)

Unnamed: 0,0_0,0_1,0_2
0,0,0,1
1,0,1,0
2,0,1,1
3,1,0,0


In [36]:
# make some data
df = pd.DataFrame({
 'color':["a", "b", "a", "c"], 
 'outcome':[1, 2, 3, 2]})

# split into X and y
X = df.drop('outcome', axis = 1)
y = df.drop('color', axis = 1)


In [38]:
ce_binary = ce.BinaryEncoder(cols = ['color'])
ce_binary.fit_transform(X, y)

Unnamed: 0,color_0,color_1,color_2
0,0,0,1
1,0,1,0
2,0,0,1
3,0,1,1


In [39]:
ce_ordinal = ce.OrdinalEncoder(cols = ['color'])
ce_ordinal.fit_transform(X,y)

Unnamed: 0,color
0,1
1,2
2,1
3,3


In [40]:
ce_ordinal = ce.OneHotEncoder(cols = ['color'])
ce_ordinal.fit_transform(X,y)

Unnamed: 0,color_1,color_2,color_3
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1


In [41]:
ce_ordinal = ce.HashingEncoder(cols = ['color'])
ce_ordinal.fit_transform(X,y)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0


In [42]:
ce_ordinal = ce.TargetEncoder(cols = ['color'])
ce_ordinal.fit_transform(X,y)

Unnamed: 0,color
0,2.0
1,2.0
2,2.0
3,2.0
