<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/01_3_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer, OneHotEncoder
import numpy as np
import pandas as pd

#### MultiLabelBinarizer

In [2]:
# Multi-label samples: every row carries a pair of state labels.
# NOTE: 'Delware' is kept exactly as spelled; the recorded outputs depend on it.
y = [
    ('Texas', 'Florida'),
    ('California', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delware', 'Florida'),
    ('Texas', 'Alabama'),
]

# Learn the label set first, then print the indicator matrix
# (one column per distinct label, 1 where the row contains that label).
ml_encoder = MultiLabelBinarizer().fit(y)
print(ml_encoder.transform(y))

# Column order of the matrix printed above.
ml_encoder.classes_

[[0 0 0 1 1]
 [1 1 0 0 0]
 [0 0 0 1 1]
 [0 0 1 1 0]
 [1 0 0 0 1]]


array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

#### LabelBinarizer

In [3]:
# Single-label samples stored as a 5x1 column vector.
x = np.array(['Texas', 'California', 'Texas', 'Delaware', 'Texas']).reshape(-1, 1)

# Binarize the labels: one indicator column per distinct label.
lb_encoder = LabelBinarizer()
lb_encoder.fit_transform(x)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [5]:
# Show the class labels the fitted LabelBinarizer learned
# (the recorded output lists them in sorted order).
lb_encoder.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

#### pandas get_dummies

In [6]:
# Take the first (and only) column of x as a 1-D array of labels.
x[:, 0]

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [7]:
# One-hot encode the 1-D label array with pandas: the result is a DataFrame
# with one indicator column per distinct label.
pd.get_dummies(x[:,0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


#### OneHotEncoder

In [21]:
# Fit a one-hot encoder on the single-column array x. As the recorded output
# shows, the result is a SciPy sparse matrix (5x3, CSR), not a dense array.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(x)

<5x3 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [22]:
# Names of the generated columns, formatted "<input feature>_<category>"
# (x0_California, x0_Delaware, x0_Texas in the recorded output).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
onehot_encoder.get_feature_names_out()

array(['x0_California', 'x0_Delaware', 'x0_Texas'], dtype=object)

In [23]:
# Categories found during fit: a list holding one array per input column
# (a single array here, since x has one column).
onehot_encoder.categories_

[array(['California', 'Delaware', 'Texas'], dtype='<U10')]

In [24]:
# transform() returns a sparse matrix; toarray() converts it to a dense
# NumPy array so the one-hot rows can be read directly.
onehot_encoder.transform(x).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [25]:
# Refit the same encoder on the two-element tuples in y (this replaces the
# categories learned from x). Each tuple position is encoded as its own
# feature, which is why the recorded feature names are prefixed x0_ and x1_.
onehot_encoder.fit_transform(y)

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [26]:
# Dense view of the encoding of y. Per the recorded output there are 5 columns:
# 3 categories from tuple position 0 followed by 2 from position 1.
onehot_encoder.transform(y).toarray()

array([[0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0.]])

In [27]:
# Column names after refitting on y: one "x0_*" name per category seen in the
# first tuple position and one "x1_*" name per category in the second.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
onehot_encoder.get_feature_names_out()

array(['x0_California', 'x0_Delware', 'x0_Texas', 'x1_Alabama',
       'x1_Florida'], dtype=object)

### Label Encoder

In [0]:
# Rebuild the 5x1 column vector of state labels (x was reassigned earlier).
x = np.array(
    [[state] for state in ('Texas', 'California', 'Texas', 'Delaware', 'Texas')]
)

In [0]:
# Wrap the flattened label column in a one-column DataFrame named 'City'.
dataset = pd.DataFrame(data={'City': x.ravel()})

In [0]:
# Preview the frame; it only has five rows, so head(10) shows all of them.
dataset.head(10)

Unnamed: 0,City
0,Texas
1,California
2,Texas
3,Delaware
4,Texas


In [0]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps each distinct label to an integer code.
label_encoder = LabelEncoder()

# LabelEncoder expects a 1-D array of labels. x is a 5x1 column vector, and
# passing it directly makes scikit-learn emit a DataConversionWarning, so it
# is flattened first. The resulting codes are unchanged.
label_encoder.fit_transform(x.ravel())

array([2, 0, 2, 1, 2])

In [0]:
# Fitted labels; a label's position in this array is the integer code
# LabelEncoder assigns to it (e.g. 'Texas' is at index 2 and is encoded as 2).
label_encoder.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

### DictVectorizer

Transform lists of feature-value mappings to vectors

In [0]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False makes the vectorizer return a dense NumPy array.
encoder = DictVectorizer(sparse=False)

# One feature->value mapping per sample. Named `feature_dicts` rather than
# `dict` so the built-in `dict` type is not shadowed for the rest of the notebook.
feature_dicts = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]

# Each distinct key becomes a column; keys missing from a sample are filled
# with 0 (see the recorded output).
encoder.fit_transform(feature_dicts)

array([[2., 0., 1.],
       [0., 1., 3.]])

In [0]:
# Feature names in the column order of the encoded array
# (recorded output: bar, baz, foo).
encoder.feature_names_

['bar', 'baz', 'foo']

In [0]:
# Mapping from feature name to its column index in the encoded array.
encoder.vocabulary_

{'foo': 2, 'bar': 0, 'baz': 1}

### Category Encoders (scikit-learn-contrib)

- [article](https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159)
- [category_encoders package (scikit-learn-contrib)](http://contrib.scikit-learn.org/categorical-encoding/index.html)

In [0]:
# Four single-feature samples, one happiness level per row.
x = [
    [level]
    for level in ("Not Happy", "Moderately Happy", "Happy", "Very Happy")
]

In [29]:
# Third-party dependency. The recorded run of this cell ended in
# ModuleNotFoundError, so the package must be installed before running,
# e.g. `%pip install category_encoders` in a cell near the top of the notebook.
import category_encoders as ce

ModuleNotFoundError: ignored

In [0]:
# Binary encoder with default settings (no explicit column list is given).
binary_encoder = ce.BinaryEncoder()

In [0]:
# Fit on the happiness levels and encode them. In the recorded output each of
# the four categories gets a distinct bit pattern across columns 0_0..0_2.
binary_encoder.fit_transform(x)

Unnamed: 0,0_0,0_1,0_2
0,0,0,1
1,0,1,0
2,0,1,1
3,1,0,0


In [0]:
# make some data
df = pd.DataFrame({
 'color':["a", "b", "a", "c"], 
 'outcome':[1, 2, 3, 2]})

# split into X and y
X = df.drop('outcome', axis = 1)
y = df.drop('color', axis = 1)


In [0]:
# Binary-encode only the 'color' column; y is passed to fit_transform as well.
# The recorded output has three bit columns: color_0, color_1, color_2.
ce_binary = ce.BinaryEncoder(cols = ['color'])
ce_binary.fit_transform(X, y)

Unnamed: 0,color_0,color_1,color_2
0,0,0,1
1,0,1,0
2,0,0,1
3,0,1,1


In [0]:
# Ordinal-encode 'color': each category is replaced by an integer.
# In the recorded output a -> 1, b -> 2, c -> 3.
ce_ordinal = ce.OrdinalEncoder(cols = ['color'])
ce_ordinal.fit_transform(X,y)

Unnamed: 0,color
0,1
1,2
2,1
3,3


In [0]:
# One-hot encode 'color' with category_encoders. In the recorded output the
# indicator columns are named color_1..color_3 rather than by category value.
# Named `ce_onehot` (not `ce_ordinal`) so the variable matches the encoder it
# holds and does not overwrite the OrdinalEncoder created earlier.
ce_onehot = ce.OneHotEncoder(cols=['color'])
ce_onehot.fit_transform(X, y)

Unnamed: 0,color_1,color_2,color_3
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1


In [0]:
# Hash-encode 'color'. In the recorded output the result has eight columns
# (col_0..col_7), with each row set in exactly one of them.
# Named `ce_hashing` (not `ce_ordinal`) so the variable matches the encoder it
# holds and does not overwrite the OrdinalEncoder created earlier.
ce_hashing = ce.HashingEncoder(cols=['color'])
ce_hashing.fit_transform(X, y)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0


In [0]:
# Target-encode 'color' using the outcome values in y. In this toy data every
# colour's mean outcome (and the overall mean) is 2, which is why the recorded
# output is a constant 2.0 column.
# Named `ce_target` (not `ce_ordinal`) so the variable matches the encoder it
# holds and does not overwrite the OrdinalEncoder created earlier.
ce_target = ce.TargetEncoder(cols=['color'])
ce_target.fit_transform(X, y)

Unnamed: 0,color
0,2.0
1,2.0
2,2.0
3,2.0
