# sklearn.feature_extraction

## DictVectorizer
Implement one-of-K or one-hot coding for categorical features

In [1]:
org_data = [{'name': 'Ann', 'age': 30},
           {'name': 'Dave', 'age': 32},
           {'name': 'Kavin', 'age': 31}]

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

print('New data\n', vec.fit_transform(org_data).toarray())

print('\nFeatures\n', vec.get_feature_names())

New data
 [[30.  1.  0.  0.]
 [32.  0.  1.  0.]
 [31.  0.  0.  1.]]

Features
 ['age', 'name=Ann', 'name=Dave', 'name=Kavin']


## Text feature extraction

### CountVectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

corprus = ['I see a cat.',
           'It is yellow and brown.',
           'His house is cozy and beautiful.'
          ]

vec = CountVectorizer()

X = vec.fit_transform(corprus)

# print(X)

print(vec.get_feature_names())

print(X.toarray())

['and', 'beautiful', 'brown', 'cat', 'cozy', 'his', 'house', 'is', 'it', 'see', 'yellow']
[[0 0 0 1 0 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 1 0 1]
 [1 1 0 0 1 1 1 1 0 0 0]]


## TfidfVectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

corprus = ['I see a cat.',
           'It is yellow and brown.',
           'His house is cozy and beautiful.'
          ]

vec = TfidfVectorizer()

X = vec.fit_transform(corprus)

# print(X)

print(vec.get_feature_names())

print(X.toarray())

['and', 'beautiful', 'brown', 'cat', 'cozy', 'his', 'house', 'is', 'it', 'see', 'yellow']
[[0.         0.         0.         0.70710678 0.         0.
  0.         0.         0.         0.70710678 0.        ]
 [0.37302199 0.         0.49047908 0.         0.         0.
  0.         0.37302199 0.49047908 0.         0.49047908]
 [0.3349067  0.44036207 0.         0.         0.44036207 0.44036207
  0.44036207 0.3349067  0.         0.         0.        ]]


# sklearn.preprocessing

## Standardization
Gaussian with zero mean and unit variance

### scale
Standardize a dataset along any axis.

scale()

minmax_scale()

maxabs_scale()

robust_scale()


In [4]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X_train)

print(X_train.mean(axis=0))

print(X_train.std(axis=0))

print(X_scaled.mean(axis=0))

print(X_scaled.std(axis=0))

[1.         0.         0.33333333]
[0.81649658 0.81649658 1.24721913]
[0. 0. 0.]
[1. 1. 1.]


### StandardScaler
Standardize features by removing the mean and scaling to unit variance.

Note: It is possible to disable either centering or scaling by either passing with_mean=False or with_std=False

In [5]:
from sklearn.preprocessing import StandardScaler
import numpy as np

X_train = np.random.rand(12).reshape(3,4)

scaler = StandardScaler()

print('Contructor: ', scaler)

scaler.fit(X_train)

print('\n Mean: \n', scaler.mean_)
print('\n Scale: \n', scaler.scale_)

print('\n Scaler.transform:\n', scaler.transform(X_train))


Contructor:  StandardScaler(copy=True, with_mean=True, with_std=True)

 Mean: 
 [0.47822047 0.50407115 0.67461575 0.42355057]

 Scale: 
 [0.29220594 0.25149107 0.19829195 0.38773293]

 Scaler.transform:
 [[ 1.34269301 -1.17613446 -1.24955992  1.36467655]
 [-1.0559003  -0.09202895  0.05123834 -1.00365343]
 [-0.28679271  1.2681634   1.19832159 -0.36102312]]


### MinMaxScaler
Transform features by scaling each feature to a given range, [0, 1] by default

In [6]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

X_train = np.random.rand(12).reshape(3,4)

scaler = MinMaxScaler(feature_range=(0, 1))

print('\n Constructor: ', scaler)

scaler.fit(X_train)

print('\n Transform: \n', scaler.transform(X_train))


 Constructor:  MinMaxScaler(copy=True, feature_range=(0, 1))

 Transform: 
 [[0.         1.         0.         0.88879126]
 [1.         0.         1.         0.        ]
 [0.60075368 0.30165653 0.44165936 1.        ]]


### MaxAbsScaler
Transform each feature by its maximum absolute value. Outputs are in the fixed range [-1, 1]

In [7]:
from sklearn.preprocessing import MaxAbsScaler
import numpy as np

X_train = np.random.rand(12).reshape(3, 4)

scaler = MaxAbsScaler()

print('\n Constructor: ', scaler)

scaler.fit(X_train)

print('\n Transform: \n', scaler.transform(X_train))



 Constructor:  MaxAbsScaler(copy=True)

 Transform: 
 [[1.         0.35022627 1.         0.02544788]
 [0.43130671 1.         0.99441364 1.        ]
 [0.62889325 0.32092059 0.24544743 0.56194918]]


### RobustScaler
Transform features using statistics that are robust to outliers.

In [8]:
from sklearn.preprocessing import RobustScaler
X_train = [[ 1., -2.,  2.],
     [ -2.,  1.,  3.],
     [ 4.,  1., -2.]]
scaler = RobustScaler()

print('\n Constructor:\n', scaler)

scaler.fit(X_train)

print('\n Transform: \n', scaler.transform(X_train))


 Constructor:
 RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

 Transform: 
 [[ 0.  -2.   0. ]
 [-1.   0.   0.4]
 [ 1.   0.  -1.6]]


### Note
- MinMaxScaler is recommended for sparse data.

- scale() and StandardScaler can support well sparse data as long as *with_mean=False*

- If data contrains many outliers, robust_scale() and RobustScaler are recommended.

## Non-linear transformation

### QuantileTransformer
- Map to a Uniform distribution.
- Spread out the most frequent values
- Reduce the impact of outliers

In [9]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()

X, y = iris.data, iris.target

quantile_transformer = QuantileTransformer(random_state=0)

X_t = quantile_transformer.fit_transform(X)

print(X_t[:5])


[[0.24174174 0.85585586 0.11411411 0.12762763]
 [0.12412412 0.46646647 0.11411411 0.12762763]
 [0.06406406 0.67117117 0.04704705 0.12762763]
 [0.04354354 0.59059059 0.2012012  0.12762763]
 [0.17767768 0.88938939 0.11411411 0.12762763]]


### PowerTransformer
- Make data more Gausian-like

- Be useful for modeling issues related to heteroscedasticity (non-constant variance)

- Algorithms:

  Box-Cox can only be applied to strictly positive data.

  Yeo-Johnson supports both positive or negative data (Default)

In [10]:
from sklearn.preprocessing import PowerTransformer
import numpy as np

X = np.random.rand(12).reshape(3,4)

print('\n Orginial data:\n', X)

power_transformer = PowerTransformer()

X_t = power_transformer.fit_transform(X)

print('\n Transform:\n', X_t)


 Orginial data:
 [[0.24230513 0.68013085 0.3199994  0.45674358]
 [0.02086126 0.86925737 0.67650581 0.74300248]
 [0.27964695 0.79594602 0.24944486 0.20798391]]

 Transform:
 [[ 0.38510158 -1.26549469 -0.28691561  0.00699068]
 [-1.37101275  1.17945787  1.34273241  1.22123457]
 [ 0.98591117  0.08603682 -1.0558168  -1.22822525]]


## Normalization
The process of scaling individual samples to have unit norm.

normalize()

Normalizer

In [11]:
from sklearn.preprocessing import normalize, Normalizer

X_train = [[ 1., -2.,  2.],
     [ -2.,  1.,  3.],
     [ 4.,  1., -2.]]

X_normalize = normalize(X, norm='l2')

print('\n normalize(): ', X_normalize)

normalizer = Normalizer()
X_n = normalizer.fit_transform(X)

print('\n Normalizer: ', X_n)



 normalize():  [[0.26559604 0.74550656 0.35075846 0.5006468 ]
 [0.01569911 0.6541585  0.50910357 0.55914555]
 [0.30934462 0.88047313 0.27593517 0.23007119]]

 Normalizer:  [[0.26559604 0.74550656 0.35075846 0.5006468 ]
 [0.01569911 0.6541585  0.50910357 0.55914555]
 [0.30934462 0.88047313 0.27593517 0.23007119]]


## Encode categorical features

### OrdinalEncoder

In [12]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

X = [['male', 'from US', 'uses Safari'], 
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

X_enc = enc.transform(X)
print(X_enc)

[[1. 1. 1.]
 [0. 0. 0.]]


### OneHotEncoder

In [13]:

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

X = [['male', 'from US', 'uses Safari'], 
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

X_enc = enc.transform(X).toarray()
print(X_enc)

print('\n Categories: \n', enc.categories_)

[[0. 1. 0. 1. 0. 1.]
 [1. 0. 1. 0. 1. 0.]]

 Categories: 
 [array(['female', 'male'], dtype=object), array(['from Europe', 'from US'], dtype=object), array(['uses Firefox', 'uses Safari'], dtype=object)]


## Discretization
Divide continuous features into discrete values.

### KBinsDiscretizer

In [14]:
from sklearn.preprocessing import KBinsDiscretizer
X = np.array([[ -3., 5., 15 ],
              [  0., 6., 14 ],
              [  6., 3., 11 ]])

enc = KBinsDiscretizer(n_bins=[3, 2, 2], encode='ordinal').fit(X)

print(enc.transform(X))


[[0. 1. 1.]
 [1. 1. 1.]
 [2. 0. 0.]]


### Binarizer

In [15]:
from sklearn.preprocessing import Binarizer

X = [[ 1., -2.,  2.],
     [ -2.,  1.,  3.],
     [ 4.,  1., -2.]]

bin = Binarizer().fit(X)

print(bin.transform(X))

[[1. 0. 1.]
 [0. 1. 1.]
 [1. 1. 0.]]
