In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from numpy import nan
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

The **sklearn.preprocessing** package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for learning algorithms.

preprocessing usually involves: 
1. changing the representation of the data (e.g. from categorical to numerical)
2. imputation of missing data
3. discretization
4. feature scaling

before preprocessing it is better to separate the predictors and the labels,
because we don't necessarily want to apply the same transformations to both of them

## Handling Categorical Attributes - LabelEncoder & OneHotEncoder

one issue with the integer representation produced by LabelEncoder is that ML algorithms will assume that two nearby values are more similar than two distant values

to fix this issue, a common solution is to create one binary attribute per category, called **one-hot encoding**
e.g. one attribute equal to 1 when the color is "blue" (and 0 otherwise), another attribute equal to 1 when the color is "green" (and 0 otherwise), and so on...

note that fit_transform() of OneHotEncoder expects a 2D array, so we first need to reshape our columns into 2D arrays
the output is a SciPy *sparse matrix* instead of a NumPy array.

with many categories, one-hot encoding produces a matrix with thousands of columns that is full of zeros except for a single 1 per row
storing all those zeros would waste memory, so a sparse matrix stores only the locations of the nonzero elements

many (not all) estimators accept such sparse matrices

you can use sparse matrices mostly like a normal 2D array, but if you really want a dense NumPy array, call the **toarray()** method or create the encoder with **OneHotEncoder(sparse=False)**

if there is a possibility that the training data is missing some categories (so that unseen categories may show up at transform time), it is often better to specify
**handle_unknown='ignore'**. when an unknown category is encountered during transform, no error is raised and the resulting one-hot columns for that feature are all zeros

@see housing for LabelEncoder + OneHotEncoder

In [3]:
# Toy frame of three cars, built column-wise: one entry per car in each column.
df = pd.DataFrame({
    'color': ['green', 'blue', 'yellow'],
    'make': ['Checrolet', 'BMW', 'Lexus'],
    'year': [2017, 2015, 2018],
})
df

Unnamed: 0,color,make,year
0,green,Checrolet,2017
1,blue,BMW,2015
2,yellow,Lexus,2018


In [57]:
# LabelEncoder replaces each distinct string with an integer code.
# Use one encoder per column: refitting a single shared encoder on 'make'
# would overwrite the classes it learned from 'color', so the color codes
# could no longer be decoded with inverse_transform().
color_encoder = LabelEncoder()
make_encoder = LabelEncoder()

df['color_encoded'] = color_encoder.fit_transform(df['color'])
df['make_encoded'] = make_encoder.fit_transform(df['make'])

# Integer codes assigned to the colors (only the last expression of a cell is displayed).
df['color_encoded'].values

array([1, 0, 2], dtype=int64)

In scikit-learn 0.19, OneHotEncoder required numerical input: string data could not be one-hot encoded directly, so we first applied LabelEncoder and then applied OneHotEncoder to the resulting integer codes.

Now (0.22), OneHotEncoder works directly with string data, but the input must be 2D - either a DataFrame or a 2D array.


In [5]:
# OneHotEncoder.fit_transform() expects 2D input, so each column is reshaped to
# shape (n_samples, 1). Two equivalent ways to reshape:
#   df['color'].values[:, np.newaxis]
#   df['color'].values.reshape(-1, 1)
# (scikit-learn 0.19 needed the integer codes from LabelEncoder as input instead
# of the raw strings.)
#
# One encoder per column: refitting a single encoder on 'make' would discard the
# categories learned from 'color', leaving the encoder unable to transform new
# color values.
encoder1 = OneHotEncoder()
make_encoder1 = OneHotEncoder()

color_1hot = encoder1.fit_transform(df['color'].values.reshape(-1, 1))
make_1hot = make_encoder1.fit_transform(df['make'].values.reshape(-1, 1))


In [6]:
# color_1hot is a SciPy sparse matrix; toarray() converts it to a dense NumPy array for display.
color_1hot.toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [7]:
# sparse=False makes the encoder return a dense NumPy array directly, so no
# toarray() call is needed afterwards.
# NOTE: scikit-learn 1.2 renamed this parameter to `sparse_output`; `sparse` is
# kept here because this notebook was run on 0.22.1.
#
# Two equivalent ways to reshape a column to the required 2D shape:
#   df['color'].values[:, np.newaxis]
#   df['color'].values.reshape(-1, 1)
#
# One encoder per column, so each keeps the categories of the column it was fitted on.
encoder1 = OneHotEncoder(sparse=False)
make_encoder1 = OneHotEncoder(sparse=False)

color_1hot = encoder1.fit_transform(df['color'].values.reshape(-1, 1))
make_1hot = make_encoder1.fit_transform(df['make'].values.reshape(-1, 1))

color_1hot

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

## imputation of missing data - SimpleImputer


most ML algorithms cannot work with missing features, so we first need to replace missing data with some appropriate fill value

SimpleImputer strategy - mean,median,constant,most_frequent

the advantage of using an imputer instead of filling NA values with pandas is that you can use the same fitted imputer to replace missing values in the test set, and also in new data once the system goes live

imputing does not always improve prediction, so we need to check via cross-validation. sometimes dropping rows or using marker values is more effective

@see housing 

In [8]:
# Show the installed scikit-learn version; the encoder behaviour described in
# this notebook depends on it. (sklearn is imported in the top import cell.)
print(sklearn.__version__)

0.22.1


In [24]:
# Numeric example: a 5x3 matrix with one missing value in each of the first two
# columns. It is defined once and never overwritten, so every strategy below
# starts from the same raw input (fit_transform returns a new array).
X = np.array([[nan, 0, 3],
              [3, 7, 9],
              [3, 5, 2],
              [4, nan, 6],
              [8, 8, 1]])

# strategy='median': each NaN is replaced by the median of its column.
imputer_median = SimpleImputer(strategy='median')
X_median = imputer_median.fit_transform(X)
display(X_median)

# strategy='mean': each NaN is replaced by the mean of its column.
imputer_mean = SimpleImputer(strategy='mean')
X2 = imputer_mean.fit_transform(X)
display(X2)

# Categorical example: a single column of strings with one missing entry.
data_raw = np.array(['a', 'b', 'a', np.nan], dtype=object).reshape(-1, 1)

# strategy='most_frequent': the missing entry becomes the most common value ('a').
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
display(imputer_most_frequent.fit_transform(data_raw))

# strategy='constant': the missing entry becomes the given fill_value ('b').
imputer = SimpleImputer(strategy='constant', fill_value='b')
data = imputer.fit_transform(data_raw)
display(data)

array([[3.5, 0. , 3. ],
       [3. , 7. , 9. ],
       [3. , 5. , 2. ],
       [4. , 6. , 6. ],
       [8. , 8. , 1. ]])

array([[4.5, 0. , 3. ],
       [3. , 7. , 9. ],
       [3. , 5. , 2. ],
       [4. , 5. , 6. ],
       [8. , 8. , 1. ]])

array([['a'],
       ['b'],
       ['a'],
       ['a']], dtype=object)

array([['a'],
       ['b'],
       ['a'],
       ['b']], dtype=object)

## Discretization - KBinsDiscretizer

discretization (binning or quantization) provides a way to partition continuous features into discrete value

**KBinsDiscretizer strategy** : {'uniform', 'quantile', 'kmeans'}, (default='quantile')

uniform :  all bins in each feature have identical widths
    
quantile : all bins in each feature have the same number of points (the bin widths are not necessarily equal with this option)
    
kmeans : values in each bin have the same nearest center of a 1D k-means cluster.


**KBinsDiscretizer encode** : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')

onehot : Encode the transformed result with one-hot encoding and return a sparse matrix. Ignored features are always
        stacked to the right.
        
onehot-dense: Encode the transformed result with one-hot encoding and return a dense array. Ignored features are always
        stacked to the right.
        
ordinal: Return the bin identifier encoded as an integer value.


In [32]:
# Four samples with four continuous features each.
X = [
    [-2, 1, -1, -1],
    [-1, 2, -3, 2],
    [0, 1, -2, 0.5],
    [1, 4, -3, -0.5],
]

# Three bins per feature with the default binning strategy; 'onehot-dense'
# returns the one-hot encoded bin membership as a dense array.
est = KBinsDiscretizer(encode='onehot-dense', n_bins=3).fit(X)
est.transform(X)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


array([[1., 0., 0., 1., 0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1., 1., 0., 0., 1., 0.]])

In [55]:
# Same four samples as before.
X = [
    [-2, 1, -1, -1],
    [-1, 2, -3, 2],
    [0, 1, -2, 0.5],
    [1, 4, -3, -0.5],
]

# 'ordinal' encoding: every value is replaced by the integer index of its bin.
est = KBinsDiscretizer(encode='ordinal', n_bins=3).fit(X)
x_trasformed = est.transform(X)

# Show the learned bin edges per feature next to the binned data.
(est.bin_edges_, x_trasformed)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


(array([array([-2., -1.,  0.,  1.]), array([1., 2., 4.]),
        array([-3., -2., -1.]), array([-1. , -0.5,  0.5,  2. ])],
       dtype=object), array([[0., 0., 1., 0.],
        [1., 1., 0., 2.],
        [2., 0., 1., 2.],
        [2., 1., 0., 1.]]))

In [56]:
# Same four samples as before.
X = [
    [-2, 1, -1, -1],
    [-1, 2, -3, 2],
    [0, 1, -2, 0.5],
    [1, 4, -3, -0.5],
]

# strategy='uniform': the three bins of each feature all have the same width.
est = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3).fit(X)
x_trasformed = est.transform(X)

# Show the learned bin edges per feature next to the binned data.
(est.bin_edges_, x_trasformed)

(array([array([-2., -1.,  0.,  1.]), array([1., 2., 3., 4.]),
        array([-3.        , -2.33333333, -1.66666667, -1.        ]),
        array([-1.,  0.,  1.,  2.])], dtype=object), array([[0., 0., 2., 0.],
        [1., 1., 0., 2.],
        [2., 0., 1., 1.],
        [2., 2., 0., 0.]]))

In [52]:
# A single feature holding the values 1..5, one sample per row.
X = [[value] for value in range(1, 6)]

# Equal-width bins over [1, 5], encoded as integer bin indices.
est = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3).fit(X)
x_trasformed = est.transform(X)

(est.bin_edges_, x_trasformed)

(array([array([1.        , 2.33333333, 3.66666667, 5.        ])],
       dtype=object), array([[0.],
        [0.],
        [1.],
        [2.],
        [2.]]))

In [54]:
# A single feature with six samples; the last value (8) sits apart from the rest.
X = [[value] for value in (1, 2, 3, 4, 5, 8)]

# Default binning strategy with ordinal encoding.
est = KBinsDiscretizer(encode='ordinal', n_bins=3).fit(X)
x_trasformed = est.transform(X)

(est.bin_edges_, x_trasformed)

(array([array([1.        , 2.66666667, 4.33333333, 8.        ])],
       dtype=object), array([[0.],
        [0.],
        [1.],
        [1.],
        [2.],
        [2.]]))

## Feature Scaling - MinMaxScaler and StandardScaler


ML algorithms generally do not perform well when the input numerical attributes have very different scales
there are 2 common ways to get all attributes to have the same scale:
**min-max scaling and standardization**

**MinMaxScaler** values are shifted and rescaled so that they end up ranging from 0 to 1
we do this by **(x - min value) / (max value - min value)**
the **feature_range** hyper-parameter lets you change the range if you do not want 0-1

**StandardScaler**   **(x - mean value) / standard deviation**
unlike min-max scaling, standardization does not bound values to a specific range, which may be a problem for some algorithms that expect input values in the range 0-1
standardization is much less affected by outliers
the resulting distribution after standardization has zero mean and unit variance

@see housing

In [61]:
# Two features on different scales, four samples.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

# Worked example for the first column (min = -1, max = 1):
#   (-1   - (-1)) / (1 - (-1)) = 0   / 2 = 0
#   (-0.5 - (-1)) / (1 - (-1)) = 0.5 / 2 = 0.25
scaler = MinMaxScaler().fit(data)
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [64]:
# Same two-feature data as in the previous MinMaxScaler example.
data = [[-1, 2],
        [-0.5, 6],
        [0, 10],
        [1, 18]]

# feature_range is documented as a (min, max) tuple; newer scikit-learn releases
# validate the type and reject a list, so a tuple is used to rescale into [0, 5].
scaler = MinMaxScaler(feature_range=(0, 5))
scaler.fit_transform(data)

array([[0.  , 0.  ],
       [1.25, 1.25],
       [2.5 , 2.5 ],
       [5.  , 5.  ]])

In [65]:
# Same two-feature data as in the MinMaxScaler examples.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

# Standardize each column; unlike min-max scaling the result is not bounded to a fixed range.
scaler = StandardScaler().fit(data)
scaler.transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

# Scikit-Learn's Column Transformer

applies transformers to columns of an array or pandas DataFrame
This estimator allows different columns or column subsets of the input to be transformed separately and the results combined into a single feature space

This is particularly handy for datasets that contain heterogeneous data types, since we may want to scale the numeric features and one-hot encode the categorical ones

The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list

Columns of the original feature matrix that are not specified are dropped from the resulting transformed feature matrix, unless they are marked as **passthrough** (e.g. with **remainder='passthrough'**)

Those columns specified with passthrough are added at the right to the output of the transformers

in classifier, the **score()** method returns the mean accuracy on the given test data

@see housing

In [12]:
# Raw 2x5 matrix with one missing value in each of the last two columns.
X = np.array(
    [[0, 1, 1, np.nan, 2],
     [1, 1, 0, 0, np.nan]],
    dtype=float,
)
display(X)

# Columns 0-1 are scaled, columns 3-4 (slice(3, 5)) are imputed; column 2 is not
# listed in any transformer, so it does not appear in the result.
ct = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), [0, 1]),
    ('imputer', SimpleImputer(), slice(3, 5)),
])

X = ct.fit_transform(X)
display(X)

array([[ 0.,  1.,  1., nan,  2.],
       [ 1.,  1.,  0.,  0., nan]])

array([[-1.,  0.,  0.,  2.],
       [ 1.,  0.,  0.,  2.]])

In [29]:
# Fixed seed so the train/test split and the random forest give the same score
# on every run of this cell.
RANDOM_STATE = 42

# Get the titanic dataset.
titanic = sns.load_dataset('titanic')
display(titanic.head())

# Preprocessing pipeline for the numerical data: fill missing values with the
# column median, then standardize.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Preprocessing pipeline for the categorical data: fill missing values with the
# marker 'missing', then one-hot encode.
# handle_unknown='ignore' keeps scoring from failing when a category (for
# example the 'missing' marker) occurs in the test split but not in the
# training split; such rows get all-zero columns for that feature.
category_features = ['embarked', 'sex', 'pclass']
category_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own pipeline; columns not listed are dropped.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', category_transformer, category_features),
])

# Full model: preprocessing followed by the classifier.
clf = Pipeline([
    ('pre', preprocessor),
    ('cls', RandomForestClassifier(n_estimators=20, random_state=RANDOM_STATE)),
])

# Separate the predictors from the label.
X = titanic.drop('survived', axis=1)
y = titanic['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit on the training split and report the mean accuracy on the held-out split.
clf.fit(X_train, y_train)
print(np.round(clf.score(X_test, y_test), 3))



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


0.832


# Scikit-Learn's Custom Transformer

although scikit-learn provides many useful transformers, you will need to write your own for tasks such as custom cleanup operations or combining specific attributes

you will want your transformer to work seamlessly with scikit-learn functionalities
you need to implement three methods:
fit() - returns self
transform()
fit_transform()

you can get fit_transform() for free by adding **TransformerMixin** as a base class
if you add **BaseEstimator** as a base class you will get 2 extra methods **get_params() and set_params()** that are useful for hyperparameter tuning

@see housing
