# Machine Learning Preprocessing Workshop

### 1. Imports

In [43]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor

from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split;

### 2. Load the data

| variable     | type       | description                   |
|--------------|------------|-------------------------------|
| mpg          | continuous | miles per gallon              |
| cylinders    | discrete   | number of cylinders of engine |
| displacement | continuous | engine volume                 |
| horsepower   | continuous | engine power in horsepower    |
| weight       | continuous | weight in pounds (lbs)        |
| acceleration | continuous | 0-60 mph time in seconds      |
| model_year   | discrete   | model year (last two digits)  |
| origin       | discrete   | 1. US, 2. EU, 3. ASIA         |
| name         | string     | unique name of the automobile |

In [51]:
# Location of the UCI Auto MPG data file (whitespace-separated values).
DATA_PATH = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column names are passed explicitly to read_csv, so the first line of the
# file is treated as data rather than as a header.
COL_NAMES = ['mpg',
             'cylinders',
             'displacement',
             'horsepower',
             'weight',
             'acceleration',
             'model_year',
             'origin',
             'name']

# Fixed seed so that Restart & Run All always produces the same train/test partition.
RANDOM_STATE = 42

# sep=r'\s+' is the supported replacement for the deprecated delim_whitespace=True;
# entries written as '?' are parsed as NaN.
auto_data = pd.read_csv(DATA_PATH, sep=r'\s+', names=COL_NAMES, na_values='?')

# Default split proportions are kept; only the shuffling is made reproducible.
train_auto_data, test_auto_data = train_test_split(auto_data, random_state=RANDOM_STATE)

### 3. Data Preparation / Feature Engineering

This section shows how to transform the data before modelling: some algorithms work much better with standardized data, and some do not work at all when values are missing.

Some algorithms may give a lot more weight to features which are not standardized or scaled.


#### Simple example of standardization with sklearn

In [47]:
# Standardizing two synthetic features with sklearn's StandardScaler
skscaler = StandardScaler()

# Scale unit-normal draws by a standard deviation and shift them by a mean:
# x1 is centred on 43 (variance 10), x2 on -10 (variance 13)
x1 = 43 + np.sqrt(10) * np.random.randn(1000)
x2 = -10 + np.sqrt(13) * np.random.randn(1000)
d = pd.DataFrame({'x1': x1, 'x2': x2})  # example dataframe reused by the following cells

# Two-step form: fit learns the per-column statistics (and returns the scaler),
# transform then applies them
d_standardized = skscaler.fit(d).transform(d)
# inspect d_standardized.mean(axis=0) / d_standardized.std(axis=0) to check the result

# One-step form: fit_transform does both and its result is displayed as the cell output
skscaler.fit_transform(d)

array([[ 1.01577506,  0.92460222],
       [ 0.71805744, -1.40551842],
       [-0.7302584 , -0.38024542],
       ...,
       [-1.98271816,  1.45922598],
       [ 0.02933269,  1.61895566],
       [ 0.25205936, -1.18726757]])

#### Another example with polynomial features

In [66]:
# Expand the columns of `d` into polynomial terms (PolynomialFeatures defaults to degree 2)
poly = PolynomialFeatures(include_bias=False)

poly.fit_transform(d);
# for input columns [a, b] the resulting feature matrix is [a, b, a^2, ab, b^2];
# the constant column of 1s is left out because include_bias=False

#### Examples of custom transformers

In [64]:
# Wrap plain functions as a transformer: `func` (np.exp) is applied by transform,
# `inverse_func` (np.log) by inverse_transform
function_transformer = FunctionTransformer(func=np.exp, inverse_func=np.log)
transformed = function_transformer.fit_transform(d)
function_transformer.inverse_transform(transformed); # log(exp(x)) recovers the original values (up to floating-point rounding)

In [None]:
def checkNA(method):
    """Decorate an estimator method so that it refuses input with missing values.

    Parameters
    ----------
    method : callable
        A method whose first two parameters are ``self`` and ``X``
        (for example ``fit(self, X, y=None)`` or ``transform(self, X)``).

    Returns
    -------
    callable
        A wrapper with the same name and docstring as ``method``. It checks
        ``X`` for missing values, then forwards every argument to ``method``
        and returns its result.

    Raises
    ------
    ValueError
        Raised by the wrapper when ``X`` contains at least one missing value.
    """
    import functools  # local import keeps the decorator self-contained

    @functools.wraps(method)
    def wrapper(self, X, *args, **kwargs):
        # pd.isna copes with arrays, DataFrames and object dtypes. Converting the
        # mask to an ndarray before .any() reduces it to a single bool, whereas
        # DataFrame.any() would give one value per column and break the `if`.
        if np.asarray(pd.isna(X)).any():
            raise ValueError('There are missing values in the data')
        # Forward y / extra arguments and hand back the method's result so that
        # fit(...) can still return the estimator for chaining.
        return method(self, X, *args, **kwargs)

    return wrapper

class CustomStandardScaler(TransformerMixin, BaseEstimator):
    """Hand-written standardizer: subtract the column mean, divide by the column std.

    Attributes set by ``fit``
    -------------------------
    means : ndarray
        Per-column mean of the training data.
    vars : ndarray
        Per-column (population) variance of the training data.
    scale : ndarray
        Per-column standard deviation, with zeros replaced by 1 so that
        constant columns are not divided by zero.
    """

    # NOTE: checkNA wraps this method, so the value callers receive from fit()
    # is whatever the wrapper returns.
    @checkNA
    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation of ``X``.

        Parameters
        ----------
        X : array-like
            Training data; validated and converted to a float array by
            ``check_array``.
        y : ignored
            Present only for compatibility with the estimator interface.

        Returns
        -------
        self
        """
        # check_array is the validation helper imported at the top of the notebook;
        # the estimator is passed so that error messages can name it.
        X = check_array(X, dtype=np.float64, estimator=self)
        self.means = np.mean(X, axis=0)
        self.vars = np.var(X, axis=0)
        std = np.sqrt(self.vars)
        # A constant column has std 0; dividing by 1 instead leaves it at 0
        # after centering rather than producing NaN/inf.
        self.scale = np.where(std == 0.0, 1.0, std)

        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Parameters
        ----------
        X : array-like
            Data with the same number of columns as the data passed to ``fit``.

        Returns
        -------
        ndarray
            ``(X - means) / scale``.
        """
        X = check_array(X, dtype=np.float64, estimator=self)
        shifted_X = X - self.means
        # Divide the *centered* data; dividing the raw X would skip the shift.
        scaled_and_shifted_X = shifted_X / self.scale
        return scaled_and_shifted_X

scaler = CustomStandardScaler()

In [68]:
class CustomTransformer():
    """Minimal template for a custom transformer.

    Replace the bodies of ``fit`` and ``transform`` with real logic; as written
    the transformer learns nothing and passes the data through unchanged.
    """

    def fit(self, X, y=None):
        """Learn whatever state is needed from ``X`` (nothing in this template).

        Returns the instance itself so that calls can be chained, e.g.
        ``CustomTransformer().fit(X).transform(X)``.
        """
        return self

    def transform(self, X):
        """Return the transformed data (the input itself in this template)."""
        return X

### 4. Pipeline example

In [50]:
# Chain the preprocessing steps and the regressor into a single estimator:
# polynomial expansion -> standardization -> linear regression
pipe = Pipeline(steps=[
    ('polynomization', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# Fitting the pipeline fits every step in order on the two selected features
pipe.fit(auto_data.loc[:, ['displacement', 'weight']], auto_data.loc[:, 'mpg'])

# Predictions are made on the same rows here; pass held-out X data instead
# to evaluate on a test set
predicted = pipe.predict(auto_data.loc[:, ['displacement', 'weight']])

### One Hot Encoder Example

In [41]:
# One-hot encode the `origin` column; sparse=False requests a dense array, and
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 — confirm against the installed version.
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
one_hot_encoder.fit_transform(auto_data[['origin']]);

### Feature union use case example

![alt text](./feature-union.png)

In [40]:
# FeatureUnion runs its branches on the same input and concatenates their
# outputs column-wise: one branch for numerical features, one for categorical.
baby_pipe = FeatureUnion([
    ('numerical',
    Pipeline([
        # keep only the numerical columns, then expand and standardize them
        ('select_numerical', FunctionTransformer(func = lambda X: X.loc[:, ['displacement', 'weight']])),
       ('poly', PolynomialFeatures(include_bias=False)),
       ('scaler', StandardScaler())
    ])),
    ('categorical', 
    Pipeline([
        # keep only the discrete columns and one-hot encode them
        # NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
        # and removed in 1.4 — confirm against the installed version.
        ('pass_categorical', FunctionTransformer(func = lambda X: X.loc[:, ['cylinders', 'origin', 'model_year']])),
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ]))
])

# To inspect the combined feature matrix on its own: baby_pipe.fit_transform(auto_data)

# The feature-building union is itself just a step of the final pipeline.
super_pipe = Pipeline([
    ('baby_pipe', baby_pipe),
    ('model', LinearRegression()) # here we can just change the model we are using with another one
])
# The whole frame is passed as X; the column selectors above pick the features,
# so the `mpg` target column is not used as an input.
super_pipe.fit(auto_data, auto_data['mpg'])
super_pipe.predict(auto_data); # or pass some test data and compare the differences