# Machine Learning Preprocessing Workshop

### 1. Imports

In [19]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor

from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut, train_test_split

from sklearn.metrics import mean_squared_error

### 2. Load the data

| variable     | type       | description                                      |
|--------------|------------|--------------------------------------------------|
| mpg          | continuous | fuel efficiency in miles per gallon (the target) |
| cylinders    | discrete   | number of cylinders of engine                    |
| displacement | continuous | engine volume (displacement, in cubic inches)    |
| horsepower   | continuous | engine power (hp); missing values appear as `?`  |
| weight       | continuous | vehicle weight in pounds (lbs)                   |
| acceleration | continuous | time to accelerate from 0 to 60 mph, in seconds  |
| model_year   | discrete   | model year, two digits (e.g. 70 = 1970)          |
| origin       | discrete   | 1. US, 2. EU, 3. ASIA                            |
| name         | string     | unique name of the automobile                    |

In [3]:
# UCI Auto MPG data: whitespace-delimited, no header row, '?' marks a missing value.
DATA_PATH = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
COL_NAMES = ['mpg',
             'cylinders',
             'displacement',
             'horsepower',
             'weight',
             'acceleration',
             'model_year',
             'origin',
             'name']
RANDOM_STATE = 42  # fixed seed so the train/test split is the same on every run

# sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True
auto_data_raw = pd.read_csv(DATA_PATH, sep=r'\s+', names=COL_NAMES, na_values='?')

# Drop incomplete rows BEFORE splitting so that auto_data, train_auto_data and
# test_auto_data are all free of missing values (previously only auto_data was).
auto_data = auto_data_raw.dropna()
train_auto_data, test_auto_data = train_test_split(auto_data, random_state=RANDOM_STATE)

### 3. Data Preparation / Feature Engineering

Many algorithms need the data to be transformed before training: some work a lot better with standardized data, and some don't work at all with missing data.

When features are left on very different scales, some algorithms may give a lot more weight to the features with the largest numeric ranges.


#### Simple example of standardization with sklearn

In [4]:
# Basic feature engineering with sklearn example
rng = np.random.RandomState(0)  # seeded generator so the example data is reproducible
skscaler = StandardScaler()
x1 = rng.randn(1000) * np.sqrt(10) + 43  # mean 43, variance 10
x2 = rng.randn(1000) * np.sqrt(13) - 10  # mean -10, variance 13
d = pd.DataFrame({'x1': x1, 'x2': x2}) # create an example dataframe

# two-step form: learn the column means/scales, then apply them
skscaler.fit(d)
d_standardized = skscaler.transform(d)
# sanity check: d_standardized.mean(axis=0) is ~0 and d_standardized.std(axis=0) is ~1

# another (shorter) way is with fit_transform
skscaler.fit_transform(d)

array([[ 0.21413855, -0.24581202],
       [ 2.53598759, -0.40280533],
       [-0.20478111, -1.66507591],
       ...,
       [ 0.0131445 ,  0.84287472],
       [ 0.12512599, -0.62196627],
       [-1.23560116,  1.67804174]])

#### Another example with polynomial features

In [5]:
# Another example with poly transform
# include_bias=False leaves out the constant column of ones
poly = PolynomialFeatures(include_bias=False)

poly.fit_transform(d);
# for input columns [a, b] the result feature matrix is [a, b, a^2, ab, b^2]
# (a leading 1 column would only appear with include_bias=True)

#### Examples of custom transformers

In [6]:
# use custom function: FunctionTransformer wraps plain callables as a transformer,
# here np.exp for the forward direction and np.log as its inverse
function_transformer = FunctionTransformer(func=np.exp, inverse_func=np.log)
transformed = function_transformer.fit_transform(d)
function_transformer.inverse_transform(transformed); # this will get it back to the original (log undoes exp, up to floating-point error)

In [7]:
def checkNA(method):
    """Decorate an estimator method so it refuses input with missing values.

    The wrapped method must take the data as its first argument after
    ``self``; any further positional or keyword arguments (e.g. ``y``) are
    passed through untouched.

    Parameters
    ----------
    method : callable
        Method with signature ``method(self, X, ...)``.

    Returns
    -------
    callable
        Wrapper that validates ``X`` and then returns whatever ``method``
        returns.

    Raises
    ------
    ValueError
        If ``X`` contains at least one missing value.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(method)
    def wrapper(self, X, *args, **kwargs):
        # pd.isna copes with DataFrames, ndarrays and nested lists alike;
        # np.asarray collapses the mask so .any() yields one boolean
        # (DataFrame.any() alone would give a per-column Series).
        if np.asarray(pd.isna(X)).any():
            raise ValueError('There are missing values in the data')
        # hand the result back, otherwise a decorated fit() would return None
        return method(self, X, *args, **kwargs)
    return wrapper

class CustomStandardScaler(TransformerMixin, BaseEstimator):
    """Hand-written standardization: subtract the column mean, divide by the column std.

    Attributes set by ``fit``
    -------------------------
    means : per-column mean of the training data
    vars  : per-column variance of the training data
    scale : per-column standard deviation, with 0 replaced by 1
    """

    @checkNA
    def fit(self, X, y=None):
        """Learn the per-column mean and spread of ``X``.

        Parameters
        ----------
        X : array-like, 2-D
            Training data.
        y : ignored
            Accepted only for API compatibility with pipelines.

        Returns
        -------
        self
        """
        X = check_array(X, estimator=self)
        self.means = np.mean(X, axis=0)
        self.vars = np.var(X, axis=0)
        scale = np.sqrt(self.vars)
        # A constant column has zero spread; dividing by 1 instead makes
        # transform() return 0 for it rather than NaN/inf.
        self.scale = np.where(scale == 0, 1.0, scale)

        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Returns
        -------
        ndarray
            ``(X - means) / scale``, column-wise.
        """
        X = check_array(X, estimator=self)
        shifted_X = X - self.means
        # scale the *shifted* data; dividing X itself would skip the centering
        scaled_and_shifted_X = shifted_X / self.scale
        return scaled_and_shifted_X

scaler = CustomStandardScaler()

In [68]:
class CustomTransformer():
    """Skeleton with the two methods a scikit-learn style transformer provides.

    Fill in ``fit`` with whatever needs to be learned from the data and
    ``transform`` with the actual transformation.
    """

    def fit(self, X, y=None):
        """Learn state from ``X``; this skeleton has nothing to learn.

        Returns ``self`` so that calls can be chained.
        """
        return self

    def transform(self, X):
        """Return the transformed data; this skeleton passes ``X`` through unchanged."""
        return X

### 4. Pipeline example

In [8]:
# A Pipeline chains the preprocessing steps and the final model into one estimator:
# fit() runs fit_transform on every step in order, predict() re-applies the
# fitted transforms before calling the model.
pipe_steps = [
    ('polynomization', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
]
pipe = Pipeline(pipe_steps)

pipe_features = ['displacement', 'weight']
pipe.fit(train_auto_data[pipe_features], train_auto_data['mpg'])

# any frame with the same feature columns can be scored; here it is the held-out split
predicted = pipe.predict(test_auto_data[pipe_features])

### One Hot Encoder Example

In [9]:
# One output column per category of `origin`.
# NOTE(review): scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`
# and 1.4 removed the old name — confirm the installed version accepts `sparse`.
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
one_hot_encoder.fit_transform(train_auto_data[['origin']]);

### Feature union use case example

![alt text](./feature-union.png)

In [10]:
NUMERICAL_COLUMNS = ['displacement', 'weight']
CATEGORICAL_COLUMNS = ['cylinders', 'origin', 'model_year']


def select_numerical(X):
    """Keep only the numerical feature columns of the frame."""
    return X.loc[:, NUMERICAL_COLUMNS]


def select_categorical(X):
    """Keep only the categorical feature columns of the frame."""
    return X.loc[:, CATEGORICAL_COLUMNS]


# numerical branch: select -> polynomial expansion -> standardize
numerical_branch = Pipeline([
    ('select_numerical', FunctionTransformer(func=select_numerical)),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
])

# categorical branch: select -> one-hot encode
categorical_branch = Pipeline([
    ('pass_categorical', FunctionTransformer(func=select_categorical)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# FeatureUnion runs both branches on the same input and concatenates their outputs
baby_pipe = FeatureUnion([
    ('numerical', numerical_branch),
    ('categorical', categorical_branch),
])

# to inspect the combined feature matrix on its own: baby_pipe.fit_transform(auto_data)

super_pipe = Pipeline([
    ('baby_pipe', baby_pipe),
    ('model', LinearRegression()),  # here we can just change the model we are using with another one
])
# The whole frame (target column included) is passed as X; the selectors above
# keep only the listed feature columns, so 'mpg' never reaches the model as input.
super_pipe.fit(train_auto_data, train_auto_data['mpg'])
super_pipe.predict(test_auto_data); # or pass some test data and compare the differences

### Leave one out cross validation example

In [46]:
# Leave-one-out cross validation written out by hand: every row is held out
# once and predicted by a model fitted on all the remaining rows.
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_ix, test_ix in loo.split(auto_data):
    X_train, X_test = auto_data.iloc[train_ix], auto_data.iloc[test_ix]
    # 1-D targets, so predict() returns a 1-D array holding a single value
    y_train, y_test = X_train['mpg'], X_test['mpg']
    # fit model (this refits the shared super_pipe in place on every iteration)
    super_pipe.fit(X_train, y_train)
    # evaluate model
    yhat = super_pipe.predict(X_test)
    # store
    y_true.append(y_test.iloc[0])
    y_pred.append(yhat[0])
# mean_squared_error is an error measure (lower is better), not an accuracy
loo_mse = mean_squared_error(y_true, y_pred)
accuracy_score = loo_mse  # old name kept so any cell that still reads it keeps working
print('LOOCV MSE: %.3f' % loo_mse)

# shortcut: cross_val_score(super_pipe, auto_data, auto_data['mpg'], cv=LeaveOneOut(),
# scoring='neg_mean_squared_error') runs the same loop; the next cell uses
# cross_val_score with KFold instead

Accuracy: 7.606


In [24]:
# 10-fold cross validation of a degree-2 polynomial regression of mpg on horsepower.
# No shuffle argument is passed to KFold here.
# NOTE(review): if the folds are contiguous blocks of rows and the file is ordered
# (e.g. by model year), that may explain the negative scores on the last folds in
# the saved output — consider KFold(n_splits=10, shuffle=True, random_state=...) and confirm.
kfold = KFold(n_splits=10)
model_kfold = LinearRegression()
poly = PolynomialFeatures(include_bias=False, degree=2)  # rebinds `poly` from the earlier example cell

# expand horsepower into [hp, hp^2] up front, before cross-validating
poly_hp = poly.fit_transform(auto_data[['horsepower']])

# poly_hp.shape
# One score per fold. No `scoring` is given, so the estimator's own score method
# is used — presumably R^2 for LinearRegression; verify against the scikit-learn docs.
results_kfold = cross_val_score(model_kfold, poly_hp, auto_data[['mpg']], cv=kfold);
results_kfold

array([ 0.52961134,  0.57544805,  0.33112653,  0.44495076,  0.43675032,
        0.74897269,  0.72410644,  0.59808258, -0.45954765, -0.0718335 ])