In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import os
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [153]:
# When the kernel starts inside a 'notebooks' folder, step up to its parent
# directory so the relative paths used below resolve from there.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    os.chdir(os.path.dirname(cwd))
print(os.getcwd())

/home/fitzaudoen/apps/gas-sensor-analytics-demo


In [154]:
# Apply seaborn's default theme with the 'talk' plotting context in one call.
sns.set(context='talk')

This notebook walks through the steps used to design the preprocessing pipelines implemented in `src.features.build_features`. Two preprocessing pipelines are needed:
<ol>
    <li>for classification, using only the sensor values and sample no to predict the gas id</li>
    <li>for regression using the sensor values and the gas id to predict the concentration </li>
</ol>

## Set up data

In [155]:
# Load the pickled training DataFrame (path is relative to the current working directory).
train_data_path = 'data/processed/train_data.pkl'
df = pd.read_pickle(train_data_path)

In [156]:
# Pull the raw values out of the frame and split them by column position:
# column 0 -> batch id, column 1 -> categorical feature, column 2 -> target,
# columns 3+ -> numeric features.
data = df.values
batch_id = data[:, 0]  # unused
y = data[:, 2]

column_names = df.columns.values
cat_cols = column_names[1:2]
num_cols = column_names[3:]

# Feature matrix: numeric columns first, categorical column appended last.
X = np.hstack((data[:, 3:], data[:, 1:2]))

## First transformer to scale numerical and encode categorical

In [157]:
# The categorical column is the last column of X; everything before it is numeric.
n_features = X.shape[1]
num_idx = list(range(n_features - 1))
cat_idx = [n_features - 1]

# (name, transformer, column indices) triples: one-hot encode the categorical
# column and power-transform the numeric ones.
t1 = [
    ('cat', OneHotEncoder(), cat_idx),
    ('num', PowerTransformer(), num_idx),
]

In [158]:
# Wrap the encode/scale steps in a single column-wise transformer.
transformer1 = ColumnTransformer(transformers=t1)

In [159]:
# Fit the encode/scale transformer on X and keep the transformed matrix for the steps below.
X_trans = transformer1.fit_transform(X)

## Second transformer to filter out high z-scores from the numerical features

In [164]:
# Custom operator for filtering scaled data by Z-value (same as just filtering the value)
class filterUnusualX(BaseEstimator, TransformerMixin):
    """Replace entries whose absolute value exceeds a threshold with NaN.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only compares each value against ``z_score_max``. When the input has
    already been standardised, the threshold acts as a z-score cut-off.

    Parameters
    ----------
    z_score_max : float, default=4
        Entries with ``abs(value) > z_score_max`` are replaced by ``np.nan``.
    """

    def __init__(self, z_score_max=4):
        self.z_score_max = z_score_max

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for pipeline compatibility)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with out-of-range entries set to NaN.

        Parameters
        ----------
        X : array-like
            Values to filter. Coerced with ``np.asarray`` so that lists and
            other array-likes are accepted as well as ndarrays.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Same shape as ``X``; entries with ``abs(value) > z_score_max``
            are ``np.nan``, all others are unchanged.
        """
        # Builtin abs() fails on plain lists, so coerce to an array first.
        X_arr = np.asarray(X)
        return np.where(np.abs(X_arr) > self.z_score_max, np.nan, X_arr)

In [165]:
# Stand-alone filter instance for a quick check on the transformed data.
filterx = filterUnusualX(z_score_max=4)

In [166]:
# Try the filter on the first ten columns of the transformed matrix.
filterx.transform(X_trans[:, :10])

array([[ 0.        ,  0.        ,  0.        , ..., -0.77980096,
        -1.54154624, -1.06828845],
       [ 1.        ,  0.        ,  0.        , ..., -0.85586536,
        -1.85707906, -1.24461614],
       [ 0.        ,  0.        ,  0.        , ..., -0.32098801,
        -0.19034421,  0.2637914 ],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.48636987,
         0.12639824,  0.83204111],
       [ 0.        ,  0.        ,  0.        , ..., -0.52678051,
        -0.77806699, -0.57991495],
       [ 0.        ,  0.        ,  0.        , ..., -0.47050422,
         0.39029973, -0.22833753]])

In [167]:
# transformer1 emits the one-hot columns first, followed by the scaled numeric
# columns, so the numeric block starts after however many one-hot columns the
# encoder produced. Derive both index ranges from the fitted output instead of
# hard-coding them.
n_onehot = X_trans.shape[1] - len(num_idx)
onehot_out_idx = list(range(n_onehot))
num_out_idx = list(range(n_onehot, X_trans.shape[1]))

# Pass the one-hot block through untouched and filter every numeric column.
# A fixed range(0, 129) would cover the one-hot columns and leave the last
# numeric columns unfiltered. Listing 'cat' first keeps the output column order
# (one-hot, then numeric) unchanged.
transformer2 = ColumnTransformer([
    ('cat', 'passthrough', onehot_out_idx),
    ('filter', filterUnusualX(), num_out_idx),
])

## Preprocess pipeline

In [171]:
# Build the preprocessing pipelines. The third step of each is a KNN imputer
# that imputes the outliers removed by the filter step.

# Classification pipeline: only has numeric columns
classification_steps = [
    ('scale', PowerTransformer()),
    ('filter', filterUnusualX()),
    ('impute', KNNImputer()),
]
pipeline_classifcation = Pipeline(classification_steps)

# Regression pipeline: uses the column transformers for categorical and numeric inputs
regression_steps = [
    ('scale_encode', transformer1),
    ('filter', transformer2),
    ('impute', KNNImputer()),
]
pipeline_regression = Pipeline(regression_steps)


In [169]:
# Check the dimensions of the feature matrix before fitting the pipeline.
X.shape

(8346, 130)

In [170]:
# X holds the numeric columns plus the categorical column, so it goes through
# the regression pipeline. The bare name `pipeline` is not defined anywhere in
# this notebook and fails on a fresh kernel.
pipeline_regression.fit_transform(X)

array([[ 0.        ,  0.        ,  0.        , ...,  1.69505184,
         1.7122244 ,  1.46249822],
       [ 1.        ,  0.        ,  0.        , ...,  2.1725067 ,
         2.15070234,  1.63184529],
       [ 0.        ,  0.        ,  0.        , ...,  1.44750144,
         1.44081016,  1.56782886],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.28644779,
        -0.22503796, -0.55015533],
       [ 0.        ,  0.        ,  0.        , ..., -0.98956103,
        -0.98120487, -0.37536644],
       [ 0.        ,  0.        ,  0.        , ...,  0.2268256 ,
         0.23004934,  0.10459919]])