In [67]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.svm import SVR

In [3]:
# When the kernel starts inside a folder called `notebooks`, move up to its
# parent directory; then show the directory that is now in effect.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.path.dirname(os.getcwd()))
print(os.getcwd())

/home/fitzaudoen/apps/gas-sensor-analytics-demo


In [4]:
# Apply seaborn's default theme with the 'talk' plotting context in one call.
sns.set(context='talk')

In [5]:
# Load the training frame from a pickle file; the path is relative to the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('data/processed/train_data.pkl')

## First transformer to scale numerical and encode categorical features

In [29]:
# NOTE(review): no earlier cell in this notebook assigns `t1` (it is first
# defined further down), so this cell only works with leftover kernel state.
# The same statement is repeated later, after `t1` exists.
transformer1 = ColumnTransformer(t1)

In [30]:
# NOTE(review): `X` is only assembled in a later cell, so this call depends on
# leftover kernel state. The bare expression also echoes the whole transformed
# array as cell output.
transformer1.fit_transform(X)

array([[ 0.        ,  0.        ,  0.        , ...,  1.69505184,
         1.7122244 ,  1.46249822],
       [ 1.        ,  0.        ,  0.        , ...,  2.1725067 ,
         2.15070234,  1.63184529],
       [ 0.        ,  0.        ,  0.        , ...,  1.44750144,
         1.44081016,  1.56782886],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.28644779,
        -0.22503796, -0.55015533],
       [ 0.        ,  0.        ,  0.        , ..., -0.98956103,
        -0.98120487, -0.37536644],
       [ 0.        ,  0.        ,  0.        , ...,  0.2268256 ,
         0.23004934,  0.10459919]])

In [48]:
# Slice the frame by column position:
#   column 0 -> batch id, column 1 -> categorical column,
#   column 2 -> y, columns 3.. -> numeric columns.
data = df.values
cat_cols, num_cols = df.columns.values[1:2], df.columns.values[3:]
batch_id = data[:, 0]  # unused
y = data[:, 2]
# Numeric block first, with the categorical column appended as the last column.
X = np.hstack((data[:, 3:], data[:, 1:2]))

In [None]:
# Positional column selectors for the array X: every column except the last
# one is treated as numeric, the last one as categorical.
num_idx = [*range(X.shape[1] - 1)]
cat_idx = [X.shape[1] - 1]
t1 = [
    ('cat', OneHotEncoder(), cat_idx),
    ('num', PowerTransformer(), num_idx),
]

In [63]:
# Select columns by name so the transformer can be fitted on `df` directly.
# Positional indices computed from the re-ordered array `X` do not line up with
# the column order of `df` (which still holds the batch id and target columns).
# With this spec the power-transformed numeric columns come first in the output,
# followed by the one-hot encoded categorical columns, and every column of `df`
# not listed in `num_cols` / `cat_cols` is dropped.
t1 = [('num', PowerTransformer(), num_cols), ('cat', OneHotEncoder(), cat_cols)]

In [64]:
# Build the scale/encode stage from the transformer specs held in `t1`.
transformer1 = ColumnTransformer(t1)

In [72]:
# Fit the scale/encode stage on the full frame and keep the transformed matrix.
# NOTE(review): this rebinds `X`, replacing the array assembled in an earlier
# cell, so re-running earlier cells afterwards changes what `X` refers to.
X = transformer1.fit_transform(df)

## Second transformer to filter out high z-scores from the numerical features

In [82]:
class filterUnusualX(BaseEstimator, TransformerMixin):
    """Replace entries whose absolute value exceeds a threshold with NaN.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    compares every entry of ``X`` with ``z_score_max`` directly. The values
    are therefore presumably expected to be standardised already (TODO confirm
    against the upstream scaling step).

    Parameters
    ----------
    z_score_max : float, default=4
        Entries with ``abs(value) > z_score_max`` are replaced by ``np.nan``.
    """

    def __init__(self, z_score_max=4):
        self.z_score_max = z_score_max

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with out-of-range entries set to NaN.

        Parameters
        ----------
        X : array-like
            Numeric data. Nested lists are accepted as well as arrays.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``X``; entries whose absolute value
            is greater than ``z_score_max`` are ``np.nan``, all others are
            unchanged.
        """
        # Coerce first so that plain Python sequences work too; the built-in
        # abs() raises TypeError on a list.
        values = np.asarray(X)
        return np.where(np.abs(values) > self.z_score_max, np.nan, values)

In [74]:
# Instantiate the filter with a threshold of 4 (passed as z_score_max).
filterx = filterUnusualX(4)

In [77]:
# Try the filter on the first ten columns of X and display the result.
filterx.transform(X[:,0:10])

array([[-0.3932574 , -0.77980096, -1.54154624, ...,  1.11764159,
         0.55648485, -0.96564541],
       [ 1.48338099, -0.85586536, -1.85707906, ...,  1.59796051,
         1.38068973, -1.0888833 ],
       [-0.16319125, -0.32098801, -0.19034421, ..., -0.10727523,
        -0.241836  , -0.48959873],
       ...,
       [-0.27627355,  0.48636987,  0.12639824, ..., -1.16220009,
        -1.01538306,  0.81114535],
       [-1.67003975, -0.52678051, -0.77806699, ...,  0.59975614,
         1.24925754, -0.72598839],
       [ 0.17435949, -0.47050422,  0.39029973, ...,  0.38290253,
         0.24328705,  0.05378095]])

In [87]:
# Apply the outlier filter to the leading len(num_cols) columns and pass the
# remaining columns through untouched. The count is derived from `num_cols`
# instead of being hard-coded, so it follows the data.
# NOTE(review): assumes the numeric columns are the leading columns of
# transformer1's output -- verify against the active definition of `t1`.
transformer2 = ColumnTransformer(
    [('filter', filterUnusualX(), list(range(len(num_cols))))],
    remainder='passthrough',
)

## Third transformer to impute the filtered-out high z-score values

In [89]:
# Impute missing values in the leading len(num_cols) columns with KNNImputer
# and pass the remaining columns through untouched. The count is derived from
# `num_cols` instead of being hard-coded, so it follows the data.
# NOTE(review): assumes the numeric columns are the leading columns of the
# incoming matrix -- verify against the active definition of `t1`.
transformer3 = ColumnTransformer(
    [('impute', KNNImputer(), list(range(len(num_cols))))],
    remainder='passthrough',
)

## Pipelines

In [96]:
# Pipeline stores the step objects it is given without copying them. Handing
# the same transformer instances to both pipelines would make them share fitted
# state, so fitting one would silently refit the preprocessing used by the
# other. Each pipeline therefore receives its own unfitted clones.
pl = Pipeline(steps=[
    ('scale_encode', clone(transformer1)),
    ('filter', clone(transformer2)),
    ('impute', clone(transformer3)),
    ('model', SVR(C=2000))
])

# Same stages as `pl`, except that the imputer is applied to every column
# rather than only to the column range configured in `transformer3`.
pl2 = Pipeline(steps=[
    ('scale_encode', clone(transformer1)),
    ('filter', clone(transformer2)),
    ('impute', KNNImputer()),
    ('model', SVR(C=2000))
])

In [93]:
# Fit the first pipeline (preprocessing stages + SVR) on the training frame.
pl.fit(df, y)

Pipeline(steps=[('scale_encode',
                 ColumnTransformer(transformers=[('num', PowerTransformer(),
                                                  array(['sample_no', 's1_f1', 's1_f2', 's1_f3', 's1_f4', 's1_f5', 's1_f6',
       's1_f7', 's1_f8', 's1_f9', 's1_f10', 's1_f11', 's1_f12', 's1_f13',
       's1_f14', 's1_f15', 's1_f16', 's2_f1', 's2_f2', 's2_f3', 's2_f4',
       's2_f5', 's2_f6', 's2_f7', 's2_f8', 's2_f9', 's2_f10', 's2_f11',
       's2_f12', 's2_f13...
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('filter', filterUnusualX(),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29, ...])])),
                ('impute',
                 ColumnTransfo

In [94]:
# NOTE(review): this scores on the same df / y that were used for fitting, so
# the value is a training score, not an estimate of out-of-sample performance.
pl.score(df, y)

0.9877625294412958

In [97]:
# Fit the second pipeline (imputer applied to all columns) on the same data.
pl2.fit(df, y)

Pipeline(steps=[('scale_encode',
                 ColumnTransformer(transformers=[('num', PowerTransformer(),
                                                  array(['sample_no', 's1_f1', 's1_f2', 's1_f3', 's1_f4', 's1_f5', 's1_f6',
       's1_f7', 's1_f8', 's1_f9', 's1_f10', 's1_f11', 's1_f12', 's1_f13',
       's1_f14', 's1_f15', 's1_f16', 's2_f1', 's2_f2', 's2_f3', 's2_f4',
       's2_f5', 's2_f6', 's2_f7', 's2_f8', 's2_f9', 's2_f10', 's2_f11',
       's2_f12', 's2_f13...
       's8_f5', 's8_f6', 's8_f7', 's8_f8', 's8_f9', 's8_f10', 's8_f11',
       's8_f12', 's8_f13', 's8_f14', 's8_f15', 's8_f16'], dtype=object)),
                                                 ('cat', OneHotEncoder(),
                                                  array(['Gas_ID'], dtype=object))])),
                ('filter',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('filter', filterUnusualX(),
                                               

In [98]:
# NOTE(review): scored on the data the pipeline was fitted on (training score).
pl2.score(df, y)

0.9877626884518934