# Embedding `pipesnake` in `scikit-learn`

You can use `pipesnake` `Transformer`s and `Pipe`s as steps inside `scikit-learn` `Pipeline`s.

In [1]:
# When working from a clone of the repository, add the parent directory
# to the import path before anything else is imported.
import sys
sys.path.append("../")

import logging
import pandas

# Lower the root logger's threshold to DEBUG.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Load some data

More datasets are available here: https://archive.ics.uci.edu/ml/datasets.html

_Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science._

In [2]:
# Read the wine data straight from the UCI archive. The file is parsed with
# header=None, so pandas labels the columns with integers (0..13 in the preview).
WINE_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df = pandas.read_csv(WINE_DATA_URL, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Columns 1..13 become the feature frame; column 0 is kept as a one-column
# target frame.
feature_columns = list(range(1, 14))
target_columns = [0]
x = df[feature_columns]
y = df[target_columns]

In [4]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1


# Create a preprocessing pipeline using `pipesnake`

In [6]:
from pipesnake.pipe import SeriesPipe
from pipesnake.pipe import ParallelPipe

from pipesnake.transformers.combiner import Combiner
from pipesnake.transformers.misc import ColumnRenamer
from pipesnake.transformers.misc import Copycat
from pipesnake.transformers.misc import ToNumpy
from pipesnake.transformers.scaler import MadScaler
from pipesnake.transformers.scaler import UnitLenghtScaler

In [7]:
def mult(a, b):
    """Return the product of ``a`` and ``b``.

    The function simply applies the ``*`` operator, so it works for any pair
    of operands that supports it. It is handed to ``Combiner`` as ``func``.
    """
    product = a * b
    return product

# Column pairs that the Combiner merges with `mult`.
combined_column_pairs = [['x_00', 'x_01'], ['x_05', 'x_09']]

# Sequential steps, built in the order they run.
preprocessing_steps = [
    ColumnRenamer(),                 # normalize column names
    MadScaler(x_cols='all'),         # scale by feature (cols)
    UnitLenghtScaler(x_cols='all'),  # scale by feature vector (rows)
]

# Parallel branch: keep the existing features and add the combined columns.
preprocessing_steps.append(ParallelPipe(transformers=[
    Copycat(on_x=True, on_y=False),                         # replicate existing features
    Combiner(x_cols=combined_column_pairs, func=mult),      # combine a few columns
]))

# Final step: return x and y as numpy matrices.
preprocessing_steps.append(ToNumpy())

# NOTE(review): sklearn_output=True presumably makes the pipe return what a
# scikit-learn Pipeline step is expected to return - confirm in the pipesnake docs.
my_pipe = SeriesPipe(transformers=preprocessing_steps, sklearn_output=True)

# Standalone usage, left disabled:
#x_new, y_new = my_pipe.fit_transform(x, y)

In [8]:
# Show the pipe's parameters, including the nested transformers.
my_pipe.get_params()

{'name': 'series_pipe_1c06',
 'sklearn_output': True,
 'transformers': [ColumnRenamer(name='column_renamer_c18', postfix='', prefix='',
         sklearn_output=False),
  MadScaler(k=3.0, name='mad_scaler_31e6', skipna=True, sklearn_output=False,
       x_cols='all', y_cols=[]),
  UnitLenghtScaler(invertible=True, name='unit_lenght_scaler_7296',
           sklearn_output=False, x_cols='all', y_cols=[]),
  ParallelPipe(name='parallel_pipe_93a', reset_index=True, sklearn_output=False,
         transformers=[Copycat(name='copycat_630', on_x=True, on_y=False, sklearn_output=False), Combiner(as_dataframe=False, func=<function mult at 0x000002BA5FB81F28>,
       inv_func=None, name='combiner_9a88', reset_index=True,
       sklearn_output=False, x_cols=[['x_00', 'x_01'], ['x_05', 'x_09']],
       y_cols=[])]),
  ToNumpy(as_type=<class 'float'>, name='to_numpy_7d71', sklearn_output=False)]}

# Create the model pipeline using `scikit-learn` (http://scikit-learn.org)

In [9]:
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [10]:
# Hold out 25% of the rows for evaluation. The one-column target frame is
# flattened to a 1-D array of labels.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` works on old and new versions.
# NOTE: no `random_state` is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y.values.ravel(), test_size=0.25)

In [11]:
# Chain the pipesnake preprocessing pipe with PCA and an SVC classifier.
# The step names are the prefixes used by the keys of the parameter grid.
pipeline_steps = [
    ('pipesnake', my_pipe),
    ('pca', PCA()),
    ('svc', SVC()),
]
sk_pipe = Pipeline(steps=pipeline_steps)

# Parameter grid for the search; each key has the form
# `<pipeline step name>__<parameter name>`.
params = dict(
    pca__n_components=[2, 3],
    svc__kernel=['linear', 'rbf'],
)

In [12]:
# Search the parameter grid over the whole pipeline. The pipesnake pipe is
# part of the estimator, so it is fitted again for each candidate (see the log).
model = GridSearchCV(estimator=sk_pipe, param_grid=params)
# Left as the last expression so the fitted search object is displayed.
model.fit(X_train, y=y_train)

DEBUG:root:[series_pipe_1c06] : fitting...
DEBUG:root:Function: timed before Memory: 103.23 MB
INFO:root:[series_pipe_1c06] : fitting x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> mad_scaler_31e6
DEBUG:root:[mad_scaler_31e6] : x_cols: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
DEBUG:root:[unit_lenght_scaler_7296] : x_cols: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 103.71 MB
INFO:root:[parallel_pipe_93a] : fitting x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88


INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:[combiner_9a88] : applying mult to ['x_00', 'x_01'] columns
DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.02 sec
DEBUG:root:Function: timed after Memory: 108.59 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d71
DEBUG:root:Function: fit_x: 0.06 sec
DEBUG:root:Function: timed after Memory: 108.61 MB
DEBUG:root:[series_pipe_1c06] : transforming...
DEBUG:root:Function: timed before Memory: 108.61 MB
INFO:root:[series_pipe_1c06] : transforming x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> mad_scaler_31e6
INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
INFO:root:[se

INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 108.73 MB
INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:[combiner_9a88] : applying mult to ['x_00', 'x_01'] columns
DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.02 sec
DEBUG:root:Function: timed after Memory: 108.73 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d71
DEBUG:root:Function: transform_x: 0.04 sec
DEBUG:root:Function: timed after Memory: 108.73 MB
DEBUG:root:[series_pipe_1c06] : fitting...
DEBUG:root:Function: timed before Memory: 108.73 MB
INFO:root:[series_pipe_1c06] : fitting x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_

DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.02 sec
DEBUG:root:Function: timed after Memory: 108.84 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d71
DEBUG:root:Function: transform_x: 0.05 sec
DEBUG:root:Function: timed after Memory: 108.84 MB
DEBUG:root:[series_pipe_1c06] : transforming...
DEBUG:root:Function: timed before Memory: 108.84 MB
INFO:root:[series_pipe_1c06] : transforming x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> mad_scaler_31e6
INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 108.84 MB
INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pip

INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 108.85 MB
INFO:root:[parallel_pipe_93a] : fitting x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:Function: fit_x: 0.00 sec
DEBUG:root:Function: timed after Memory: 108.85 MB
DEBUG:root:Function: timed before Memory: 108.85 MB
INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:[combiner_9a88] : applying mult to ['x_00', 'x_01'] columns
DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.02 sec
DEBUG:root:Function: timed after Memory: 108.85 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d71
DEBUG:root:Function: fit_x: 0.05 sec
DEBUG:root:Function: timed after Memory: 108.85 MB
DEBUG:root:[series_pipe_1c06] : transforming...
DEBUG:root:Function: timed before Memory: 108.85 

DEBUG:root:[series_pipe_1c06] : transforming...
DEBUG:root:Function: timed before Memory: 108.86 MB
INFO:root:[series_pipe_1c06] : transforming x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> mad_scaler_31e6
INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 108.86 MB
INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:[combiner_9a88] : applying mult to ['x_00', 'x_01'] columns
DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.03 sec
DEBUG:root:Function: timed after Memory: 108.86 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pipesnake', SeriesPipe(name='series_pipe_1c06', sklearn_output=True,
      transformers=[ColumnRenamer(name='column_renamer_c18', postfix='', prefix='',
       sklearn_output=False), MadScaler(k=3.0, name='mad_scaler_31e6', skipna=True, sklearn_output=False,
     x_cols='all', y_cols=[]), U...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3], 'svc__kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# Predict on the held-out rows and print the per-class metrics.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

DEBUG:root:[series_pipe_1c06] : transforming...
DEBUG:root:Function: timed before Memory: 109.02 MB
INFO:root:[series_pipe_1c06] : transforming x...
INFO:root:[series_pipe_1c06] : -> column_renamer_c18
DEBUG:root:[column_renamer_c18] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12']
INFO:root:[series_pipe_1c06] : -> mad_scaler_31e6
INFO:root:[series_pipe_1c06] : -> unit_lenght_scaler_7296
INFO:root:[series_pipe_1c06] : -> parallel_pipe_93a
DEBUG:root:Function: timed before Memory: 109.02 MB
INFO:root:[parallel_pipe_93a] : transforming x...
INFO:root:[parallel_pipe_93a] : -> copycat_630
INFO:root:[parallel_pipe_93a] : -> combiner_9a88
DEBUG:root:[combiner_9a88] : applying mult to ['x_00', 'x_01'] columns
DEBUG:root:[combiner_9a88] : applying mult to ['x_05', 'x_09'] columns
DEBUG:root:Function: transform_x: 0.02 sec
DEBUG:root:Function: timed after Memory: 109.02 MB
INFO:root:[series_pipe_1c06] : -> to_numpy_7d

             precision    recall  f1-score   support

          1       1.00      0.94      0.97        16
          2       0.87      1.00      0.93        13
          3       1.00      0.94      0.97        16

avg / total       0.96      0.96      0.96        45

