# Iris

An example of creating `pipesnake` pipes to preprocess data for use in a scikit-learn pipeline.

In [1]:
# If you cloned the repository: add the parent directory to the import path
# (presumably so that `pipesnake` can be imported straight from the checkout).
import logging
import sys

sys.path.append('../')

import pandas

# Let DEBUG-level records through the root logger so the pipe's log output is visible.
logging.getLogger().setLevel(logging.DEBUG)

# Load some data

More datasets are available here: https://archive.ics.uci.edu/ml/datasets.html

_Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science._

In [2]:
# Location of the raw Iris data. It is read with header=None, so pandas labels
# the columns with the integers 0..4.
IRIS_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

df = pandas.read_csv(IRIS_DATA_URL, header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


Split the data into the features (`x`) and the target (`y`)

In [3]:
# Columns 0-3 are the numeric inputs; column 4 is the class label.
# Both selections use a list of labels, so each result is a DataFrame.
x = df.loc[:, [0, 1, 2, 3]]
y = df.loc[:, [4]]

In [4]:
# Preview the first five feature rows.
x.head(n=5)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Preview the first five target rows.
y.head(n=5)

Unnamed: 0,4
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


# Create a preprocessing pipeline using `pipesnake`

In [6]:
from pipesnake.pipe import SeriesPipe
from pipesnake.transformers.converter import Category2Number
from pipesnake.transformers.dropper import DropDuplicates
from pipesnake.transformers.imputer import KnnImputer
from pipesnake.transformers.misc import ColumnRenamer
from pipesnake.transformers.misc import ToNumpy
from pipesnake.transformers.scaler import MadScaler
from pipesnake.transformers.scaler import UnitLenghtScaler

In [7]:
# Preprocessing pipe: the transformers are applied in the order listed here
# (the fit log below shows them running in this sequence).
my_pipe = SeriesPipe(transformers=[
    ColumnRenamer(),  # normalize column names (the fit log shows x columns becoming x_0..x_3)
    Category2Number(y_cols=['y_0']),  # convert the y labels from strings to numbers
    DropDuplicates(),  # drop duplicated rows and cols
    KnnImputer(x_cols='all'),  # impute missing values
    MadScaler(x_cols='all'),  # scale by feature (cols)
    UnitLenghtScaler(x_cols='all'),  # scale by feature vector (rows); class name spelled as imported
    ToNumpy(),  # return x and y as numpy arrays
])

In [8]:
# Fit the pipe on (x, y) and transform both in one call; the results are the
# numpy arrays displayed in the following cells.
x_new, y_new = my_pipe.fit_transform(x, y)

DEBUG:root:[series_pipe_fd9a] : fitting...
DEBUG:root:Function: timed before Memory: 99.87 MB
INFO:root:[series_pipe_fd9a] : fitting x...
INFO:root:[series_pipe_fd9a] : -> column_renamer_883c
DEBUG:root:[column_renamer_883c] : x new column names: ['x_0', 'x_1', 'x_2', 'x_3']
INFO:root:[series_pipe_fd9a] : -> category2_number_cc5b
DEBUG:root:[category2_number_cc5b] : x_cols: []
DEBUG:root:[category2_number_cc5b] : x category to number: {}
INFO:root:[series_pipe_fd9a] : -> drop_duplicates_704e
DEBUG:root:[drop_duplicates_704e] : x shape: (150, 4)
DEBUG:root:[drop_duplicates_704e] :   shape: (147, 4)
INFO:root:[series_pipe_fd9a] : -> knn_imputer_9990
DEBUG:root:[knn_imputer_9990] : x_cols: ['x_0', 'x_1', 'x_2', 'x_3']
INFO:root:[knn_imputer_9990] : current shape: (147, 4)
DEBUG:root:[category2_number_d656] : fitting...
DEBUG:root:[category2_number_d656] : x_cols: []
DEBUG:root:[category2_number_d656] : transforming...
DEBUG:root:[category2_number_d656] : x category to number: {}
DEBUG:roo

In [9]:
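# Disabled example (uncomment to run): inverse_transform is expected to map the
# transformed arrays back towards the original data -- TODO confirm.
# x_org, y_org = my_pipe.inverse_transform(x_new, y_new)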

In [10]:
# Display the transformed features.
# NOTE(review): this prints the whole array; a slice such as x_new[:5] would keep the output short.
x_new

array([[-0.32384134,  0.47289479, -0.61761121, -0.53856634],
       [-0.45298582,  0.        , -0.67192842, -0.58593177],
       [-0.51091461,  0.18990916, -0.64073286, -0.54070526],
       [-0.56553448,  0.09634698, -0.60818471, -0.54863409],
       [-0.34808777,  0.53371609, -0.58087099, -0.50652831],
       [-0.16473074,  0.75773518, -0.49480957, -0.39225586],
       [-0.53606947,  0.36530877, -0.59637681, -0.47277238],
       [-0.38448556,  0.39301608, -0.62022278, -0.55949345],
       [-0.61755936, -0.09018015, -0.58888649, -0.51351795],
       [-0.44286071,  0.10059697, -0.63501253, -0.62491097],
       [-0.17493212,  0.62584659, -0.5643743 , -0.50911339],
       [-0.46745061,  0.38225749, -0.58244299, -0.54417763],
       [-0.47699988,  0.        , -0.63679433, -0.60577561],
       [-0.61138925,  0.        , -0.59854959, -0.51763073],
       [ 0.        ,  0.74341073, -0.51781925, -0.42332459],
       [-0.03046523,  0.87195239, -0.39315348, -0.2901745 ],
       [-0.15866526,  0.

In [11]:
# Display the encoded target (a column vector of class codes).
# NOTE(review): this prints the whole array; a slice such as y_new[:5] would keep the output short.
y_new

array([[ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
      

# Create the model pipeline using scikit-learn (http://scikit-learn.org)

In [12]:
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [13]:
# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so that the split -- and therefore the metrics
# reported further down -- no longer changes on every "Restart & Run All".
# y_new is a column vector; ravel() flattens it into 1-D labels.
X_train, X_test, y_train, y_test = train_test_split(
    x_new, y_new.ravel(), test_size=0.25, random_state=42,
)

In [14]:
# Model pipeline: a PCA step followed by an SVC classifier.
sk_pipe = Pipeline(steps=[
    ('pca', PCA()),
    ('svc', SVC()),
])

# Search grid; each key is `<step name>__<parameter name>` for a step of sk_pipe.
params = dict(
    pca__n_components=[2, 3],
    svc__kernel=['linear', 'rbf'],
)

In [15]:
# Grid-search every combination in `params` using the default cross-validation
# settings; the fitted search object is the value displayed by this cell.
model = GridSearchCV(estimator=sk_pipe, param_grid=params)
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3], 'svc__kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
# Score the fitted search on the held-out rows and print per-class metrics.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

        0.0       1.00      0.94      0.97        16
        1.0       0.80      0.62      0.70        13
        2.0       0.58      0.88      0.70         8

avg / total       0.84      0.81      0.81        37

