### la pipeline
- permette di incapsulare la preparazione dati e il modello in un unico oggetto
- utile perché
  - il codice è più semplice
  - possiamo salvare e ricaricare un solo oggetto
  - evita leak tra train e test
  - si usa agevolmente in combinazione con la grid
- nel modulo sul web c'è l'esempio con la sentiment analysis; qui lo facciamo invece con una regressione

In [31]:
import pandas as pd

from sklearn.datasets import fetch_california_housing

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Load the California housing dataset as pandas objects:
# X holds the feature columns, y the regression target.
X, y = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Peek at five randomly chosen rows of the feature table.
X.sample(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14297,2.2804,42.0,3.82801,1.125307,1227.0,3.014742,32.72,-117.14
13805,2.825,27.0,5.339882,1.145383,1472.0,2.891945,34.91,-117.03
635,3.4643,36.0,4.970149,0.985075,366.0,2.731343,37.71,-122.16
3410,4.4375,24.0,5.335423,1.055381,2310.0,2.413793,34.26,-118.32
19,2.6033,52.0,5.465455,1.083636,690.0,2.509091,37.84,-122.27


In [4]:
# Hold out a test set (scikit-learn's default split is 75% train / 25% test).
# A fixed random_state makes the shuffle -- and therefore every prediction and
# score computed further down -- reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Ordered (name, estimator) pairs: two preprocessing transformers followed by
# the final regressor. The names are the keys used later to look up each step.
steps = [
    ('scaler', MinMaxScaler()),
    ('normalizer', Normalizer()),
    ('model', KNeighborsRegressor()),
]

In [6]:
# Wrap data preparation and model into a single estimator object.
pipeline = Pipeline(steps=steps)

In [7]:
# Show the pipeline's representation (not fitted yet at this point).
pipeline

Pipeline(steps=[('scaler', MinMaxScaler()), ('normalizer', Normalizer()),
                ('model', KNeighborsRegressor())])

In [8]:
# Fit all steps in order using the training data only: each transformer is
# fitted on and applied to X_train, then the regressor is fitted on the
# transformed features. The test set is never seen here.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('normalizer', Normalizer()),
                ('model', KNeighborsRegressor())])

In [9]:
# The steps as an ordered list of (name, estimator) tuples.
pipeline.steps

[('scaler', MinMaxScaler()),
 ('normalizer', Normalizer()),
 ('model', KNeighborsRegressor())]

In [10]:
# The same steps, accessible by name.
pipeline.named_steps

{'scaler': MinMaxScaler(),
 'normalizer': Normalizer(),
 'model': KNeighborsRegressor()}

In [32]:
# Display the pipeline again.
# NOTE(review): the execution count (In [32]) shows this cell was run after the
# cells below it, presumably to render the diagram view enabled by
# set_config(display='diagram') -- confirm and re-run the notebook top to bottom.
pipeline

In [11]:
# Index the pipeline by step name to reach a single estimator and inspect
# its hyperparameters.
pipeline['scaler'].get_params()

{'copy': True, 'feature_range': (0, 1)}

In [23]:
# Hyperparameters of the final regressor step.
pipeline['model'].get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
# Use the model step directly: this bypasses the scaler and the normalizer, so
# the regressor receives the raw, unscaled test features. Shown as a contrast
# with pipeline.predict -- note the constant predictions in the output.
pipeline['model'].predict(X_test)

array([2.4262, 2.4262, 2.4262, ..., 2.4262, 2.4262, 2.4262])

In [26]:
# Use data preparation and model together: pipeline.predict applies the fitted
# scaler and normalizer to X_test before calling the regressor.
pipeline.predict(X_test)

array([1.448   , 1.361   , 3.189   , ..., 1.7514  , 2.461   , 3.876602])

In [27]:
# Evaluate on the held-out test set through the full pipeline; for a regressor
# the score is the coefficient of determination (R^2).
pipeline.score(X_test, y_test)

0.6541329067673627