## Pipeline
可以使用  
```python
Pipeline([('dataprocess',dataprocessor()),('testmodel', learner())])
```

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


Transformer:
>需要繼承 
> - sklearn.base.TransformerMixin 
> - sklearn.base.BaseEstimator 

>需實作:  
> - fit(self, datax, datay=None, **params)
> - transform(self, datax, datay=None, **params)

Learner:
>需要繼承 
> - sklearn.base.BaseEstimator 

>需實作:  
> - fit(self, datax, datay=None, **params)
> - predict(self, datax)




In [92]:
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor

class dataprocessor(TransformerMixin,BaseEstimator):
    """Toy transformer that divides every sample by the sum seen during fit."""

    def __init__(self):
        # Sum of the training samples; overwritten by fit().
        self.total = 0.

    def fit(self, datax, datay=None, **params):
        """Store the sum of ``datax`` and return ``self``.

        ``datay`` and ``params`` are not used.
        """
        self.total = sum(datax)
        return self

    def transform(self, datax, datay=None, **params):
        """Return a list with each element of ``datax`` divided by the stored sum.

        ``datay`` and ``params`` are not used.
        """
        denominator = self.total
        return [sample / denominator for sample in datax]

class learner(BaseEstimator):
    """Toy classifier: predicts 1 for samples above the training mean, else 0."""

    def __init__(self):
        # Decision threshold; overwritten by fit().
        self.condiction = 0.

    def fit(self, datax, datay=None, **params):
        """Set the threshold to the mean of ``datax`` and return ``self``.

        ``datay`` and ``params`` are not used.
        """
        self.condiction = sum(datax)/len(datax)
        return self

    def predict(self, datax):
        """Return a list of 0/1 ints, 1 where a sample exceeds the threshold."""
        # np.asarray keeps the comparison element-wise for plain Python
        # sequences as well; ``list > float`` would raise TypeError.
        predicted = np.asarray(datax) > self.condiction
        return [int(i) for i in predicted]

# Chain the toy transformer and the toy classifier, then fit the pipeline
# and predict on the same three samples.
ml_pipe = Pipeline([
    ('dataprocess', dataprocessor()),
    ('testmodel', learner()),
])
X = np.array([1, 2, 3])
y = np.array([1, 0, 1])
ml_pipe.fit(X, y)
ml_pipe.predict(X)


[0, 0, 1]

### GridSearchCV
GridSearchCV 會針對 param_grid 中的每一組參數，在 init 物件之後呼叫 set_params 代入參數，再呼叫 fit 跟 transform

In [267]:
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor

# Call order: 1) __init__  2) set_params  3) fit
class dataprocessor(TransformerMixin,BaseEstimator):
    """Toy transformer whose hyperparameters are kept in ``self.options``.

    Parameters
    ----------
    bias
        Added to every scaled sample when ``method == 'mean'``.
    method
        ``'mean'`` divides each sample by the fitted sum and adds ``bias``;
        any other value returns the samples unchanged.
    """

    def __init__(self, bias=1, method='mean'):
        self.options = {'method': method, 'bias':bias}
        # Sum of the training samples; overwritten by fit().
        self.total = 0.

    def fit(self, datax, datay=None, **params):
        """Store the sum of ``datax`` and return ``self``."""
        self.total = sum(datax)
        return self

    def transform(self, datax, datay=None, **params):
        """Return the transformed samples as a list (see ``method`` above)."""
        if self.options['method'] == 'mean':
            return [(i/self.total)+self.options['bias'] for i in datax]
        return list(datax)

    def set_params(self,**params):
        """Update hyperparameters in ``self.options`` and return ``self``.

        Raises
        ------
        ValueError
            If a key is not an existing hyperparameter. Storing it anyway
            would make ``get_params`` report a name that ``__init__`` does
            not accept.
        """
        # Validate first so a bad key leaves the options untouched.
        unknown = [name for name in params if name not in self.options]
        if unknown:
            raise ValueError(
                "Invalid parameter(s) %r for dataprocessor; valid parameters "
                "are %r" % (unknown, sorted(self.options))
            )
        self.options.update(params)
        return self

    def get_params(self,deep=True):
        """Return a copy of the hyperparameter dict; ``deep`` is not used."""
        # A copy keeps callers that edit the result from changing this object.
        return dict(self.options)
                
class learner(BaseEstimator):
    """Toy classifier with one hyperparameter kept in ``self.options``.

    Parameters
    ----------
    alpha
        Offset added to the training mean to form the decision threshold.
    """

    def __init__(self, alpha=0.1):
        self.options = {'alpha':alpha}
        # Decision threshold; overwritten by fit().
        self.condiction = 0.

    def fit(self, datax, datay=None, **params):
        """Set the threshold to ``mean(datax) + alpha`` and return ``self``."""
        self.condiction = sum(datax)/len(datax) + self.options['alpha']
        return self

    def predict(self, datax):
        """Return a list of 0/1 ints, 1 where a sample exceeds the threshold."""
        # np.asarray keeps the comparison element-wise for plain Python
        # sequences as well; ``list > float`` would raise TypeError.
        predicted = np.asarray(datax) > self.condiction
        return [int(i) for i in predicted]

    def score(self, X, y=None):
        """Return a random float from ``np.random.rand()``; ``X`` and ``y`` are ignored."""
        return np.random.rand()

    def set_params(self,**params):
        """Update hyperparameters in ``self.options`` and return ``self``.

        Raises
        ------
        ValueError
            If a key is not an existing hyperparameter. Storing it anyway
            would make ``get_params`` report a name that ``__init__`` does
            not accept.
        """
        # Validate first so a bad key leaves the options untouched.
        unknown = [name for name in params if name not in self.options]
        if unknown:
            raise ValueError(
                "Invalid parameter(s) %r for learner; valid parameters "
                "are %r" % (unknown, sorted(self.options))
            )
        self.options.update(params)
        return self

    def get_params(self,deep=True):
        """Return a copy of the hyperparameter dict; ``deep`` is not used."""
        # A copy keeps callers that edit the result from changing this object.
        return dict(self.options)

# Two-step pipeline: the toy transformer followed by the toy classifier.
ml_pipe = Pipeline([
    ('dataprocess', dataprocessor()),
    ('testmodel', learner()),
])

X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
y = np.array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Keys are "<step name>__<parameter name>".
param_grid = {
    'dataprocess__bias': [1, 2],
    'dataprocess__method': ['mean', 'ori'],
    'testmodel__alpha': [.001, 0.01],
}
kf = KFold(n_splits=3, shuffle=True, random_state=123)
# The estimator needs to provide a score function.
# NOTE(review): the `iid` argument is version-dependent (deprecated and later
# removed in newer scikit-learn releases) - confirm the target version.
gsearch = GridSearchCV(
    estimator=ml_pipe,
    param_grid=param_grid,
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch.fit(X, y)
print(gsearch.best_estimator_)
#print(gsearch.cv_results_)

Pipeline(memory=None,
     steps=[('dataprocess', dataprocessor(bias=2, method='ori')), ('testmodel', learner(alpha=0.01))])


### 實現支援GridSearch的類別

In [41]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVC

In [43]:
class dataProcess:
    """Preprocessing step that sends columns through per-dtype sub-pipelines.

    Object-dtype columns go through ``caterogy_pipe`` (constant imputation,
    then one-hot encoding); every other column goes through ``num_pipe``
    (median imputation, then standard scaling). Sub-pipeline settings can be
    changed through ``set_params`` with keys of the form
    ``<pipe name>__<step name>__<parameter>``.
    """

    def __init__(self,aaa=0):
        # Hyperparameters owned by this object itself.
        self.options = {'aaa':aaa}
        self.caterogy_imputer_step = ('caterogy_imputer_step', SimpleImputer(strategy='constant',fill_value='MISSING'))
        self.caterogy_encoder_step = ('caterogy_encoder_step', OneHotEncoder(sparse=False,handle_unknown='ignore'))
        self.num_imputer_step = ('num_imputer_step', SimpleImputer(strategy='median'))
        self.num_scaler_step = ('num_scaler_step', StandardScaler())
        # Sub-pipelines addressable by name from set_params().
        self.something = {
            'caterogy_pipe': Pipeline([self.caterogy_imputer_step, self.caterogy_encoder_step]),
            'num_pipe': Pipeline([self.num_imputer_step, self.num_scaler_step])
        }

    def _build_column_transformer(self, datax):
        """Create ``self.ct``, splitting the columns of ``datax`` by dtype kind."""
        # datax is a pandas DataFrame; dtype kind 'O' marks object columns.
        kinds = np.array([dt.kind for dt in datax.dtypes])
        is_num = kinds != 'O'
        numeric_column_name = datax.columns.values[is_num]
        category_column_name = datax.columns.values[~is_num]
        self.ct_transformer = [
            ('caterogy_transformer', self.something['caterogy_pipe'], category_column_name),
            ('num_transformer', self.something['num_pipe'], numeric_column_name),
        ]
        self.ct = ColumnTransformer(transformers=self.ct_transformer)

    def fit(self, datax, datay=None):
        """Build and fit the column transformer on ``datax``; return ``self``."""
        self._build_column_transformer(datax)
        # Fit here so that fit(...) followed by transform(...) works.
        self.ct.fit(datax, datay)
        return self

    def transform(self, datax, datay=None):
        """Return ``datax`` transformed by the fitted column transformer."""
        return self.ct.transform(datax)

    def fit_transform(self, datax, datay=None):
        """Build the column transformer, fit it on ``datax`` and return the result."""
        self._build_column_transformer(datax)
        return self.ct.fit_transform(datax, datay)

    def set_params(self,**params):
        """Set own options or forward ``<pipe>__<rest>`` keys to a sub-pipeline.

        Returns ``self``.

        Raises
        ------
        ValueError
            If a key is neither an own option nor prefixed with the name of a
            known sub-pipeline.
        """
        for name, value in params.items():
            if name in self.options:
                self.options[name] = value
                continue
            pipe_name, separator, sub_name = name.partition("__")
            if not separator or pipe_name not in self.something:
                raise ValueError(
                    "Invalid parameter %r for dataProcess" % (name,)
                )
            self.something[pipe_name].set_params(**{sub_name: value})
        return self

    def get_params(self,deep=True):
        """Return a copy of the own options; ``deep`` is not used.

        NOTE(review): values set on the sub-pipelines are not reported here,
        so code that rebuilds this object from get_params() presumably loses
        them - confirm against sklearn.base.clone.
        """
        # A copy keeps callers that edit the result from changing this object.
        return dict(self.options)

    def __repr__(self):
        """Return a newline followed by the four step estimators, one per line."""
        caterogy_steps = self.something['caterogy_pipe'].named_steps
        num_steps = self.something['num_pipe'].named_steps
        parts = [
            caterogy_steps['caterogy_imputer_step'],
            caterogy_steps['caterogy_encoder_step'],
            num_steps['num_imputer_step'],
            num_steps['num_scaler_step'],
        ]
        return "\n" + "\n".join(str(part) for part in parts)
    
from sklearn.datasets import load_iris

# Iris features as a DataFrame (dataProcess expects pandas input) plus labels.
data = load_iris()
X = pd.DataFrame(data.data)
y = data.target

# Keys are "<pipeline step>__<sub-pipeline>__<sub-step>__<parameter>" for the
# preprocessing step and "<pipeline step>__<parameter>" for the classifier.
param_grid = {
    'dataProcess__caterogy_pipe__caterogy_imputer_step__fill_value': ['AAA', 'BBB'],
    'dataProcess__num_pipe__num_imputer_step__strategy': ['median', 'mean'],
    'SVC__kernel': ['linear', 'poly'],
    'SVC__C': [0.01, 0.1, 1, 10, 100, 1000],
}

ml_pipe = Pipeline([
    ('dataProcess', dataProcess()),
    ('SVC', SVC()),
])
kf = KFold(n_splits=3, shuffle=True, random_state=123)
# The estimator needs to provide a score function.
# NOTE(review): the `iid` argument is version-dependent (deprecated and later
# removed in newer scikit-learn releases) - confirm the target version.
gsearch = GridSearchCV(
    estimator=ml_pipe,
    param_grid=param_grid,
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch.fit(X, y)

gsearch.best_params_, gsearch.best_score_

({'SVC__C': 1,
  'SVC__kernel': 'linear',
  'dataProcess__caterogy_pipe__caterogy_imputer_step__fill_value': 'AAA',
  'dataProcess__num_pipe__num_imputer_step__strategy': 'median'},
 0.9733333333333333)

In [44]:
print(gsearch.best_estimator_)

Pipeline(memory=None,
     steps=[('dataProcess',
SimpleImputer(copy=True, fill_value='AAA', missing_values=nan,
       strategy='constant', verbose=0)
OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)
SimpleImputer(cop...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])
