# Pipelining a Supervised learning model

Library imports

In [1]:
from civismlext.stacking import StackedClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

  from numpy.core.umath_tests import inner1d


Loading the data set

In [2]:
X, y = load_iris(return_X_y=True)

## Simple example using LogisticRegression

In [3]:
train_X, test_X, train_y, test_y = train_test_split(X, y)
model = LogisticRegression(solver='lbfgs', multi_class='ovr')
model.fit(train_X, train_y)
scores = model.predict(test_X)
print(scores)

[2 1 0 1 0 0 1 0 2 2 0 1 2 0 0 1 0 0 1 1 1 0 1 0 2 1 1 2 2 1 1 0 1 1 2 0 1
 0]


## Using Pipelines for ETL-like jobs

In [4]:
train_X, test_X, train_y, test_y = train_test_split(X, y)

estimator_list = [
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(solver='lbfgs', multi_class='ovr')),
]

pipeline = Pipeline(estimator_list)
pipeline.fit(train_X, train_y)
scores = pipeline.predict(test_X)
print(scores)

[0 0 2 2 1 0 1 2 1 2 0 0 0 0 2 0 2 1 0 0 2 1 0 1 0 2 1 1 1 1 0 0 1 1 2 2 1
 2]


## Using ensemble learning methodlogies

In [5]:
def score_iris(estimator):
    X, y = load_iris(return_X_y=True)
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    estimator_list = [
        ('scaler', StandardScaler()),
        ('custom_estimator', estimator),
    ]
    
    pipeline = Pipeline(estimator_list)
    pipeline.fit(train_X, train_y)
    scores = pipeline.predict(test_X)
    return pipeline, scores


model = GradientBoostingClassifier(n_estimators=50)
pipeline, scores = score_iris(model)
print(scores)
print(pipeline.steps)

[2 1 2 2 0 1 2 1 2 0 2 2 1 2 0 2 0 1 0 2 2 0 1 0 1 0 2 1 1 2 2 1 0 0 0 1 0
 2]
[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('custom_estimator', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]


In [6]:
new_estimator_list = [
    ('imputer', Imputer()),
    ('gradient_boosting_classifier', GradientBoostingClassifier()),
]
pipeline_estimator = Pipeline(new_estimator_list)
pipeline, scores = score_iris(pipeline_estimator)
print(pipeline)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('custom_estimator', Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('gradient_boosting_classifier', GradientBoostingClassifier(criterion='friedm...   presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]))])


Using StackedClassifier

In [7]:
estimator_list = [
    ('logistic', LogisticRegression(solver='lbfgs', multi_class='ovr')),
    ('random_forest', RandomForestClassifier(n_estimators=10)),
    ('gradient_boosting_classifier', GradientBoostingClassifier()),
    ('meta', LogisticRegression(solver='lbfgs', multi_class='ovr')),
]

stacker = StackedClassifier(estimator_list)
stacker.fit(X, y)
scores = stacker.predict(X)
print(scores)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
