# Pipeline
- 

# ColumnTransformer
- 

In [3]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's legacy global RNG so that anything drawing from it gives the
# same results on every Restart & Run All.
np.random.seed(0)

In [2]:
# Download the Titanic dataset (version 1) from OpenML.
# return_X_y=True yields the features and the target separately;
# as_frame=True returns them as pandas objects so columns can be selected by name.
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [4]:
# Pipeline
# Usually used to combine an Estimator (Regressor, Classifier) with preceding Transformers. The result is then called a composite estimator
# Takes a list of 'name', 'estimator' tuples
# Exposes the same methods as the final Estimator
# All but the last Estimator need to be Transformers; the last Estimator can be a Transformer as well
# The Pipeline's .fit() method calls .fit_transform() on all but the last Estimator and .fit() on the last Estimator
# The Pipeline's .fit_transform() method calls .fit_transform() on all Estimators, so for this the last Estimator needs to be a Transformer as well
# Calling .predict() on a Pipeline should call .transform() for all but the last Estimators and .predict() on the last one
# The hyperparameters of all contained Estimators can be optimized together by GridSearchCV (addressed as '<step name>__<parameter>')

# Preprocessing for numeric columns: fill missing values with the column
# median, then standardize with StandardScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Preprocessing for categorical columns: one-hot encode each category.
# handle_unknown="ignore" encodes categories not seen during fit as all zeros
# at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
# ColumnTransformer
# Takes a list of ('name', transformer, list of column names) tuples
# On .transform() or .fit_transform(), it applies each transformer to its given columns and concatenates the results side by side; columns not listed are dropped by default


# Columns routed to each preprocessing branch.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "sex", "pclass"]

# Each entry is (name, transformer, columns): numeric columns go through the
# numeric transformer, categorical columns through the categorical transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# Full model: column-wise preprocessing followed by a logistic regression
# classifier, bundled so both steps are fitted and applied together.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

model score: 0.790


In [None]:
# Fit the preprocessing steps and the classifier on the training split only,
# then report the mean accuracy on the held-out test split.
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("model score: %.3f" % test_accuracy)

In [8]:
# Predict class labels for the held-out test features. Pipeline.predict needs
# the samples to predict on: it runs them through the fitted preprocessing
# steps and then the classifier. Calling it without X raises a TypeError.
clf.predict(X_test)

array(['0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0',
       '1', '1', '0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '1',
       '0', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0',
       '0', '1', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '1',
       '1', '0', '0', '0', '1', '1', '0', '0', '0', '1', '1', '0', '1',
       '1', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1',
       '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0',
       '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0',
       '0', '1', '0', '1', '0', '0', '0', '1', '0', '1', '0', '1', '0',
       '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0',
       '1', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0',
       '0', '0', '0', '1', '0', '1', '0', '0', '1', '1', '1', '1

In [9]:
# Report the fitted pipeline's mean accuracy on the held-out test split.
print("model score: %.3f" % (clf.score(X_test, y_test),))

model score: 0.790
