# Pipelines

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
from yellowbrick.model_selection import (
    ValidationCurve,
)

url = ("https://hbiostat.org/data/repo/titanic3.csv")

df = pd.read_csv(url)
org_df = df.copy()

def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

def get_train_test_X_y(
    df, y_col, size=0.3, std_cols=None
):
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=size, random_state=42
    )
    cols = X.columns
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = impute.IterativeImputer()

    fitted = fi.fit_transform(X_train[num_cols])
    X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(num_cols)})
    test_fit = fi.transform(X_test[num_cols])
    X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(num_cols)})
    if std_cols:
        std = preprocessing.StandardScaler()
        fitted = std.fit_transform(X_train[std_cols])
        X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(std_cols)})
        test_fit = std.transform(X_test[std_cols])
        X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(std_cols)})

    return X_train, X_test, y_train, y_test

ti_df = tweak_titanic(df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df, "survived", std_cols=std_cols
)

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [3]:
class TitanicTransformer(BaseEstimator, TransformerMixin):
    
    def transform(self, X):
        X = tweak_titanic(X)
        X = X.drop(columns="survived")
        return X
    
    def fit(self, X, y):
        return self
    
pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("rf", RandomForestClassifier()),
        
    ]
)

In [4]:
from sklearn.model_selection import train_test_split

In [6]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    org_df,
    org_df.survived,
    test_size=0.3,
    random_state=42,
)

pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.8040712468193384

Os pipelines podem ser usados na busca em grade. O parâmetro `param_grid` deve ter parametros prefixados pelo nome da etapa do pipeline, seguido de dois underscores

In [11]:
params = {
    "rf__max_features": [0.4, "auto", 0.6],
    "rf__n_estimators": [15,100, 200, 300],
}

grid = model_selection.GridSearchCV(
    pipe, param_grid=params, cv=3
)

grid.fit(org_df, org_df.survived)

12 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/home/pcmoraes/Área de Trabalho/Editores de código/Inteligencia-Artificial/Livros/Machine Learning - Guia de Referência Rápida/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pcmoraes/Área de Trabalho/Editores de código/Inteligencia-Artificial/Livros/Machine Learning - Guia de Referência Rápida/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/pcmoraes/Área de Trabal

In [12]:
grid.best_params_

{'rf__max_features': 0.4, 'rf__n_estimators': 300}

In [13]:
pipe.set_params(**grid.best_params_)
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7913486005089059

## Pipelines de Regressão

In [14]:
import pandas as pd
from sklearn import (
    model_selection,
    preprocessing
)
from sklearn.model_selection import train_test_split
from pathlib import Path

In [15]:
ROOT_DIR = Path.cwd().parent
DATA = ROOT_DIR / 'datasets' / 'housing.csv'

In [16]:
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

df = pd.read_csv(DATA, header=None, delimiter=r"\s+", names=column_names)

In [18]:
# a target do dataset de boston é o MEDV
bos_y = df.MEDV
bos_X = df.drop('MEDV', axis=1)
bos_X_train, bos_X_test, bos_y_train, bos_y_test = train_test_split(bos_X, bos_y, test_size=0.3, random_state=42)

In [17]:
from sklearn.linear_model import LinearRegression

In [20]:
reg_pipe = Pipeline(
    [
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("lr", LinearRegression()),
    ]
)

In [21]:
reg_pipe.fit(bos_X_train, bos_y_train)

In [22]:
reg_pipe.score(bos_X_test, bos_y_test)

0.7112260057484932

Para extrair partes do pipeline a fim de analisar suas propriedades pode-se usar o atributo `named_steps` do pipeline

In [23]:
reg_pipe.named_steps["lr"].intercept_

23.01581920903955

In [24]:
reg_pipe.named_steps["lr"].coef_

array([-1.10834602,  0.80843998,  0.34313466,  0.81386426, -1.79804295,
        2.913858  , -0.29893918, -2.94251148,  2.09419303, -1.44706731,
       -2.05232232,  1.02375187, -3.88579002])

In [25]:
from sklearn import metrics

In [26]:
metrics.mean_squared_error(
    bos_y_test, reg_pipe.predict(bos_X_test)
)

21.517444231177215

## Pipelines de PCA

In [27]:
from sklearn.decomposition import PCA

In [28]:
pca_pipe = Pipeline(
    [
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("pca", PCA())
    ]
)

In [29]:
X_pca = pca_pipe.fit_transform(X)

In [31]:
pca_pipe.named_steps["pca"].explained_variance_ratio_

array([0.23922843, 0.21616844, 0.19231579, 0.10464906, 0.08154794,
       0.07272212, 0.05130716, 0.04206105])