In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

## Obtendo os Dados

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Limpando Colunas

In [5]:
train = train[["PassengerId", "Survived", "Pclass", "Sex", "Age"]]
test = test[["PassengerId", "Pclass", "Sex", "Age"]]

## Tratando Gênero

In [6]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Sex_female,Sex_male
0,1,0,3,22.0,0,1
1,2,1,1,38.0,1,0
2,3,1,3,26.0,1,0
3,4,1,1,35.0,1,0
4,5,0,3,35.0,0,1


## Imputando Idades Faltantes

In [8]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
Sex_female       0
Sex_male         0
dtype: int64

In [9]:
train_clean = train.copy()
test_clean = test.copy()
imputer = SimpleImputer()

train_clean[["Age"]] = imputer.fit_transform(train[["Age"]])
test_clean[["Age"]] = imputer.fit_transform(test[["Age"]])

## Escalonando as Idades

In [10]:
scaler = StandardScaler()
train_clean[["Age"]] = scaler.fit_transform(train_clean[["Age"]])
test_clean[["Age"]] = scaler.fit_transform(test_clean[["Age"]])

In [11]:
x = train_clean.drop("Survived", axis=1)
y = train_clean["Survived"]

In [14]:
tree = DecisionTreeClassifier(max_depth=3)

In [15]:
tree.fit(x, y)

DecisionTreeClassifier(max_depth=3)

In [18]:
tree.score(x, y)

0.8114478114478114

In [19]:
result = pd.DataFrame()
result["PassengerId"] = test_clean["PassengerId"]
result["Survived"] = tree.predict(test_clean)

## Usando um Pipeline

In [20]:
train = pd.read_csv("train.csv")
train = train[["Pclass", "Sex", "Age", "Survived"]]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(["Survived"], axis=1),
    train["Survived"],
    test_size=0.2,
)

In [23]:
numerical_cleaner = Pipeline(
    steps=[
        ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

Processando a coluna `Age` de uma vez só, com o pipeline:

In [None]:
numerical_cleaner.

In [27]:
numerical_cleaner.fit_transform(train[["Age"]])

array([[-0.5924806 ],
       [ 0.63878901],
       [-0.2846632 ],
       [ 0.40792596],
       [ 0.40792596],
       [ 0.        ],
       [ 1.87005862],
       [-2.13156761],
       [-0.20770885],
       [-1.20811541],
       [-1.97765891],
       [ 2.17787603],
       [-0.7463893 ],
       [ 0.71574336],
       [-1.20811541],
       [ 1.94701297],
       [-2.13156761],
       [ 0.        ],
       [ 0.10010856],
       [ 0.        ],
       [ 0.40792596],
       [ 0.33097161],
       [-1.13116105],
       [-0.1307545 ],
       [-1.66984151],
       [ 0.63878901],
       [ 0.        ],
       [-0.82334365],
       [ 0.        ],
       [ 0.        ],
       [ 0.79269771],
       [ 0.        ],
       [ 0.        ],
       [ 2.79351083],
       [-0.1307545 ],
       [ 0.94660642],
       [ 0.        ],
       [-0.66943495],
       [-0.900298  ],
       [-1.20811541],
       [ 0.79269771],
       [-0.20770885],
       [ 0.        ],
       [-2.05461326],
       [-0.82334365],
       [ 0

Temos alguma diferença entre o gerado pelo pipe o que fizemos na mão?

In [28]:
(numerical_cleaner.fit_transform(train[["Age"]]) == train_clean[["Age"]]).all()

Age    True
dtype: bool

Não!

### Limitando as Colunas

Como garantir que o pipeline `numerical_cleaner` se aplique apenas à age? Com um [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)!

In [29]:
preprocessors = ColumnTransformer(
    transformers=[
        ("Numerical Cleaner", numerical_cleaner, ["Age"]),
        ("Category Encoding", OneHotEncoder(), ["Sex"]),
    ]
)

In [30]:
preprocessors.fit_transform(train)

array([[-0.5924806 ,  0.        ,  1.        ],
       [ 0.63878901,  1.        ,  0.        ],
       [-0.2846632 ,  1.        ,  0.        ],
       ...,
       [ 0.        ,  1.        ,  0.        ],
       [-0.2846632 ,  0.        ,  1.        ],
       [ 0.17706291,  0.        ,  1.        ]])

In [31]:
model = Pipeline(
    steps=[
        ("Pré-processamento", preprocessors),
        ("Árvore de Decisão", DecisionTreeClassifier(max_depth=5)),
    ]
)

In [39]:
model.fit(x_train, y_train)

Pipeline(steps=[('Pré-processamento',
                 ColumnTransformer(transformers=[('Numerical Cleaner',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age']),
                                                 ('Category Encoding',
                                                  OneHotEncoder(), ['Sex'])])),
                ('Árvore de Decisão', DecisionTreeClassifier(max_depth=5))])

In [40]:
model.score(x_train, y_train)

0.8047752808988764

In [41]:
model.score(x_test, y_test)

0.8268156424581006

In [147]:
from joblib import dump
dump(model, "titanic.joblib")

['titanic.joblib']

## Visualizando o Pipeline

In [42]:
from sklearn import set_config
set_config(display='diagram')

In [44]:
numerical_cleaner

In [45]:
preprocessors

In [46]:
model

In [128]:
preprocessors

In [129]:
numerical_cleaner