Cross-Validation com Pipelines

In [1]:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# Load the training set and, in the same expression, discard the name,
# ticket and cabin columns.
df = pd.read_csv("train.csv").drop(columns=["Name", "Ticket", "Cabin"])

# Split the frame into predictors and the Survived label.
features = df.drop(columns=["Survived"])
target = df["Survived"]

# Pipeline stages, applied in order: one-hot encoding, mean imputation of
# missing values, then a depth-3 decision tree with a fixed seed.
pipeline_steps = [
    ("one-hot encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="mean")),
    ("tree", DecisionTreeClassifier(max_depth=3, random_state=0)),
]
model = Pipeline(steps=pipeline_steps)

# Evaluate the whole pipeline with shuffled, seeded 5-fold cross-validation.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = cross_validate(model, X=features, y=target, cv=kfold)

# Report the mean and standard deviation of the per-fold test scores.
fold_scores = results["test_score"]
print("Average accuracy: %f (%f)" % (fold_scores.mean(), fold_scores.std()))

Average accuracy: 0.815956 (0.024300)


In [3]:
# Display the dtype of every column currently held in df.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

Grid-Search com Pipelines

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Read the training set, dropping the name, ticket and cabin columns up front.
df = pd.read_csv("train.csv").drop(columns=["Name", "Ticket", "Cabin"])

# Predictors and label used to fit the search.
X = df.drop(columns=["Survived"])
y = df["Survived"]

# Model pipeline: one-hot encoding -> mean imputation -> decision tree.
encoder_step = ("one-hot encoder", OneHotEncoder())
imputer_step = ("imputer", SimpleImputer(strategy="mean"))
tree_step = ("tree", DecisionTreeClassifier(max_depth=3, random_state=0))
model = Pipeline(steps=[encoder_step, imputer_step, tree_step])

# Tune the tree depth with shuffled, seeded 5-fold cross-validation.
# The "tree__" prefix routes the parameter to the pipeline step named "tree".
parameters = {"tree__max_depth": [3, 4, 5]}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(estimator=model, param_grid=parameters, cv=kfold, n_jobs=-1)
grid.fit(X, y)

# Best parameter combination found by the search (shown as the cell output).
grid.best_params_

{'tree__max_depth': 3}

In [None]:
Pré-processando diferentes variáveis com ColumnTransformer

In [6]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

# Load the training set and drop the name, ticket and cabin columns without
# mutating a frame in place, so the cell is safe to re-run.
df = pd.read_csv("train.csv").drop(columns=["Name", "Ticket", "Cabin"])

# Pre-processing for the float columns Age and Fare: median imputation.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Pre-processing for the object columns Sex and Embarked: one-hot encoding.
cat_transformer = Pipeline(steps=[
    ('one-hot encoder', OneHotEncoder())
])

# Combine the per-column pre-processors.
# ColumnTransformer defaults to remainder='drop', which would silently discard
# every column not listed below (Pclass, SibSp, Parch, PassengerId) before the
# tree sees it. Those columns are int64, so they need neither imputation nor
# encoding; 'passthrough' forwards them unchanged and keeps the feature set
# the same as the plain-pipeline cells.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['Age', 'Fare']),
        ('cat', cat_transformer, ['Sex', 'Embarked'])
    ],
    remainder='passthrough'
)

# Full model: column-wise pre-processing followed by a decision tree.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=0))
])

# Tune the tree depth with shuffled, seeded 5-fold cross-validation.
# The "tree__" prefix routes the parameter to the pipeline step named "tree";
# return_train_score=True also records training-fold scores in cv_results_.
parameters = {'tree__max_depth': [3, 4, 5]}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(model, param_grid=parameters, cv=kfold, n_jobs=-1, return_train_score=True)
grid.fit(X=df.drop(columns=['Survived']), y=df['Survived'])