In [14]:
import mlflow
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Seed NumPy's legacy global random number generator so that any step drawing
# from it gives the same result on every Restart & Run All.
np.random.seed(0)

Load data from https://www.openml.org/d/40945

In [15]:
# Fetch version 1 of the OpenML "titanic" dataset, asking for pandas objects
# (as_frame=True) already split into features X and target y (return_X_y=True).
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

# Alternative: keep the returned Bunch and split its `frame` attribute by hand.
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

Use ColumnTransformer by selecting columns by name

We will train our classifier with the following features:

Numeric Features:

    age: float;

    fare: float.

Categorical Features:

    embarked: categories encoded as strings {'C', 'S', 'Q'};

    sex: categories encoded as strings {'female', 'male'};

    pclass: ordinal integers {1, 2, 3}.

We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could be treated as either a categorical or a numeric feature.

In [16]:
# Columns routed to each branch of the preprocessor, selected by name.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "sex", "pclass"]

# Numeric branch: fill missing values with the column mean, then standardize.
# The step names ("imputer", "scaler") are what grid-search parameter paths
# refer to, so they must not change.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Apply each branch to its own columns. Columns not listed above are not
# passed to either transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

Append classifier to preprocessing pipeline. Now we have a full prediction pipeline.

In [17]:
# Hold out 20% of the rows for the final evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Full prediction pipeline: preprocessing followed by a logistic regression
# with default hyper-parameters. The step names are referenced by the
# grid-search parameter paths later on.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)
clf.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out rows.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.790


In [18]:
# Show the pipeline through IPython's display(), which uses the object's rich
# representation when one is available.
display(clf)

Using the prediction pipeline in a grid search

Grid search can also be performed on the different preprocessing steps defined in the ColumnTransformer object, together with the classifier’s hyperparameters as part of the Pipeline. We will search for both the imputer strategy of the numeric preprocessing and the regularization parameter of the logistic regression using GridSearchCV.

In [19]:
# Parameter paths follow <step>__<sub-step>__<parameter>, using the names given
# to the steps of `clf` and of its ColumnTransformer.
param_grid = {
    # Strategy used by the numeric imputer.
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    # Regularization parameter C of the logistic regression.
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Exhaustive search over the 2 x 4 grid, scored with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search

Calling ‘fit’ triggers the cross-validated search for the best hyper-parameter combination:

In [20]:
# Run the cross-validated search on the training split only; the test split
# stays untouched for the final evaluation.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination (header and dict on separate lines)
# and its mean cross-validated score.
print("Best params:", grid_search.best_params_, sep="\n")
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Best params:
{'classifier__C': 0.1, 'preprocessor__num__imputer__strategy': 'mean'}
Internal CV score: 0.783


In [11]:
# Tabulate every candidate evaluated by the grid search, best mean
# cross-validated score first. pandas (`pd`) is imported in the notebook's
# import cell rather than here, so all dependencies are visible in one place.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

# Show only the score summary and the two tuned parameters for each candidate.
cv_results[
    [
        "mean_test_score",
        "std_test_score",
        "param_preprocessor__num__imputer__strategy",
        "param_classifier__C",
    ]
]

Unnamed: 0,mean_test_score,std_test_score,param_preprocessor__num__imputer__strategy,param_classifier__C
0,0.783223,0.038166,mean,0.1
1,0.781319,0.038468,median,0.1
2,0.78131,0.032086,mean,1.0
4,0.780357,0.032223,mean,10.0
6,0.780357,0.032223,mean,100.0
5,0.779396,0.030362,median,10.0
7,0.779396,0.030362,median,100.0
3,0.778434,0.029904,median,1.0


The best hyper-parameters have been used to re-fit a final model on the full training set. We can evaluate that final model on held-out test data that was not used for hyperparameter tuning.

In [12]:
# Score the refitted best estimator on the held-out test split, which took no
# part in the hyper-parameter search.
print(
    "best logistic regression from grid search: %.3f"
    % grid_search.score(X_test, y_test)
)

best logistic regression from grid search: 0.798
