You can cross-validate and grid search an entire pipeline!

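Preprocessing steps will automatically occur AFTER each cross-validation split: the transformers are fit on the training folds only and then applied to the held-out fold, so no information from the held-out data leaks into preprocessing. This is critical if you want meaningful scores.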

In [8]:
import pandas as pd

# NOTE(review): only the first 6 rows are read, so any scores computed from
# this data are illustrative rather than reliable estimates
df = pd.read_csv(
    'train.csv',
    nrows=6,
)

In [9]:
# feature columns passed to the pipeline, and the target column to predict
cols = ['Sex', 'Name']
X, y = df[cols], df['Survived']

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [11]:
# one transformer per column: one-hot encoding for 'Sex', token counts for 'Name'
ohe, vect = OneHotEncoder(), CountVectorizer()

# 'Sex' is given as a list so the encoder receives 2D input; 'Name' is given as a
# plain string so the vectorizer receives the 1D sequence of documents it expects
ct = make_column_transformer(
    (ohe, ['Sex']),
    (vect, 'Name'),
)

In [12]:
from sklearn.linear_model import LogisticRegression

# liblinear supports both the 'l1' and 'l2' penalties; the fixed random_state
# keeps results repeatable
clf = LogisticRegression(
    solver='liblinear',
    random_state=1,
)

In [13]:
from sklearn.pipeline import make_pipeline
# chain the preprocessing and the model into a single estimator; make_pipeline
# names each step after its lowercased class name ('columntransformer',
# 'logisticregression'), which is what the grid search parameter keys refer to
pipe = make_pipeline(ct, clf)

## Cross-validate the entire pipeline (not just the model)

In [17]:
from sklearn.model_selection import cross_val_score

# mean accuracy over 3 folds; the whole pipeline (preprocessing + model) is
# refit on the training portion of every fold
cross_val_score(
    pipe, X, y,
    cv=3,
    scoring='accuracy',
).mean()

1.0

## Find optimal tuning parameters for the entire pipeline

In [19]:
# parameter values to search, keyed as <step name>__<parameter name>
# (nested steps are chained with further double underscores)
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
}

In [21]:
from sklearn.model_selection import GridSearchCV

# exhaustive search: 2 * 3 * 2 = 12 parameter combinations, each scored by
# 3-fold cross-validated accuracy
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=3,
    scoring='accuracy',
)
# trailing semicolon suppresses the fitted estimator's repr in the cell output
grid.fit(X, y);

In [22]:
# what was the best score found during the search?
# (the highest mean cross-validated accuracy among all parameter combinations)
grid.best_score_

1.0

In [23]:
# which combination of parameters produced the best score?
# (keys use the same <step>__<parameter> names as the search grid)
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 0.1,
 'logisticregression__penalty': 'l2'}