Want your grid search to run faster? Set n_jobs=-1 to use parallel processing with all CPUs!

In [1]:
import pandas as pd
# Load the training data; the relative path is resolved against the kernel's
# current working directory.
# NOTE(review): the columns used below (Sex, Name, Age, Survived) suggest this
# is the Titanic training set -- confirm the data source.
df = pd.read_csv('train.csv')

In [2]:
# Feature columns: one categorical ('Sex'), one free-text ('Name') and one
# numeric ('Age'); each is routed to its own transformer in the preprocessing step.
cols = ['Sex', 'Name', 'Age']

# Feature matrix (DataFrame) and target vector (Series), selected by label.
X = df.loc[:, cols]
y = df.loc[:, 'Survived']

In [3]:
# Display estimators as HTML diagrams (instead of plain-text reprs) when they
# are shown as cell output, e.g. the fitted GridSearchCV returned by grid.fit.
from sklearn import set_config
set_config(display='diagram')

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [5]:
# Column-wise preprocessing. The step names ('ohe', 'vectorizer', 'imputer')
# are part of the hyper-parameter keys used in the grid search, so they must
# not be renamed independently of the parameter grid.
ct = ColumnTransformer(
    transformers=[
        # One-hot encode the categorical column (selected as a list -> 2-D input).
        ('ohe', OneHotEncoder(), ['Sex']),
        # 'Name' is given as a bare string, not a list, so the vectorizer
        # receives a 1-D sequence of documents, which CountVectorizer requires.
        ('vectorizer', CountVectorizer(), 'Name'),
        # Fill missing ages using SimpleImputer's default strategy.
        ('imputer', SimpleImputer(), ['Age']),
    ],
)

In [10]:
# liblinear supports both the 'l1' and 'l2' penalties that the grid search
# tries; random_state is fixed so repeated runs are reproducible.
clf = LogisticRegression(
    random_state=1,
    solver='liblinear',
)

In [11]:
# Chain preprocessing and the classifier into a single estimator. The step
# names become the prefixes of the grid-search parameter keys
# ('preprocessor__...', 'classifier__...').
pipe = Pipeline(
    steps=[
        ('preprocessor', ct),
        ('classifier', clf),
    ],
)

In [12]:
# Hyper-parameter grid for the pipeline. Keys follow scikit-learn's
# '<step>__<sub-step>__<parameter>' convention.
# Total combinations: 2 * 3 * 2 * 7 * 2 = 168.
params = {
    # Preprocessing: one-hot encoder and text vectorizer settings.
    'preprocessor__ohe__drop': [None, 'first'],
    'preprocessor__vectorizer__min_df': [1, 2, 3],
    'preprocessor__vectorizer__ngram_range': [(1, 1), (1, 2)],
    # Classifier: regularization strength and penalty type.
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__penalty': ['l1', 'l2'],
}

In [13]:
grid = GridSearchCV(pipe, params)
%time grid.fit(X, y)

CPU times: total: 14.8 s
Wall time: 14.8 s


In [14]:
grid = GridSearchCV(pipe, params, n_jobs=-1)
%time grid.fit(X, y)

CPU times: total: 1.8 s
Wall time: 7.74 s
