In [1]:
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Load the training data from `train.csv` (path is resolved relative to the working directory).
df = pd.read_csv('train.csv')

In [5]:
# Peek at five randomly chosen rows; no seed is passed, so the rows shown differ between runs.
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
657,658,0,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q
84,85,1,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5,,S
278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q
834,835,0,3,"Allum, Mr. Owen George",male,18.0,0,0,2223,8.3,,S
485,486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S


In [7]:
# Drop the columns that are not used as model inputs.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [16]:
# Separate the predictors (everything except 'Survived') from the target column.
X, y = df.drop(columns='Survived'), df['Survived']

In [18]:
# Hold out 20% of the rows for testing. A fixed `random_state` makes the split
# (and every result derived from it) reproducible across kernel restarts.
# `X` / `y` defined in the preceding cell are reused instead of re-deriving them from `df`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
# Show the first rows of the training predictors to confirm which columns remain.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
452,1,male,30.0,0,0,27.75,C
475,1,male,,0,0,52.0,S
724,1,male,27.0,1,0,53.1,S
805,3,male,31.0,0,0,7.775,S
625,1,male,61.0,0,0,32.3208,S


In [30]:
# Numeric columns: fill missing values with the column median, then standardise.
# The step names ('imputer', 'scaler') are part of the parameter paths used by the
# grid search, so they must stay as they are.
# NOTE: the variable is spelled `numerica_transform` (sic); the ColumnTransformer
# cell refers to it by that exact name.
numerical_features = ['Age', 'Fare']
numerica_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [34]:
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. `handle_unknown='ignore'` means categories that were not seen
# during fitting do not raise an error at transform time.
categorical_features = ['Embarked', 'Sex']
categorical_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [68]:
# Route each column group through its own sub-pipeline.
# `remainder='drop'` is the ColumnTransformer default; it is spelled out here because
# it means the remaining columns of X (Pclass, SibSp, Parch) never reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerica_transform, numerical_features),
        ('cat', categorical_transform, categorical_features),
    ],
    remainder='drop',
)

In [70]:
# End-to-end estimator: preprocessing followed by a logistic-regression classifier
# created with its default hyper-parameters. The step names are the prefixes used
# when addressing nested parameters (e.g. 'preprocessor__...').
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression()),
])

In [72]:
# Render estimators as diagrams, then display the full pipeline.
# (`set_config` is imported in the import cell at the top of the notebook.)
set_config(display='diagram')
clf

In [90]:

# Imputation strategies to compare. Each key is a path of the form
# <pipeline step>__<transformer name>__<sub-step>__<parameter>.
# 'constant' is tried without an explicit fill_value, so SimpleImputer uses its default one.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
}

# 2 x 2 = 4 parameter combinations, each evaluated with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [94]:
# Run the search on the training split only. `fit` returns the fitted search
# object, so the winning parameter combination can be read straight off it.
print(grid_search.fit(X_train, y_train).best_params_)

{'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


In [96]:
# Mean cross-validated score of the best parameter combination found by the search.
# NOTE(review): X_test / y_test are not used in the cells shown here, so no held-out score is reported.
print(grid_search.best_score_)

0.786443661971831
