In [3]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's legacy global random number generator.
np.random.seed(0)

# Load the passenger table from a local CSV file (path is relative to the
# working directory).
# NOTE(review): the data resembles the Titanic set published at
# https://www.openml.org/d/40945, but the cells visible here read it from disk
# rather than fetching it from OpenML -- confirm where train.csv comes from.
data = pd.read_csv("train.csv")

# The feature matrix X and the target y are derived from `data` in a later
# cell by dropping columns and selecting the 'Survived' column.

In [4]:
# Display the loaded frame; the recorded output is a truncated preview
# (first and last rows with an ellipsis row in between).
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
# Target vector: the 'Survived' column of the loaded frame.
y = data["Survived"]

# Feature matrix: every remaining column except the target itself and the
# PassengerId / Ticket / Name columns, which are not used as features.
X = data.drop(columns=["Survived", "PassengerId", "Ticket", "Name"])

In [6]:
# Preview the non-numeric feature columns of X.
X.select_dtypes(exclude=np.number)

Unnamed: 0,Sex,Cabin,Embarked
0,male,,S
1,female,C85,C
2,female,,S
3,female,C123,S
4,male,,S
...,...,...,...
886,male,,S
887,female,B42,S
888,female,,S
889,male,C148,C


In [7]:
# Names of the numeric feature columns, as a NumPy array of column labels.
numericDataColumn = X.select_dtypes(include=np.number).columns.values

In [8]:
# Names of the remaining (non-numeric) feature columns, as a NumPy array.
categoryDataColumn = X.select_dtypes(exclude=np.number).columns.values

In [9]:
# Numeric features: fill missing values with the column median, then
# standardize with StandardScaler.
numericTransformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not present when the encoder was fitted.
categoryTransformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)


In [10]:
# Route each group of columns (selected by name) to its own sub-pipeline:
# 'num' handles the numeric columns, 'cat' the non-numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numericTransformer, numericDataColumn),
        ("cat", categoryTransformer, categoryDataColumn),
    ]
)

In [11]:
# Full model: preprocessing followed by a logistic-regression classifier with
# default settings. The step names ('preprocessor', 'classifier') are the
# prefixes used by the grid-search parameter names later on.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [12]:
# Fit the preprocessing steps and the classifier on the full feature matrix.
pipe.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  array(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype=object)),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                

In [15]:
# Display the feature matrix to recall its column order before building a
# hand-written sample row below.
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [41]:
# Hand-written sample passenger, values listed in the same order as the
# columns of X (Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked).
# A flat list produces a single column with eight rows; the next cell
# transposes it into one row.
# np.nan is used for the missing Cabin value: the `np.NaN` alias was removed
# in NumPy 2.0 and raises AttributeError there.
test_ = pd.DataFrame([3, "male", 32.0, 0, 0, 7.7500, np.nan, "Q"])

In [42]:
# Transpose the single-column frame so the eight values form one row.
# NOTE: this cell reassigns test_ from itself, so re-running it on its own
# flips the frame back; run it exactly once after the cell that creates test_.
test_ = test_.T

In [43]:
# Label the sample row with the feature names of X so the pipeline's
# ColumnTransformer, which selects columns by name, can find them.
test_.columns = X.columns

In [44]:
# Display the one-row sample frame to check the values line up with the
# intended columns.
test_

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,32.0,0,0,7.75,,Q


In [45]:
# Predict the 'Survived' label for the hand-written sample passenger.
pipe.predict(test_)

array([0])

In [69]:
# Confusion matrix of the fitted pipeline (rows = true class, columns =
# predicted class), computed on the same rows the pipeline was trained on.
# These are in-sample numbers, so they are optimistic compared with
# performance on held-out data.
# `confusion_matrix` is imported in the import cell at the top of the
# notebook rather than here, so all dependencies are visible in one place.
confusion_matrix(y, pipe.predict(X))

array([[481,  68],
       [ 86, 256]])

In [70]:
# Hyper-parameters to search over. Keys follow the `<step>__<param>`
# convention for nested estimators: the numeric imputer's strategy and the
# classifier's C value.
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Exhaustive search over every combination in the grid using 10-fold
# cross-validation; the estimator is only configured here, not fitted.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)
grid_search

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

In [72]:
# Run the cross-validated search on the full data set.
grid_search.fit(X, y)

# Report the best-scoring parameter combination.
print("Best params:")
print(grid_search.best_params_)





Best params:
{'classifier__C': 0.1, 'preprocessor__num__imputer__strategy': 'mean'}




In [74]:
# Tabulate the per-candidate cross-validation results, best mean test score
# first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_preprocessor__num__imputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01386,0.002619,0.005144,0.000705,0.1,mean,"{'classifier__C': 0.1, 'preprocessor__num__imp...",0.822222,0.788889,0.775281,0.797753,0.808989,0.775281,0.775281,0.786517,0.831461,0.840909,0.800224,0.023158,1
4,0.017067,0.002069,0.006209,0.001599,10.0,mean,"{'classifier__C': 10, 'preprocessor__num__impu...",0.8,0.777778,0.797753,0.808989,0.808989,0.775281,0.786517,0.797753,0.853933,0.795455,0.800224,0.020947,1
1,0.01327,0.001101,0.004914,0.000535,0.1,median,"{'classifier__C': 0.1, 'preprocessor__num__imp...",0.811111,0.788889,0.775281,0.797753,0.820225,0.775281,0.786517,0.786517,0.808989,0.840909,0.799102,0.019931,3
5,0.014746,0.000886,0.005248,0.000452,10.0,median,"{'classifier__C': 10, 'preprocessor__num__impu...",0.8,0.777778,0.797753,0.808989,0.808989,0.775281,0.797753,0.797753,0.842697,0.784091,0.799102,0.018299,3
2,0.013468,0.001124,0.004822,0.000598,1.0,mean,"{'classifier__C': 1.0, 'preprocessor__num__imp...",0.8,0.788889,0.775281,0.808989,0.797753,0.775281,0.786517,0.797753,0.842697,0.806818,0.79798,0.018558,5
3,0.014332,0.002195,0.005339,0.001124,1.0,median,"{'classifier__C': 1.0, 'preprocessor__num__imp...",0.8,0.788889,0.775281,0.808989,0.797753,0.775281,0.786517,0.797753,0.853933,0.795455,0.79798,0.02122,5
6,0.016887,0.001052,0.005828,0.000417,100.0,mean,"{'classifier__C': 100, 'preprocessor__num__imp...",0.766667,0.777778,0.775281,0.764045,0.797753,0.786517,0.752809,0.786517,0.842697,0.795455,0.784512,0.023576,7
7,0.016855,0.000899,0.005894,0.000303,100.0,median,"{'classifier__C': 100, 'preprocessor__num__imp...",0.755556,0.777778,0.775281,0.775281,0.797753,0.786517,0.752809,0.786517,0.831461,0.784091,0.782267,0.02098,8


In [75]:
# Confusion matrix of the grid search's predictions, evaluated on the same
# rows that were used for the search (in-sample).
confusion_matrix(y, grid_search.predict(X))

array([[483,  66],
       [100, 242]])