##### Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

# Settings shared by the estimators and the grid search below.
N_JOBS = -1  # passed as n_jobs to both the forest and GridSearchCV
CV = 5       # passed as cv to GridSearchCV

# Raw data location and the two CSV splits.
DATA_PATH = "./data/"
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")

# Train features (label column removed) stacked on top of the test rows,
# re-indexed from 0.
data_full = pd.concat([train.drop(columns="Survived"), test], ignore_index=True)

##### Transforming

In [2]:
class GroupNames(BaseEstimator, TransformerMixin):
    """Reduce the ``Name`` column to a small set of honorific titles.

    Each name is cut down to the text between the last ``", "`` and the next
    space (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr."``). Several less
    common titles are then folded into ``'Mrs.'`` or ``'Mr.'``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``Name`` column holds the grouped title.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a string ``Name`` column.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input ``X`` is left untouched.
        """
        # Work on a copy so the caller's frame is never modified.
        X = X.copy()

        # Strip everything up to and including ", ", then everything from the
        # first remaining space onwards, leaving only the title token.
        X['Name'] = (
            X['Name']
            .str.replace(".*, ", "", regex=True)
            .str.replace(" .*", "", regex=True)
        )

        # The patterns are regular expressions and '.' is an unescaped
        # wildcard, so e.g. 'Don.' also matches 'Dona.'. They are kept exactly
        # as before so the resulting grouping does not change.
        # na=False keeps missing names out of the mask instead of producing
        # NaN entries, which boolean indexing rejects.
        # A single .loc[row_mask, column] assignment writes into X itself;
        # chained X.Name.loc[...] = ... may write to a temporary instead.
        to_mrs = X['Name'].str.contains('Mlle.|the|Ms.|Lady.|Mme.', na=False)
        X.loc[to_mrs, 'Name'] = 'Mrs.'

        to_mr = X['Name'].str.contains(
            'Dr.|Rev.|Major.|Col.|Capt.|Sir.|Don.|Jonkheer.', na=False
        )
        X.loc[to_mr, 'Name'] = 'Mr.'
        return X

In [3]:
# Categorical branch: group the titles found in Name, replace missing values
# with the constant 'NA', then map each category to an ordinal code.
cat_pipe = Pipeline(steps=[
    ('grouper', GroupNames()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('encoder', OrdinalEncoder()),
])

In [4]:
# Numeric branch: distance-weighted KNN imputation, followed by a 'kernel'
# slot. None makes that step a passthrough; it can be swapped for another
# transformer via the `transformer__num_pipe__kernel` parameter.
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer(weights='distance')),
    ('kernel', None),
])

In [5]:
# Columns routed to the numeric and the categorical sub-pipeline respectively.
num_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_columns = ['Name', 'Sex', 'Embarked']

# Each branch only sees its own columns; the outputs are concatenated in the
# order the branches are listed here.
full_transform = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, num_columns),
    ('cat_pipe', cat_pipe, cat_columns),
])

In [6]:
# End-to-end model: column-wise preprocessing, standardisation of every
# resulting feature, then an entropy-criterion random forest (seeded for
# repeatable results).
full_pipe = Pipeline(steps=[
    ('transformer', full_transform),
    ('scaler', StandardScaler()),
    (
        'predictor',
        RandomForestClassifier(criterion='entropy', random_state=0, n_jobs=N_JOBS),
    ),
])

In [8]:
# Hyper-parameter grid searched over the full pipeline. Keys follow the
# `<step>__<param>` convention of nested scikit-learn estimators.
param_grid = [{
    # Optional polynomial expansion of the numeric features (None = passthrough).
    'transformer__num_pipe__kernel': [None, PolynomialFeatures(degree=1), PolynomialFeatures(degree=2)],
    'predictor__min_samples_leaf': [1, 2, 3, 4],
    'predictor__min_samples_split': [2, 3, 4, 5],
    # NOTE(review): these three values look like the arguments of
    # range(100, 251, 25) rather than an intended candidate list; confirm
    # before changing. Values kept as-is.
    'predictor__n_estimators': [100, 251, 25],
    # 'auto' was dropped: for RandomForestClassifier it was an alias of 'sqrt'
    # (so it only duplicated fits) and it was removed in scikit-learn 1.3,
    # where it makes those candidates fail.
    'predictor__max_features': ['sqrt', 'log2'],
}]
gs = GridSearchCV(full_pipe, param_grid, n_jobs=N_JOBS, cv=CV, return_train_score=True)
gs.fit(train.drop(labels=["Survived"], axis=1), train.Survived)

# Show the full parameter dict of each candidate instead of a truncated string.
pd.set_option('display.max_colwidth', None)
# Top 15 candidates by cross-validated test score.
pd.DataFrame(gs.cv_results_)[["params", "rank_test_score", "mean_test_score", "mean_train_score"]].sort_values(by='rank_test_score', axis=0).head(15)

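#### All results after applying grid search to RandomForestClassifier, SVC, and LogisticRegression

Each row lists: params, rank_test_score, mean_test_score, mean_train_score (the train score is missing from some rows).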

{'predictor': SVC(random_state=0), 'transformer__num_pipe__kernel__degree': 1} 	1 	0.837242 	0.839786  
{'predictor': SVC(random_state=0), 'transformer__num_pipe__kernel__degree': 2} 	2 	0.831630 	0.840347  
{'predictor': RandomForestClassifier(random_state=0), 'transformer__num_pipe__kernel__degree': 1} 	3 	0.818191 	0.982323  
{'predictor': LogisticRegression(max_iter=200, random_state=0), 'transformer__num_pipe__kernel__degree': 2} 	4 	0.814808 	0.826599  
{'predictor': RandomForestClassifier(random_state=0), 'transformer__num_pipe__kernel__degree': 2} 	5 	0.811481 	0.982323  
{'predictor': LogisticRegression(max_iter=200, random_state=0), 'transformer__num_pipe__kernel__degree': 1} 	6 	0.793516 	0.804714  

{'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 3, 'rfc__n_estimators': 165, 'transformer__num_pipe__kernel__degree': 1} 	1 	0.841749 	0.918070  
{'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 4, 'rfc__n_estimators': 165, 'transformer__num_pipe__kernel__degree': 1} 	1 	0.841749 	0.918070  
{'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 3, 'rfc__n_estimators': 170, 'transformer__num_pipe__kernel__degree': 1} 	1 	0.841749 	0.917228  
{'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 4, 'rfc__n_estimators': 170, 'transformer__num_pipe__kernel__degree': 1} 	1 	0.841749 	0.917228  

{'lr__C': 1.6111111111111112, 'lr__tol': 1e-05, 'transformer__num_pipe__kernel': PolynomialFeatures()} 	1 	0.818172  
{'lr__C': 1.6111111111111112, 'lr__tol': 0.0001, 'transformer__num_pipe__kernel': PolynomialFeatures()} 	1 	0.818172  

{'predictor': RandomForestClassifier(criterion='entropy', min_samples_leaf=2, min_samples_split=4, n_estimators=170, n_jobs=-1, random_state=0), 'transformer__num_pipe__kernel': PolynomialFeatures(degree=1)} 	1 	0.841749 	0.917228  
{'predictor': SVC(random_state=0), 'transformer__num_pipe__kernel': None} 	2 	0.837242 	0.839786

{'predictor__max_features': 'log2', 'predictor__min_samples_leaf': 3, 'predictor__min_samples_split': 3, 'predictor__n_estimators': 100, 'transformer__num_pipe__kernel': None} 	1 	0.843990 	0.900114  
{'predictor__max_features': 'log2', 'predictor__min_samples_leaf': 3, 'predictor__min_samples_split': 5, 'predictor__n_estimators': 100, 'transformer__num_pipe__kernel': None} 	1 	0.843990 	0.900114  
{'predictor__max_features': 'log2', 'predictor__min_samples_leaf': 3, 'predictor__min_samples_split': 2, 'predictor__n_estimators': 100, 'transformer__num_pipe__kernel': None} 	1 	0.843990 	0.900114  
{'predictor__max_features': 'log2', 'predictor__min_samples_leaf': 3, 'predictor__min_samples_split': 4, 'predictor__n_estimators': 100, 'transformer__num_pipe__kernel': None} 	1 	0.843990 	0.900114  

##### Preparing predictions for export (CSV)

In [9]:
# Predictions of the best estimator found by the search, as a one-column
# frame indexed by passenger id.
# NOTE(review): the ids are hard-coded as 892..1309; presumably this mirrors
# the PassengerId column of test.csv - confirm against the data.
result = (
    pd.DataFrame(gs.predict(test))
    .rename(columns={0: "Survived"})
    .assign(PassengerId=list(range(892, 1310)))
    .set_index('PassengerId', drop=True)
)
#result.to_csv('latest_prediction.csv')