In [1]:
# modules
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

from jcopml.tuning import random_search_params as rsp
from jcopml.tuning.space import Integer, Real

from Levenshtein import distance as lev

In [28]:
# df = pd.read_csv('../input/data/train.csv', index_col='PassengerId')
# df = pd.read_csv('../input/data/test.csv', index_col='PassengerId')
df = pd.read_csv('data/train.csv', index_col='PassengerId')

In [29]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
df.shape

(891, 11)

In [31]:
df['Title'] = df.Name.apply(lambda x: re.findall(' ([a-zA-Z]+)\.', x)[0])
df['Title'] = df.Title.replace(['Major', 'Sir', 'Jonkheer', 'Dr','Col','Don', 'Capt','Rev'], 'Mr')
df['Title'] = df.Title.replace(['Ms','Lady', 'Countess','Dona'], 'Mrs')
df['Title'] = df.Title.replace(['Mme','Mlle'], 'Miss')

In [32]:
df['Age'] = df.groupby(['Pclass', 'Title'])['Age'].apply(lambda x: x.fillna(x.mean()))

In [33]:
df['Embarked'] = df['Embarked'].fillna('S')

In [34]:
df['Cabin'] = df['Cabin'].fillna('N')
df['Cabin'] = df['Cabin'].apply(lambda x: x[0])
df['Cabin'] = df['Cabin'].replace('T', 'A')
df['Cabin'] = df['Cabin'].apply(lambda x: 0 if x in ['A', 'B', 'C', 'D','E', 'F', 'G'] else 1)

In [35]:
df.Cabin.unique()

array([1, 0], dtype=int64)

In [8]:
df.drop(columns=['Name', 'Ticket'], inplace=True)

# Dataset Splitting

In [9]:
X = df.drop(columns='Survived')
y = df.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 8), (179, 8), (712,), (179,))

# Preprocessor

In [26]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, ['SibSp', 'Parch', 'Fare']),
    ('categoric', categorical_pipeline, ['Pclass', 'Sex', 'SibSp', 'Embarked', 'Title'])
])

- TODO:
    - Tambahin function buat interfal value parameternya

# RF

In [30]:
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestClassifier(n_jobs=-1, random_state=42))
])

parameter = {
    'prep__numeric__poly__degree': Integer(low=2, high=5),
    'prep__numeric__poly__interaction_only': [True, False],
    'algo__n_estimators': Integer(low=100, high=100),
    'algo__max_depth': Integer(low=45, high=60),
    'algo__max_features': Real(low=0.5, high=0.6, prior='uniform'),
    'algo__min_samples_leaf': Integer(low=10, high=13)
}

In [33]:
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {model.score(X_train, y_train)}
Test score  : {model.score(X_test, y_test)}
''')

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    9.5s finished



Best params : {'algo__max_depth': 51, 'algo__max_features': 0.5648663826615089, 'algo__min_samples_leaf': 10, 'algo__n_estimators': 100, 'prep__numeric__poly__degree': 2, 'prep__numeric__poly__interaction_only': False}
Best score  : 0.8188195109267337
Train score : 0.8665730337078652
Test score  : 0.8435754189944135



In [21]:
# fourth submit
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {model.score(X_train, y_train)}
Test score  : {model.score(X_test, y_test)}
''')

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    9.2s finished



Best params : {'algo__max_depth': 48, 'algo__max_features': 0.5768062890209851, 'algo__min_samples_leaf': 10, 'algo__n_estimators': 100, 'prep__numeric__poly__degree': 3, 'prep__numeric__poly__interaction_only': True}
Best score  : 0.8216383599853444
Train score : 0.8525280898876404
Test score  : 0.8435754189944135



### XGBoosting

In [34]:
from xgboost import XGBClassifier

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])

parameter = {
    'prep__numeric__poly__degree': Integer(low=1, high=4),
    'prep__numeric__poly__interaction_only': [True, False],
    'algo__max_depth': Integer(low=1, high=10),
    'algo__learning_rate': Real(low=-1, high=1, prior='log-uniform'),
    'algo__n_estimators': Integer(low=150, high=200),
    'algo__subsample': Real(low=0.3, high=0.8, prior='uniform'),
    'algo__gamma': Integer(low=1, high=10),
    'algo__colsample_bytree': Real(low=0.1, high=1, prior='uniform'),
    'algo__reg_alpha': Real(low=0, high=0.3, prior='log-uniform'),
    'algo__reg_lambda': Real(low=-3, high=1, prior='log-uniform')
}

In [None]:
from xgboost import XGBClassifier

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])

parameter = {
    'prep__numeric__poly__degree': [2],
    'prep__numeric__poly__interaction_only': [False],
    'algo__colsample_bytree': [0.6336897488208131],
    'algo__gamma': [6],
    'algo__learning_rate': [1.261652738713559],
    'algo__max_depth': [6],
    'algo__n_estimators': [183],
    'algo__reg_alpha': [1.818124622936793],
    'algo__reg_lambda': [0.029104104586515068],
    'algo__subsample': [0.6113613002552669]
}

In [44]:
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=10, n_jobs=-1, verbose=5)
model.fit(X_train, y_train)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {model.score(X_train, y_train)}
Train score : {model.score(X_test, y_test)}
''')

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  22 out of  30 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.6s finished



Best params : {'algo__colsample_bytree': 0.6336897488208131, 'algo__gamma': 6, 'algo__learning_rate': 1.261652738713559, 'algo__max_depth': 6, 'algo__n_estimators': 183, 'algo__reg_alpha': 1.818124622936793, 'algo__reg_lambda': 0.029104104586515068, 'algo__subsample': 0.6113613002552669, 'prep__numeric__poly__degree': 2, 'prep__numeric__poly__interaction_only': False}
Best score  : 0.8174307697762649
Train score : 0.848314606741573
Train score : 0.8435754189944135



In [169]:
# third submit
XGb = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=5)
XGb.fit(X_train, y_train)

print(f'''
Best params : {XGb.best_params_}
Best score  : {XGb.best_score_}
Train score : {XGb.score(X_train, y_train)}
Train score : {XGb.score(X_test, y_test)}
''')

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 135 out of 150 | elapsed:    2.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.2s finished



Best params : {'algo__colsample_bytree': 0.5865413989861701, 'algo__gamma': 3, 'algo__learning_rate': 0.25178887430355257, 'algo__max_depth': 8, 'algo__n_estimators': 200, 'algo__reg_alpha': 1.198380423419617, 'algo__reg_lambda': 0.0048018718482369, 'algo__subsample': 0.49151551034421803, 'prep__numeric__poly__degree': 3, 'prep__numeric__poly__interaction_only': True}
Best score  : 0.8300653594771242
Train score : 0.8356741573033708
Train score : 0.8156424581005587



In [12]:
# second submit
%%time 
model = GridSearchCV(pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=5)
model.fit(X_train, y_train)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {model.score(X_train, y_train)}
Train score : {model.score(X_test, y_test)}
''')

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:   36.0s finished



Best params : {'algo__max_depth': 20, 'algo__max_features': 0.6, 'algo__min_samples_leaf': 10, 'algo__n_estimators': 200}
Best score  : 0.8230448297462445
Train score : 0.851123595505618
Train score : 0.8212290502793296

Wall time: 36.9 s


### CatBoosting

In [28]:
from catboost import CatBoostClassifier

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', CatBoostClassifier(iterations=1000,
                                random_strength=1,
                                subsample=0.76,
                                eval_metric='Accuracy',
                                loss_function= 'Logloss',
                                #bootstrap_type= 'Bernoulli',
                                bagging_temperature=0.80, 
#                                 cat_features=cats,
                                verbose=100
                               ))
    ])

parameter = {
    'prep__numeric__poly__degree': Integer(low=4, high=6),
    'prep__numeric__poly__interaction_only': [True, False],
    'algo__max_depth': Integer(low=1, high=10),
    'algo__learning_rate': Real(low=-2, high=0, prior='log-uniform'),
}

In [37]:
%%time
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=5, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {model.score(X_train, y_train)}
Test score  : {model.score(X_test, y_test)}
''')

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.6min finished


0:	learn: 0.8244382	total: 2.5ms	remaining: 2.49s
100:	learn: 0.9339888	total: 146ms	remaining: 1.3s
200:	learn: 0.9367978	total: 303ms	remaining: 1.2s
300:	learn: 0.9367978	total: 451ms	remaining: 1.05s
400:	learn: 0.9367978	total: 597ms	remaining: 892ms
500:	learn: 0.9367978	total: 762ms	remaining: 759ms
600:	learn: 0.9367978	total: 936ms	remaining: 621ms
700:	learn: 0.9367978	total: 1.1s	remaining: 469ms
800:	learn: 0.9367978	total: 1.24s	remaining: 307ms
900:	learn: 0.9367978	total: 1.38s	remaining: 151ms
999:	learn: 0.9367978	total: 1.52s	remaining: 0us

Best params : {'algo__learning_rate': 0.3702520371721318, 'algo__max_depth': 5, 'prep__numeric__poly__degree': 6, 'prep__numeric__poly__interaction_only': True}
Best score  : 0.8047961800754057
Train score : 0.9367977528089888
Test score  : 0.7932960893854749

Wall time: 1min 38s


### save model

In [54]:
import os, pickle

In [55]:
os.makedirs('model', exist_ok=True)
pickle.dump(model, open('model/titanic.pkl', 'wb'))

### submission

In [45]:
predict = pd.read_csv('data/test.csv', index_col='PassengerId')

In [46]:
predict['Title'] = predict.Name.apply(lambda x: re.findall(' ([a-zA-Z]+)\.', x)[0])
predict['Title'] = predict.Title.replace(['Major', 'Sir', 'Jonkheer', 'Dr','Col','Don', 'Capt','Rev'], 'Mr')
predict['Title'] = predict.Title.replace(['Ms','Lady', 'Countess','Dona'], 'Mrs')
predict['Title'] = predict.Title.replace(['Mme','Mlle'], 'Miss')

In [47]:
predict['Age'] = predict.groupby(['Pclass', 'Title'])['Age'].apply(lambda x: x.fillna(x.mean()))

In [48]:
predict['Embarked'] = predict['Embarked'].fillna('S')

In [49]:
predict['Fare'] = predict.groupby(['Pclass', 'Title'])['Fare'].apply(lambda x: x.fillna(x.mean()))

In [50]:
predict.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [51]:
submission = pd.read_csv('data/gender_submission.csv')
submission['Survived'] = model.predict(predict)

In [52]:
submission.to_csv('submission.csv', index=False)

In [53]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
