# Import

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier, Pool
import numpy as np
random_state = 42
np.random.seed(random_state)

In [2]:
df = pd.read_csv('data/train.csv')
df.columns = map(str.lower, df.columns.tolist())
df.sample(2)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S


# Feature engineering

In [3]:
name_sub_list = {
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer'
}
def name2title(row):
    kw = None
    for sub in name_sub_list:
        if sub.lower() in row['name'].lower():
            kw = sub
            break
    if kw is None:
        return np.nan
    if kw in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    if kw in ['Countess', 'Mme']:
        return 'Mrs'
    if kw in ['Mlle', 'Ms']:
        return 'Miss'
    if kw =='Dr':
        if row.sex =='Male':
            return 'Mr'
        else:
            return 'Mrs'
    return kw

def preproc(df):
    df['title'] = df.apply(name2title, axis=1)
    df['deck'] = df.cabin.apply(lambda x: 'hz' if x is np.nan else x[0])
    df['family_size'] = df.sibsp + df.parch
    df['fare_per_family'] = df.fare / df.family_size
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df['age'] = df.age.fillna(df.age.mean()) # mean
    df['cabin'] = df.cabin.fillna('hz') # unknown
    df['embarked'] = df.embarked.fillna(df.embarked.value_counts().index[0]) # moda
    df['fare_per_family'] = df.fare_per_family.fillna(df.fare_per_family.mean()) # mean
    df.isna().sum()
    return df

df = preproc(df)
df.isna().sum()

passengerid        0
survived           0
pclass             0
name               0
sex                0
age                0
sibsp              0
parch              0
ticket             0
fare               0
cabin              0
embarked           0
title              0
deck               0
family_size        0
fare_per_family    0
dtype: int64

In [4]:
target = 'survived'
cols_to_drop = ['passengerid', target] + ['name', 'ticket', 'cabin']
num_cols = df.drop(columns=cols_to_drop).select_dtypes(include='number').columns.tolist()
cat_cols = df.drop(columns=cols_to_drop).select_dtypes(exclude='number').columns.tolist()
features_order = cat_cols + num_cols
print(f'{num_cols=}')
print(f'{cat_cols=}')
assert len(features_order) == len(set(features_order))

num_cols=['pclass', 'age', 'sibsp', 'parch', 'fare', 'family_size', 'fare_per_family']
cat_cols=['sex', 'embarked', 'title', 'deck']


In [5]:
df[num_cols].sample(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,family_size,fare_per_family
384,3,29.699118,0,0,7.8958,0,31.85729
210,3,24.0,0,0,7.05,0,31.85729
486,1,35.0,1,0,90.0,1,90.0


In [6]:
df[cat_cols].sample(3)

Unnamed: 0,sex,embarked,title,deck
254,female,S,Mr,hz
884,male,S,Mr,hz
777,female,S,Miss,hz


# Train test split

In [7]:
X, y = df[features_order], df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 11), (179, 11), (712,), (179,))

In [9]:
col_proc = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
preproc_pipe = Pipeline(steps=[
    ('column_transform', col_proc),
])
preproc_pipe

In [10]:
df[cat_cols].nunique()

sex         2
embarked    3
title       4
deck        9
dtype: int64

In [11]:
X_train = preproc_pipe.fit_transform(X_train)
X_test = preproc_pipe.transform(X_test)
X_train.shape, X_test.shape

((712, 25), (179, 25))

In [12]:
%%time
cat_boost_clf = CatBoostClassifier(learning_rate=.0001)
cat_boost_clf.fit(X_train, y_train, verbose=1)

0:	learn: 0.6930926	total: 58ms	remaining: 58s
1:	learn: 0.6930181	total: 59.4ms	remaining: 29.6s
2:	learn: 0.6929536	total: 60.4ms	remaining: 20.1s
3:	learn: 0.6928888	total: 61.2ms	remaining: 15.2s
4:	learn: 0.6928299	total: 62ms	remaining: 12.3s
5:	learn: 0.6927585	total: 63ms	remaining: 10.4s
6:	learn: 0.6926846	total: 63.9ms	remaining: 9.06s
7:	learn: 0.6926207	total: 64.7ms	remaining: 8.02s
8:	learn: 0.6925555	total: 65.7ms	remaining: 7.24s
9:	learn: 0.6924904	total: 66.5ms	remaining: 6.59s
10:	learn: 0.6924177	total: 67.3ms	remaining: 6.05s
11:	learn: 0.6923580	total: 67.9ms	remaining: 5.59s
12:	learn: 0.6922887	total: 68.7ms	remaining: 5.22s
13:	learn: 0.6922185	total: 69.5ms	remaining: 4.89s
14:	learn: 0.6921557	total: 70.2ms	remaining: 4.61s
15:	learn: 0.6920872	total: 71ms	remaining: 4.37s
16:	learn: 0.6920482	total: 71.4ms	remaining: 4.13s
17:	learn: 0.6919780	total: 72.3ms	remaining: 3.94s
18:	learn: 0.6919090	total: 73.2ms	remaining: 3.78s
19:	learn: 0.6918460	total: 74.2

<catboost.core.CatBoostClassifier at 0x16be1f680>

In [34]:
clf = CatBoostClassifier()
params = {'iterations': [500, 1000],
          'depth': [4, 5, 6, 8],
          'loss_function': ['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10, 15],
          'eval_metric': ['Accuracy'],
          # 'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [random_state]
         }
scorer = make_scorer(accuracy_score)
clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=5)

In [35]:
%%time
clf_grid.fit(X_train, y_train)
best_param = clf_grid.best_params_
best_param

CPU times: user 7min 9s, sys: 8min 26s, total: 15min 36s
Wall time: 4min 5s


{'depth': 4,
 'eval_metric': 'Accuracy',
 'iterations': 500,
 'l2_leaf_reg': 1e-20,
 'leaf_estimation_iterations': 10,
 'logging_level': 'Silent',
 'loss_function': 'Logloss',
 'random_seed': 42}

In [36]:
cat_boost_clf = CatBoostClassifier(**best_param)
cat_boost_clf.fit(X_train, y_train, verbose=1)
print(classification_report(y_test, cat_boost_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       110
           1       0.74      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



In [44]:
submit_test = pd.read_csv('data/test.csv')
submit_test.columns = map(str.lower, submit_test.columns.tolist())
submit_test_proc = preproc(submit_test.copy())[features_order]
submit_test_proc = preproc_pipe.transform(submit_test_proc)
submit_test[target] = cat_boost_clf.predict(submit_test_proc)
submit_test.sample(2)

Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,0
259,1151,3,"Midtsjo, Mr. Karl Albert",male,21.0,0,0,345501,7.775,,S,0


In [45]:
submit_test.survived.value_counts()

survived
0    282
1    136
Name: count, dtype: int64

In [46]:
submit_test.rename(columns={'PassengerId'.lower(): 'PassengerId', 'Survived'.lower(): 'Survived'})[['PassengerId', 'Survived']].to_csv('data/submit_encoder_feature_eng_catboost_clf.csv', index=False)
!du -hs 'data/submit_encoder_feature_eng_catboost_clf.csv'

4.0K	data/submit_encoder_feature_eng_catboost_clf.csv
