In [None]:
!pip install catboost

# Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier, Pool
import numpy as np
random_state = 42
np.random.seed(random_state)

# Data

In [2]:
data = pd.read_csv('data/train.csv')
data.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S


# Cat num cols

In [3]:
target = 'Survived'
cols_to_drop = ['PassengerId', target]
num_cols = data.drop(columns=cols_to_drop).select_dtypes(include='number').columns.tolist()
cat_cols = [c for c in data.columns if c not in (cols_to_drop + num_cols)]
features_cols = cat_cols + num_cols
print(f'{num_cols=}')
print(f'{cat_cols=}')
assert len(features_cols) == len(set(features_cols))

num_cols=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_cols=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [4]:
data[num_cols].sample(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
384,3,,0,0,7.8958
210,3,24.0,0,0,7.05
486,1,35.0,1,0,90.0


In [5]:
data[cat_cols].sample(3)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
254,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,370129,,S
884,"Sutehall, Mr. Henry Jr",male,SOTON/OQ 392076,,S
777,"Emanuel, Miss. Virginia Ethel",female,364516,,S


# Cols proc

In [6]:
preproc_pipe = ColumnTransformer([
        ('cat_cols', SimpleImputer(strategy='most_frequent'), cat_cols),
        ('num_cols', SimpleImputer(strategy='mean'), num_cols),
    ])
preproc_pipe

# Train test split

In [7]:
X, y = data[features_cols], data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 10), (179, 10), (712,), (179,))

In [8]:
X_train = pd.DataFrame(preproc_pipe.fit_transform(X_train[features_cols]), columns=features_cols)
X_test = pd.DataFrame(preproc_pipe.transform(X_test[features_cols]), columns=features_cols)

# Catboost clf

In [16]:
cat_boost_clf = CatBoostClassifier(cat_features=cat_cols)
cat_boost_clf.fit(X_train, y_train)

Learning rate set to 0.008911
0:	learn: 0.6879122	total: 73ms	remaining: 1m 12s
1:	learn: 0.6822067	total: 76.1ms	remaining: 38s
2:	learn: 0.6774549	total: 77.8ms	remaining: 25.9s
3:	learn: 0.6722791	total: 79.7ms	remaining: 19.8s
4:	learn: 0.6676668	total: 82.3ms	remaining: 16.4s
5:	learn: 0.6622442	total: 85.3ms	remaining: 14.1s
6:	learn: 0.6575080	total: 87ms	remaining: 12.3s
7:	learn: 0.6531164	total: 88.1ms	remaining: 10.9s
8:	learn: 0.6475794	total: 89.7ms	remaining: 9.87s
9:	learn: 0.6422378	total: 91.9ms	remaining: 9.09s
10:	learn: 0.6394689	total: 93.3ms	remaining: 8.39s
11:	learn: 0.6355192	total: 95.1ms	remaining: 7.83s
12:	learn: 0.6304791	total: 96.4ms	remaining: 7.32s
13:	learn: 0.6262801	total: 99.2ms	remaining: 6.99s
14:	learn: 0.6219462	total: 101ms	remaining: 6.63s
15:	learn: 0.6179533	total: 103ms	remaining: 6.32s
16:	learn: 0.6135807	total: 104ms	remaining: 6.04s
17:	learn: 0.6091568	total: 107ms	remaining: 5.83s
18:	learn: 0.6057593	total: 108ms	remaining: 5.59s
19

<catboost.core.CatBoostClassifier at 0x16ba01fa0>

In [18]:
print(classification_report(y_test, cat_boost_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       110
           1       0.78      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.80      0.80      0.80       179



In [28]:
submit_test = pd.read_csv('data/test.csv')
submit_test_proc = pd.DataFrame(preproc_pipe.transform(submit_test[features_cols]), columns=features_cols)
submit_test[target] = cat_boost_clf.predict(submit_test_proc)
submit_test.sample(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
270,1162,1,"McCaffry, Mr. Thomas Francis",male,46.0,0,0,13050,75.2417,C6,C,0
87,979,3,"Badman, Miss. Emily Louisa",female,18.0,0,0,A/4 31416,8.05,,S,0


In [30]:
submit_test[['PassengerId', 'Survived']].to_csv('data/submit_simple_imputer_catboost_clf.csv', index=False)
!du -hs 'data/submit_simple_imputer_catboost_clf.csv'
# Score: 0.78229

4.0K	data/submit_simple_imputer_catboost_clf.csv
