# Predicting Survivors on the Titanic
Accuracy is the evaluation metric

In [1]:
import numpy as np
import pandas as pd

import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Read data files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# Split training data
from sklearn.model_selection import train_test_split

X = df_train.drop(columns=['PassengerId', 'Survived'])
y = df_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Data Wrangling

### Check Feature Cardinality

In [6]:
df_train[['Name', 'Ticket', 'Cabin']].nunique()

Name      891
Ticket    681
Cabin     147
dtype: int64

Drop ***Name*** feature because it has all unique values and drop ***Cabin*** feature since it has too many missing values

In [7]:
features_to_drop = ['Name', 'Cabin']

### Imputing Missing Values and Categorical Feature Encoding

In [8]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.pipeline import Pipeline, make_pipeline

In [9]:
embarked_pipe = Pipeline([('simpleimputer', SimpleImputer(strategy='most_frequent')),
                          ('targetencoder', TargetEncoder())])

In [10]:
# Make preprocessing transformer with drop, imputer, and encoder steps
from sklearn.compose import ColumnTransformer, make_column_transformer

preprocessing_transformer = ColumnTransformer(
                                transformers=[
                                    ('dropcolumns', 'drop', features_to_drop),
                                    ('agefare', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
                                    ('ticket', TargetEncoder(), ['Sex', 'Ticket']),
                                    ('embarked', embarked_pipe, ['Embarked'])],
                                    remainder='passthrough')

## Train Model and Cross-Validate

In [15]:
from catboost import CatBoostClassifier

catboost_pipe = make_pipeline(preprocessing_transformer,
                              CatBoostClassifier(random_seed=42, verbose=False))

In [17]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(catboost_pipe, X_train, y_train, scoring='accuracy', cv=5)

# Calculate mean and standard deviation of scores
avg = cv_scores.mean()
stddev = cv_scores.std()

# Print results
print("Scores:", [round(score, 4) for score in cv_scores], '\n')
print(f"Mean: {avg:.4f}")
print(f"Std. Dev: {stddev:.4f}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Scores: [0.8112, 0.8322, 0.8028, 0.8169, 0.8521] 

Mean: 0.8230
Std. Dev: 0.0174
+/-2 std. dev. range within mean: (0.7882, 0.8579)


## Evaluate Performance on X_test

In [18]:
from sklearn.metrics import classification_report

catboost_pipe.fit(X_train, y_train)
y_pred = catboost_pipe.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.85       105
           1       0.83      0.72      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



## Feature Selection

In [19]:
# Use RFE to identify the most relevant features
from sklearn.feature_selection import RFE

In [20]:
catboost_clf = CatBoostClassifier(random_seed=42, verbose=False)

rfe_catboost_pipe = make_pipeline(preprocessing_transformer,
                              RFE(estimator=catboost_clf, n_features_to_select=4),
                              catboost_clf)

In [21]:
# Cross-validate with pipeline that has RFE
cv_scores = cross_val_score(rfe_catboost_pipe, X_train, y_train, scoring='accuracy', cv=5)

# Calculate mean and standard deviation of scores
avg = cv_scores.mean()
stddev = cv_scores.std()

# Print results
print("Scores:", [round(score, 4) for score in cv_scores], '\n')
print(f"Mean: {avg:.4f}")
print(f"Std. Dev: {stddev:.4f}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Scores: [0.8182, 0.8392, 0.8099, 0.8169, 0.831] 

Mean: 0.8230
Std. Dev: 0.0106
+/-2 std. dev. range within mean: (0.8019, 0.8441)


## Evaluate Performance with RFE on X_test

In [26]:
rfe_catboost_pipe.fit(X_train, y_train)
y_pred = rfe_catboost_pipe.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.70      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



## Hyperparameter Tuning

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [30]:
tuning_catboost_pipe = make_pipeline(preprocessing_transformer,
                              RFE(estimator=catboost_clf),
                              catboost_clf)

tuning_catboost_pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dropcolumns', 'drop',
                                                  ['Name', 'Cabin']),
                                                 ('agefare', SimpleImputer(),
                                                  ['Age', 'Fare']),
                                                 ('ticket', TargetEncoder(),
                                                  ['Sex', 'Ticket']),
                                                 ('embarked',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('targetencoder',
                                                                   TargetEncoder())]),
                                        

In [31]:
rs_params = {'rfe__n_features_to_select': uniform(0.0, 1.0),
            'catboostclassifier__l2_leaf_reg': uniform(0.001, 10),
            'catboostclassifier__learning_rate': uniform(0.01, 1.0),
            'catboostclassifier__depth': randint(1, 11),
            'catboostclassifier__max_leaves': randint(1, 100)}

In [32]:
catboost_rs = RandomizedSearchCV(tuning_catboost_pipe, param_distributions=rs_params, n_iter=100, 
                                 scoring='f1_macro', cv=5, n_jobs=-1, return_train_score=True)

catboost_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('dropcolumns',
                                                                               'drop',
                                                                               ['Name',
                                                                                'Cabin']),
                                                                              ('agefare',
                                                                               SimpleImputer(),
                                                                               ['Age',
                                                                                'Fare']),
                                                                              ('ticket',
       

In [33]:
avg = catboost_rs.best_score_
stddev = catboost_rs.cv_results_['std_test_score'][catboost_rs.best_index_]

print(f"Best Hyperparameters: {catboost_rs.best_params_}'\n'")
print(f"Best Mean Score: {avg:.4f}")
print(f"Best Mean Std. Dev.: {stddev:.4f}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Best Hyperparameters: {'catboostclassifier__depth': 6, 'catboostclassifier__l2_leaf_reg': 0.8727605161677455, 'catboostclassifier__learning_rate': 0.4633474498674284, 'catboostclassifier__max_leaves': 64, 'rfe__n_features_to_select': 0.9991170658241365}'
'
Best Mean Score: 0.7896
Best Mean Std. Dev.: 0.0226
+/-2 std. dev. range within mean: (0.7443, 0.8348)


## Retrain with Entire Training Set

In [35]:
catboost_rs.fit(X, y)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('dropcolumns',
                                                                               'drop',
                                                                               ['Name',
                                                                                'Cabin']),
                                                                              ('agefare',
                                                                               SimpleImputer(),
                                                                               ['Age',
                                                                                'Fare']),
                                                                              ('ticket',
       

## Kaggle Submission

In [36]:
submission_preds = catboost_rs.predict(df_test.drop(columns='PassengerId'))
pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': submission_preds}).to_csv('titanic_submission.csv', index=False)