In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)


from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropFeatures

from cabin_encoder import CabinEncoder

In [57]:
# Load the training and test tables from CSV files located in the current
# working directory. Only train_df is used by the cells visible below.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [58]:
# Separate the 'Survived' target column from the remaining predictor columns.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Hyperparameters unpacked into xgb.XGBClassifier(**xgb_params) when the
# pipeline is built. See the XGBoost parameter documentation for the exact
# meaning of each key.
xgb_params = dict(
    # tree shape / step size
    max_depth=9,
    min_child_weight=5,
    learning_rate=0.2,
    gamma=0.3,
    # row / column sampling ratios
    subsample=1.0,
    colsample_bytree=0.7,
    # regularization terms
    reg_alpha=1.0,
    reg_lambda=1.0,
)

In [82]:
# Full modelling pipeline: preprocessing transformers followed by an XGBoost
# classifier. Steps are applied in the order they are listed.
titanic_pipe = Pipeline(steps=[
    # Project-local transformer applied to the 'Cabin' column. Presumably it
    # produces the 'Cabin_Level' column consumed by later steps -- TODO confirm
    # against cabin_encoder.py.
    ('cabin_encoder', CabinEncoder(variables=['Cabin'])),

    # NOTE: an AddMissingIndicator step for 'Age' is currently disabled and is
    # not part of this pipeline.

    # Fill missing 'Age' values with the median.
    (
        'median_imputer',
        MeanMedianImputer(variables='Age', imputation_method='median'),
    ),

    # Treat missing 'Cabin_Level' values as their own category
    # (imputation_method='missing').
    (
        'missing_imputation',
        CategoricalImputer(variables='Cabin_Level', imputation_method='missing'),
    ),

    # Fill missing 'Embarked' values with the most frequent category.
    (
        'frequent_imputation',
        CategoricalImputer(variables='Embarked', imputation_method='frequent'),
    ),

    # Remove columns that are not passed on to the model.
    (
        'drop_features',
        DropFeatures(features_to_drop=['Cabin', 'Name', 'PassengerId', 'Ticket']),
    ),

    # One-hot encode the categorical columns.
    (
        'categorical_encoder',
        OneHotEncoder(variables=['Sex', 'Cabin_Level', 'Embarked']),
    ),

    # Final estimator, configured from xgb_params with a fixed seed.
    ('xgboost', xgb.XGBClassifier(random_state=42, **xgb_params)),
])



In [83]:
# Fit every pipeline step on the training split. The call is left as the
# cell's last expression so the fitted pipeline is displayed.
titanic_pipe.fit(X=X_train, y=y_train)

In [85]:
# Predict on the held-out validation split and report the fraction of
# predictions that match the true labels.
y_pred = titanic_pipe.predict(X=X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8268156424581006
