In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib

import sklearn.pipeline
import sklearn.compose
import sklearn.preprocessing
import sklearn.feature_extraction.text

In [None]:
# Display the installed xgboost version.
xgb.__version__

In [None]:
# List the contents of the data directory.
! ls ../data

In [None]:
# List the files in the titanic dataset directory.
! ls ../data/titanic

In [None]:
# Load the Titanic training set into a DataFrame.
df = pd.read_csv("../data/titanic/train.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtypes of the loaded frame.
df.dtypes

## Preprocess

In [None]:
# Histogram of every numeric column; the trailing ';' keeps the notebook from
# printing the array-of-Axes repr above the figure.
df.hist();

In [None]:
# Replace missing Cabin entries with the literal string 'None' so the column
# contains no NaN values.
df['Cabin'] = df['Cabin'].fillna('None')

In [None]:
# Per-column feature pipeline. Each entry is (name, transformer, columns).
# No `remainder` is given, so columns not listed here (e.g. Survived) are
# dropped from the output (ColumnTransformer's documented default).
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        # FunctionTransformer() with no function is an identity pass-through.
        ('Pclass',   sklearn.preprocessing.FunctionTransformer(), ["Pclass"]),
        ('Sex',      sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'), ["Sex"]),
        ('Age',      sklearn.preprocessing.MinMaxScaler(), ["Age"]),
        ('SibSp',    sklearn.preprocessing.QuantileTransformer(), ["SibSp"]),
        ('Parch',    sklearn.preprocessing.FunctionTransformer(), ["Parch"]),
        # Normalizer rescales each *row* to unit norm, so on a single column it
        # turns every non-zero fare into 1.0 and erases the feature. Scale the
        # column itself instead, the same way Age is scaled.
        ('Fare',     sklearn.preprocessing.MinMaxScaler(), ["Fare"]),
        # Cabin values that were not seen during fit are encoded as 9999.
        ('Cabin',    sklearn.preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=9999), ["Cabin"]),
        ('Embarked', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'), ['Embarked']),
    ],
)
# Fit on the full training frame; `tr` is the fitted transformer used by the
# following cells.
tr = preprocessor.fit(df)

In [None]:
# Apply the fitted preprocessor to the same frame it was fit on.
preprocessed_df = tr.transform(df)
# Preview the first 10 transformed rows.
preprocessed_df[:10]

In [None]:
# Dimensions of the transformed data.
preprocessed_df.shape

## Fit

In [None]:
# XGBoost model configured with the binary:logistic objective and
# n_estimators=10.
clf = xgb.XGBModel(objective='binary:logistic', n_estimators=10)

# Train on the preprocessed features against the Survived column.
model = clf.fit(preprocessed_df, df['Survived'])

## Test

In [None]:
# Predict on the same preprocessed rows the model was fit on, so this measures
# training fit, not held-out performance.
# NOTE(review): with objective 'binary:logistic' these are presumably
# probabilities rather than 0/1 labels — confirm.
y_pred = clf.predict(preprocessed_df)

In [None]:
# Per-row difference between the true Survived value and the prediction.
res = df['Survived'] - y_pred
# Mean absolute difference over all rows.
res.abs().mean()

## Save Model

In [None]:
# Persist both artifacts: the preprocessor via joblib and the model via
# xgboost's own save_model.
# NOTE(review): assumes the ../data/models directory already exists — confirm.
joblib.dump(preprocessor, '../data/models/titanic_preprocessor.sklearn')
clf.save_model("../data/models/titanic.xgb")

In [None]:
# Test the loaded model: restore both saved artifacts from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
preprocessor2 = joblib.load('../data/models/titanic_preprocessor.sklearn')

# Build a model with the same settings used for training, then load the saved
# model file into it.
clf2 = xgb.XGBModel(objective='binary:logistic', n_estimators=10)
clf2.load_model("../data/models/titanic.xgb")

In [None]:
# Run the reloaded preprocessor and model end to end on the training frame.
y_pred2 = clf2.predict(preprocessor2.transform(df))

# Score the reloaded model's predictions (y_pred2). Using y_pred here would
# only re-score the in-memory model and never check the save/load round trip.
# The result should match the value computed in the Test section.
res = df['Survived'] - y_pred2
res.abs().mean()