In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_manual.csv', 'test_manual.csv')
)

In [3]:
# Partition the test-set columns by dtype in a single pass: a column that
# pandas does not report as numeric is treated as categorical, every other
# column as numeric. Column order is preserved within each list.
cat_features, num_features = [], []
for column in test_data:
    is_numeric = pd.api.types.is_numeric_dtype(test_data[column])
    (num_features if is_numeric else cat_features).append(column)

In [4]:
# Placeholder to substitute for missing values, keyed by column name:
# 'unknown' for categorical columns and -1 for numeric ones.
default_values = {
    **dict.fromkeys(cat_features, 'unknown'),
    **dict.fromkeys(num_features, -1),
}

In [5]:
# Add a total spending feature: the row-wise sum of the five amenity bills.
# DataFrame.sum skips NaN by default, so a missing bill contributes 0 to the
# total instead of making the whole total NaN.
SPENDING_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

test_data = test_data.assign(spending=test_data[SPENDING_COLUMNS].sum(axis=1))
train_data = train_data.assign(spending=train_data[SPENDING_COLUMNS].sum(axis=1))

# Guarded so that re-running this cell does not register the feature twice.
if 'spending' not in num_features:
    num_features.append('spending')

In [6]:
# Replace missing values with the per-column placeholders in default_values.
# Columns that have no entry in default_values are left as they are.
# The filled frames are rebound to the same names rather than mutated with
# inplace=True, so the frames produced by earlier cells are not modified.
test_data = test_data.fillna(default_values)
train_data = train_data.fillna(default_values)

In [7]:
# Separate the 'Transported' label from the remaining columns, then hold out
# 25% of the labelled rows for validation. The fixed random_state makes the
# split reproducible.
target_column = 'Transported'
features = train_data.drop(columns=target_column)
labels = train_data[target_column]
X_train, X_validation, y_train, y_validation = train_test_split(
    features,
    labels,
    train_size=0.75,
    random_state=42,
)

In [8]:
# Classifier configuration, collected in one place so it is easy to tweak:
# up to 1000 boosting iterations, Accuracy passed as a custom_loss metric,
# a fixed seed for reproducibility, and silent logging.
catboost_params = dict(
    iterations=1000,
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent',
)
model = CatBoostClassifier(**catboost_params)

In [9]:
# Fit on the training split, using the held-out split as the evaluation set.
# use_best_model=True asks CatBoost to use the evaluation set to choose which
# iteration's model to keep (see the CatBoost docs for the exact rule).
# plot=True renders the interactive training chart; the trailing ';'
# suppresses the repr of the returned model.
validation_set = (X_validation, y_validation)
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=validation_set,
    use_best_model=True,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [10]:
# Build the submission file: predict a label for every test row, attach the
# matching PassengerId (joined on the row index), and write the result to CSV
# with PassengerId as the index column.
predicted_labels = model.predict(test_data)
submission = pd.DataFrame(predicted_labels, columns=['Transported'])
submission = submission.join(test_data.PassengerId)
submission = submission.set_index('PassengerId', drop=True)
submission.to_csv('submission_catboost.csv')