## Initial installs and imports


In [89]:
%%capture
%pip install tensorflow_decision_forests;
%pip install plotly
%pip install dtreeviz

In [90]:
import os
import shutil

import dtreeviz
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import polars as pl
import tensorflow_decision_forests as tfdf
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split

In [91]:
# Plotly template used as the default for every figure created below.
pio.templates.default = "simple_white"

# Locations of the competition CSV files read by the loading cells.
train_csv, test_csv = "/content/train.csv", "/content/test.csv"

In [92]:
def _read_lowercased(csv_path):
    """Read a CSV with polars, lower-case every column name, and prepend a row index column."""
    frame = pl.read_csv(csv_path)
    renamed = frame.select(pl.all().name.map(str.lower))
    return renamed.with_row_index()


train = _read_lowercased(train_csv)
test = _read_lowercased(test_csv)


# Tensorflow


We'll use the TensorFlow Decision Forests package to train a random forest classifier on this data. First we'll convert our training and validation dataframes into TensorFlow datasets, and then we'll fit the model on them.

In [93]:
# Columns that identify a row rather than describe a passenger. They must not be
# used as features: "index" is added by with_row_index() and does not exist in the
# test CSV, and "passengerid" / "name" are per-row identifiers.
NON_FEATURE_COLS = ["index", "passengerid", "name"]

# Columns whose missing values are replaced by 0 before training.
ZERO_FILL_COLS = [
    "vip",
    "cryosleep",
    "foodcourt",
    "shoppingmall",
    "spa",
    "vrdeck",
    "roomservice",
    "age",
]

# Boolean-like columns converted to 0/1 integers.
BINARY_COLS = ["transported", "vip", "cryosleep"]

# Share of rows held out for validation, and the seed that makes the split repeatable.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

train_pd = train.to_pandas().drop(columns=NON_FEATURE_COLS)

train_pd[ZERO_FILL_COLS] = train_pd[ZERO_FILL_COLS].fillna(value=0)
train_pd[BINARY_COLS] = train_pd[BINARY_COLS].astype(int)

# "cabin" is a "deck/num/side" string; split it into three separate features.
train_pd[["deck", "cabin_num", "side"]] = train_pd["cabin"].str.split("/", expand=True)
train_pd = train_pd.drop("cabin", axis=1)

# Remaining gaps: missing cabin numbers become "0", other text columns become "".
train_pd["cabin_num"] = train_pd["cabin_num"].fillna("0")
for col in train_pd.select_dtypes(include=["object"]).columns:
    train_pd[col] = train_pd[col].fillna("")

# train_test_split rejects test_size=0, so hold out a real validation share.
# The label column stays inside x_train / x_val because the dataset conversion
# below extracts it by name.
x_train, x_val, y_train, y_val = train_test_split(
    train_pd,
    train_pd["transported"],
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

# Convert dataframes to keras datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(x_train, label="transported")
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(x_val, label="transported")

In [94]:
# Sanity check on the cleaned training frame.
#
# The fills that used to live here repeated the cleaning already done before
# train_ds / valid_ds were built, so they could not influence the model, and the
# median imputation of "age" never fired because "age" had already been
# zero-filled. If median imputation is preferred, it has to happen before the
# datasets are created.
remaining_nulls = train_pd.isna().sum()
assert not remaining_nulls.any(), (
    f"Unexpected missing values in train_pd:\n{remaining_nulls[remaining_nulls > 0]}"
)

Now that we've initialized our data, let's use automated hyperparameter tuning to fit our ensemble to the data.

In [95]:
# Random-search tuner: 20 trials drawn from TF-DF's predefined hyperparameters.
tuner = tfdf.tuner.RandomSearch(use_predefined_hps=True, num_trials=20)

# Build the forest with the tuner attached and out-of-bag performance
# computation requested, then train it on the training dataset.
rf_options = {"tuner": tuner, "compute_oob_performances": True}
rf = tfdf.keras.RandomForestModel(**rf_options)
rf.fit(train_ds, verbose=2)

# Register the accuracy metric, then score the held-out validation dataset.
rf.compile(metrics=["accuracy"])
evaluation = rf.evaluate(valid_ds, return_dict=True)

Use /tmp/tmpch2zaac5 as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'homeplanet': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'cryosleep': <tf.Tensor 'data_1:0' shape=(None,) dtype=int64>, 'destination': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'age': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'vip': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'roomservice': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'foodcourt': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'shoppingmall': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'spa': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'vrdeck': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'deck': <tf.Tensor 'data_10:0' shape=(None,) dtype=string>, 'cabin_num': <tf.Tensor 'data_11:0' shape=(None,) dtype=string>, 'side': <tf.Tensor 'data_12:0' shape=(None,) dtype=string>}
Label: Tensor("data_13:0", shape=(None,), dtype=int64)
Weights: None

I0000 00:00:1727922209.465732     401 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1727922209.465779     401 kernel.cc:775] Collect training examples
I0000 00:00:1727922209.465800     401 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1727922209.465940     401 kernel.cc:394] Number of batches: 7
I0000 00:00:1727922209.465953     401 kernel.cc:395] Number of examples: 6954
I0000 00:00:1727922209.470392     401 data_spec_inference.cc:306] 1277 item(s) have been pruned (i.e. they are considered out of dictionary) for the column cabin_num (475 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
I0000 00:00:

Model trained in 1:03:49.728018
Compiling model...




Model compiled.


## Model interpretation

Now that we've built out our model, let's assess how it behaves and how it uses our features. We'll start with the training logs (accuracy and log-loss as trees are added), and then look at feature importances.

In [96]:
logs = rf.make_inspector().training_logs()

# Collect the tree count and the two evaluation metrics from every log entry.
trees, accuracy, log_loss = [], [], []
for log in logs:
    trees.append(log.num_trees)
    accuracy.append(log.evaluation.accuracy)
    log_loss.append(log.evaluation.loss)

# One panel per metric, both plotted against the number of trees.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Accuracy", "Log-loss"))
panels = (("Accuracy", accuracy), ("Log-loss", log_loss))
for col_idx, (label, values) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=trees, y=values, name=label), row=1, col=col_idx)

fig.update_xaxes(title="Number of trees")
fig.update_layout(title_text="RF Model Performance", height=650)

In [97]:
inspector = rf.make_inspector()
feature_importances = inspector.variable_importances()

# Each "SUM_SCORE" entry is indexed as (column spec, score), with the feature
# name as the first field of the spec. Sort the (name, score) pairs together:
# sorting the scores on their own would leave the names in their original order
# and attach every bar to the wrong feature.
ranked = sorted(
    (
        (str(spec[0]).split(" ")[0], score)
        for spec, score in feature_importances["SUM_SCORE"]
    ),
    key=lambda pair: pair[1],
)
feature_names = [name for name, _ in ranked]
importances = [score for _, score in ranked]

# Ascending order puts the most important feature at the top of the horizontal bars.
fig = px.bar(x=importances, y=feature_names, title="RF model gain importances")
fig.update_layout(showlegend=False, xaxis_title=None, yaxis_title=None)
fig.show()


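VIP, vrdeck, and destination are our most important features, with age, food court, and cryosleep being the least important by gain for the model. (Caveat: this ranking is read off the bar chart above, so it only holds if each bar's value is paired with its own feature name; re-check it whenever the plot is regenerated.) Let's take a look at the tree visualizations next.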

In [98]:
# Render the tree at index 0 of the trained forest, with max_depth set to 4.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=4)

Now that we've seen how the model performs on the training dataset, we can prepare data from the test set, predict the transported label, and create a csv file for submission.

In [99]:
test_pd = pd.read_csv(test_csv).rename(columns=str.lower)
# Keep the ids aside for the submission file before any cleaning touches the frame.
passenger_ids = test_pd["passengerid"]

# Apply the same cleaning the training frame received, so the model sees test
# rows encoded the same way as the rows it was trained on: zero-fill the
# boolean, spend and age columns, then cast the booleans to 0/1.
zero_fill_cols = [
    "vip",
    "cryosleep",
    "foodcourt",
    "shoppingmall",
    "spa",
    "vrdeck",
    "roomservice",
    "age",
]
test_pd[zero_fill_cols] = test_pd[zero_fill_cols].fillna(value=0)
test_pd[["vip", "cryosleep"]] = test_pd[["vip", "cryosleep"]].astype(int)

# "cabin" is a "deck/num/side" string; split it into three separate features.
test_pd[["deck", "cabin_num", "side"]] = test_pd["cabin"].str.split("/", expand=True)
test_pd = test_pd.drop("cabin", axis=1)

# Same gap handling as training: missing cabin numbers become "0", other text "".
test_pd["cabin_num"] = test_pd["cabin_num"].fillna("0")
for col in test_pd.select_dtypes(include=["object"]).columns:
    test_pd[col] = test_pd[col].fillna("")

test_pd_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_pd)

# Threshold the predicted scores at 0.5 to obtain boolean labels.
predictions = rf.predict(test_pd_ds)
boolean_predictions = predictions > 0.5

predictions_df = pd.DataFrame(
    {"PassengerId": passenger_ids, "Transported": boolean_predictions.squeeze()}
).set_index("PassengerId")

predictions_df.to_csv("tensorflow_rf_submissions.csv")





This model reached a test accuracy of 0.79284, which vastly outperforms our other supervised learning models as well as the unsupervised learning models we tried.  

In the next cell, we'll save the model and export it as a zip file.

In [100]:
# Colab-specific download helper, imported next to its only use.
from google.colab import files as colab_files

# 1. Save the trained model to a directory.
model_save_path = "/content/rf_model_10_1.tf"
rf.save(model_save_path)

# 2. Zip the saved model directory. The archive is written next to the model
#    ("/content/rf_model_10_1.zip") and keeps the model folder as its top-level
#    entry. Unlike a manual file-by-file walk, make_archive also records
#    directories, so empty sub-directories are not lost.
zip_path = shutil.make_archive(
    base_name=os.path.splitext(model_save_path)[0],
    format="zip",
    root_dir=os.path.dirname(model_save_path),
    base_dir=os.path.basename(model_save_path),
)

# 3. Offer the archive as a browser download.
colab_files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Conclusion

In this Kaggle competition, we tried a few different approaches based on the starting assumption that ensemble models would give us a leg up on predicting labels for the blind test set. After our initial analysis and data cleaning, classifiers were trained using the TPOT, LGBM, XGB, CatBoost, and AdaBoost packages, before moving to stacking and voting classifiers.

In none of these cases did our models perform to the level needed. From there we moved to KNN and Label Spreading classifiers before having more success with the TensorFlow Random Forest model.

This model reached our target accuracy, and is the final model used in this competition.

## Recommendations for further research

In future iterations of this project, a deeper dive into the use of other TensorFlow models or more manually calibrated hyperparameters might give us a little more performance on the accuracy front. Additionally, we could explore how to chain supervised and unsupervised classifiers in order to strengthen weaker classifiers, similar to how we did with a stacking classifier in this project.

Lastly, a different approach could be to restrict models to only model types that can be feasibly trained without GPU resources, but require the same level of accuracy for passing submissions.