# Spaceship Titanic: Tree Models
## Imports

In [1]:
import warnings
from pathlib import Path

import pandas as pd

In [2]:
# Suppress every FutureWarning for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

## Read data

In [3]:
# The data is expected in ../input/spaceship-titanic relative to the current working directory.
data_dir = Path.cwd().parents[0] / "input" / "spaceship-titanic"
# Stop right away if that location is absent.
assert data_dir.exists(), f"directory doesn't exist: {data_dir}"

In [4]:
# Load the training set from train.csv and preview its first ten rows.
df_train = pd.read_csv(filepath_or_buffer=data_dir / "train.csv")
df_train.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [5]:
# Load the test set from test.csv and preview its first ten rows.
df_test = pd.read_csv(filepath_or_buffer=data_dir / "test.csv")
df_test.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
5,0027_01,Earth,False,F/7/P,TRAPPIST-1e,31.0,False,0.0,1615.0,263.0,113.0,60.0,Karlen Ricks
6,0029_01,Europa,True,B/2/P,55 Cancri e,21.0,False,0.0,,0.0,0.0,0.0,Aldah Ainserfle
7,0032_01,Europa,True,D/0/S,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,Acrabi Pringry
8,0032_02,Europa,True,D/0/S,55 Cancri e,23.0,False,0.0,0.0,0.0,0.0,0.0,Dhena Pringry
9,0033_01,Earth,False,F/7/S,55 Cancri e,24.0,False,0.0,639.0,0.0,0.0,0.0,Eliana Delazarson


## Create features from `PassengerId`

In [6]:
# Group: the part of PassengerId that precedes the first underscore.
df_train["Group"] = df_train["PassengerId"].str.split("_").str[0]
df_test["Group"] = df_test["PassengerId"].str.split("_").str[0]

In [7]:
# GroupSize: the count of PassengerId values that share a row's Group,
# computed per frame and broadcast back onto every row of that group.
df_train = df_train.assign(GroupSize=df_train.groupby("Group")["PassengerId"].transform("count"))
df_test = df_test.assign(GroupSize=df_test.groupby("Group")["PassengerId"].transform("count"))

In [8]:
# Index both frames by PassengerId.
# verify_integrity=True asks pandas to check the new index for duplicates.
df_train = df_train.set_index(keys="PassengerId", verify_integrity=True)
df_test = df_test.set_index(keys="PassengerId", verify_integrity=True)

## Using the `Name` column

In [9]:
# Surname: the second space-separated token of Name
# (column 1 of the expanded split), added as a new column.
df_train = df_train.assign(Surname=lambda frame: frame["Name"].str.split(" ", expand=True)[1])
df_test = df_test.assign(Surname=lambda frame: frame["Name"].str.split(" ", expand=True)[1])

## Impute some missing values
Passengers who belong to the same group also come from the same home planet:

In [10]:
# Sanity check, run on each frame separately: once rows without a HomePlanet are
# dropped, every remaining Group has exactly one distinct HomePlanet.
assert df_train.dropna(subset=["HomePlanet"]).groupby("Group")["HomePlanet"].nunique().eq(1).all()
assert df_test.dropna(subset=["HomePlanet"]).groupby("Group")["HomePlanet"].nunique().eq(1).all()

Using group data to impute some missing `HomePlanet` values:

In [11]:
# Training data
# Step 1: the distinct (Group, HomePlanet) pairs among passengers whose group has
# more than one member and whose planet is known.
df_1 = df_train.loc[
    (df_train["GroupSize"] > 1) & ~df_train["HomePlanet"].isna(),
    ["Group", "HomePlanet"],
]
df_1 = df_1.drop_duplicates().reset_index(drop=True)

# Step 2: passengers of those groups whose planet is missing. The filter text refers
# to `df_1` by name through `@`, so that name has to exist when the query runs.
# `query` itself is deliberately left defined after this cell (it is not deleted below).
query = "GroupSize > 1 and Group in @df_1.Group and HomePlanet.isna()"
missing = df_train.query(query)[["Group"]].reset_index()

# Step 3: look up each of those passengers' group planet and write it back by PassengerId.
filled = missing.merge(df_1, on="Group").set_index("PassengerId")["HomePlanet"]
df_train.loc[filled.index, "HomePlanet"] = filled
del df_1, missing, filled

In [12]:
# Test data: fill missing HomePlanet values from other members of the same group.
# Step 1: the distinct (Group, HomePlanet) pairs among passengers whose group has
# more than one member and whose planet is known.
df_1 = (
    df_test.loc[df_test["GroupSize"].gt(1) & df_test["HomePlanet"].notna(), ["Group", "HomePlanet"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Step 2: passengers of those groups whose planet is missing. The filter is written out
# here instead of being read from a `query` variable left behind by an earlier cell, so
# this cell does not fail with a NameError when it is run again on its own.
# `@df_1` resolves to the frame built just above.
query = "GroupSize > 1 and Group in @df_1.Group and HomePlanet.isna()"
df_2 = df_test.query(query).loc[:, ["Group"]].reset_index()
# Step 3: attach each passenger's group planet and write it back by PassengerId.
df_3 = df_2.merge(df_1, on="Group").drop(columns="Group").set_index("PassengerId")
df_test.loc[df_3.index, "HomePlanet"] = df_3["HomePlanet"]
del df_1, df_2, df_3, query

Within each dataset, passengers who share a surname also come from the same home planet:

In [13]:
# Sanity check, run on each frame separately: among rows where both Surname and
# HomePlanet are present, every Surname has exactly one distinct HomePlanet.
assert df_train.dropna(subset=["Surname", "HomePlanet"]).groupby("Surname")["HomePlanet"].nunique().eq(1).all()
assert df_test.dropna(subset=["Surname", "HomePlanet"]).groupby("Surname")["HomePlanet"].nunique().eq(1).all()

Use `Surname` to fill more missing `HomePlanet` values:

In [14]:
# Training data
# Surname -> HomePlanet lookup, taken from rows where both values are present
# (the first planet seen for each surname).
df_sur_1 = (
    df_train.dropna(subset=["Surname", "HomePlanet"])
    .groupby("Surname")["HomePlanet"]
    .first()
    .reset_index()
)

# Passengers with a surname that appears in the lookup but with no planet recorded.
# The filter text refers to `df_sur_1` by name through `@`.
query = "Surname.notna() and Surname in @df_sur_1.Surname and HomePlanet.isna()"
missing = df_train.query(query)[["Surname"]].reset_index()

# Attach the looked-up planet and write it back by PassengerId.
filled = missing.merge(df_sur_1, on="Surname").set_index("PassengerId")["HomePlanet"]
df_train.loc[filled.index, "HomePlanet"] = filled

# Only the temporaries are removed; `df_sur_1` and `query` stay defined after this cell.
del missing, filled

In [15]:
# Test data

# The test set is filled from training-set surnames as well as its own.
# Build the test-only Surname -> HomePlanet lookup first ...
df_sur_2 = (
    df_test.dropna(subset=["Surname", "HomePlanet"]).groupby("Surname")["HomePlanet"].first().reset_index()
)
# ... then stack it under the training lookup, keeping only surnames the training
# lookup does not already contain (training entries come first and are kept as they are).
df_sur = pd.concat(
    [df_sur_1, df_sur_2[~df_sur_2["Surname"].isin(df_sur_1["Surname"])]],
    ignore_index=True,
)
del df_sur_1, df_sur_2

In [16]:
# Test data: fill missing HomePlanet values from the combined Surname -> HomePlanet lookup.
# The filter is written out in full rather than produced by text substitution on a
# `query` string defined in an earlier cell, so the cell states exactly what it selects.
# `@df_sur` resolves to the combined lookup frame.
query = "Surname.notna() and Surname in @df_sur.Surname and HomePlanet.isna()"
# Passengers with a surname that appears in the lookup but with no planet recorded.
df_1 = df_test.query(query).loc[:, ["Surname"]].reset_index()
# Attach the looked-up planet and write it back by PassengerId.
df_2 = df_1.merge(df_sur, on="Surname").drop(columns="Surname").set_index("PassengerId")
df_test.loc[df_2.index, "HomePlanet"] = df_2["HomePlanet"]
del df_1, df_2, df_sur, query