# Spaceship Titanic: Data Preparation
## Imports

In [1]:
import warnings
from pathlib import Path
from typing import cast

import numpy as np
import pandas as pd
from IPython.display import display
from pandas.testing import assert_frame_equal
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder, PowerTransformer

In [2]:
# Ignore every FutureWarning raised from this point on.
# NOTE(review): the filter is global, so deprecation notices from any library
# used below are hidden as well — consider narrowing it.
warnings.simplefilter(action="ignore", category=FutureWarning)

## Read data

In [3]:
# Input files are looked up in ../input/spaceship-titanic, relative to the
# current working directory; fail early when that directory is absent.
data_dir = Path.cwd().parent.joinpath("input", "spaceship-titanic")
assert data_dir.exists(), f"directory doesn't exist: {data_dir}"

In [4]:
# Training data: load the CSV and preview the first ten rows
df_train = pd.read_csv(data_dir.joinpath("train.csv"))
df_train.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [5]:
# Test data: load the CSV and preview the first ten rows
df_test = pd.read_csv(data_dir.joinpath("test.csv"))
df_test.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
5,0027_01,Earth,False,F/7/P,TRAPPIST-1e,31.0,False,0.0,1615.0,263.0,113.0,60.0,Karlen Ricks
6,0029_01,Europa,True,B/2/P,55 Cancri e,21.0,False,0.0,,0.0,0.0,0.0,Aldah Ainserfle
7,0032_01,Europa,True,D/0/S,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,Acrabi Pringry
8,0032_02,Europa,True,D/0/S,55 Cancri e,23.0,False,0.0,0.0,0.0,0.0,0.0,Dhena Pringry
9,0033_01,Earth,False,F/7/S,55 Cancri e,24.0,False,0.0,639.0,0.0,0.0,0.0,Eliana Delazarson


## New features from `PassengerId`

In [6]:
# Group: the part of PassengerId before the underscore, stored as a categorical
df_train["Group"] = df_train["PassengerId"].str.split("_").str[0].astype("category")
df_test["Group"] = df_test["PassengerId"].str.split("_").str[0].astype("category")

In [7]:
# Alone and CompanionCount

# Training data
# Group is categorical, so observed=True is passed explicitly (as the other
# groupby calls in this notebook do) instead of relying on the default.
# transform("count") yields the size of each passenger's group aligned to the
# rows of the frame, so no temporary GroupSize column has to be joined and
# dropped again.
group_size = df_train.groupby("Group", observed=True).PassengerId.transform("count")
df_train = df_train.assign(
    Alone=group_size.eq(1),  # passenger is the only member of the group
    CompanionCount=group_size.sub(1),  # other members of the same group
)
del group_size

In [8]:
# Test data
# Group is categorical, so observed=True is passed explicitly (as the other
# groupby calls in this notebook do) instead of relying on the default.
# transform("count") yields the size of each passenger's group aligned to the
# rows of the frame, so no temporary GroupSize column has to be joined and
# dropped again.
group_size = df_test.groupby("Group", observed=True).PassengerId.transform("count")
df_test = df_test.assign(
    Alone=group_size.eq(1),  # passenger is the only member of the group
    CompanionCount=group_size.sub(1),  # other members of the same group
)
del group_size

In [9]:
# Set indexes: PassengerId becomes the index of both frames;
# verify_integrity makes set_index raise on duplicated ids.
df_train, df_test = (
    frame.set_index("PassengerId", verify_integrity=True) for frame in (df_train, df_test)
)

In [10]:
# Combine infrequent values of CompanionCount: counts above 2 become "3+",
# all other counts are kept in their string form.
# The whole column is converted at once rather than calling np.where on one
# scalar at a time through Series.transform.
df_train = df_train.assign(
    CompCntReduced=df_train.CompanionCount.astype(str).mask(df_train.CompanionCount > 2, "3+")
).drop(columns="CompanionCount")

df_test = df_test.assign(
    CompCntReduced=df_test.CompanionCount.astype(str).mask(df_test.CompanionCount > 2, "3+")
).drop(columns="CompanionCount")

## Impute some missing values of `HomePlanet`

In [11]:
# Passengers who belong to the same group also come from the same home planet
# (checked on the test data, using only rows with a known HomePlanet)
planets_per_group = (
    df_test.dropna(subset=["HomePlanet"])
    .groupby("Group", observed=True)["HomePlanet"]
    .nunique()
)
(planets_per_group == 1).all()

True

In [12]:
# Number of missing values BEFORE
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.HomePlanet.isna().sum()}")

Training data: 201
Test data: 87


In [13]:
# Training data
# Learn the HomePlanet of each multi-passenger group from the members where it
# is known, then copy it to members of the same group where it is missing.
in_group = ~df_train.Alone
planet_known = df_train.HomePlanet.notna()

group_planets = (
    df_train.loc[in_group & planet_known, ["Group", "HomePlanet"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
to_fill = df_train.loc[
    in_group & ~planet_known & df_train.Group.isin(group_planets.Group), ["Group"]
].reset_index(drop=False)

# Back to a PassengerId index so the values line up with df_train
filled = to_fill.merge(group_planets, on="Group").drop(columns="Group").set_index("PassengerId")
df_train.loc[filled.index, "HomePlanet"] = filled.HomePlanet

del in_group, planet_known, group_planets, to_fill, filled

In [14]:
# Test data
# Learn the HomePlanet of each multi-passenger group from the members where it
# is known, then copy it to members of the same group where it is missing.
in_group = ~df_test.Alone
planet_known = df_test.HomePlanet.notna()

group_planets = (
    df_test.loc[in_group & planet_known, ["Group", "HomePlanet"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
to_fill = df_test.loc[
    in_group & ~planet_known & df_test.Group.isin(group_planets.Group), ["Group"]
].reset_index(drop=False)

# Back to a PassengerId index so the values line up with df_test
filled = to_fill.merge(group_planets, on="Group").drop(columns="Group").set_index("PassengerId")
df_test.loc[filled.index, "HomePlanet"] = filled.HomePlanet

del in_group, planet_known, group_planets, to_fill, filled

In [15]:
# Number of missing values AFTER
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.HomePlanet.isna().sum()}")

Training data: 111
Test data: 46


## Using the `Name` column

In [16]:
# Add Surname column: the second space-separated token of Name; Name itself
# is dropped afterwards.
df_train["Surname"] = df_train.Name.str.split(" ", expand=True).iloc[:, 1]
df_train = df_train.drop(columns="Name")

df_test["Surname"] = df_test.Name.str.split(" ", expand=True).iloc[:, 1]
df_test = df_test.drop(columns="Name")

In [17]:
# Passengers with the same surname are from the same planet
# (checked on the test data, using rows where both values are known)
planets_per_surname = (
    df_test.dropna(subset=["Surname", "HomePlanet"]).groupby("Surname")["HomePlanet"].nunique()
)
(planets_per_surname == 1).all()

True

In [18]:
# Use Surname to fill more missing HomePlanet values

# Number of missing values BEFORE
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.HomePlanet.isna().sum()}")

Training data: 111
Test data: 46


In [19]:
# Training data
# One row per surname with the first HomePlanet seen for it; rows missing
# either value are ignored.
df_sur_1 = (
    df_train[["Surname", "HomePlanet"]]
    .dropna()
    .groupby("Surname")["HomePlanet"]
    .first()
    .reset_index()
)

In [20]:
# Rows with a missing HomePlanet whose surname appears in the lookup table
needs_planet = (
    df_train.HomePlanet.isna() & df_train.Surname.notna() & df_train.Surname.isin(df_sur_1.Surname)
)
lookup = df_train.loc[needs_planet, ["Surname"]].reset_index(drop=False)

# Attach the planet for each surname and go back to a PassengerId index
imputed = lookup.merge(df_sur_1, on="Surname").drop(columns="Surname").set_index("PassengerId")
df_train.loc[imputed.index, "HomePlanet"] = imputed.HomePlanet

del needs_planet, lookup, imputed

In [21]:
# Test data
# One row per surname with the first HomePlanet seen for it; rows missing
# either value are ignored.
df_sur_2 = (
    df_test[["Surname", "HomePlanet"]]
    .dropna()
    .groupby("Surname")["HomePlanet"]
    .first()
    .reset_index()
)

In [22]:
# Consistency check: surnames present in both lookup tables must map to the
# same HomePlanet in each of them.
shared_train = df_sur_1[df_sur_1.Surname.isin(df_sur_2.Surname)].sort_values("Surname").reset_index(drop=True)
shared_test = df_sur_2[df_sur_2.Surname.isin(df_sur_1.Surname)].sort_values("Surname").reset_index(drop=True)
assert_frame_equal(shared_train, shared_test)
del shared_train, shared_test

In [23]:
# To fix test data, I'll also use some training data. Combine all relevant data:
# every training surname plus the test surnames not already covered.
test_only = df_sur_2[~df_sur_2.Surname.isin(df_sur_1.Surname)]
df_sur = pd.concat([df_sur_1, test_only], ignore_index=True)
del df_sur_1, df_sur_2, test_only

# Each surname must appear exactly once in the combined table
assert df_sur.Surname.nunique() == len(df_sur)

In [24]:
# Rows with a missing HomePlanet whose surname appears in the combined lookup
needs_planet = (
    df_test.HomePlanet.isna() & df_test.Surname.notna() & df_test.Surname.isin(df_sur.Surname)
)
lookup = df_test.loc[needs_planet, ["Surname"]].reset_index(drop=False)

# Attach the planet for each surname and go back to a PassengerId index
imputed = lookup.merge(df_sur, on="Surname").drop(columns="Surname").set_index("PassengerId")
df_test.loc[imputed.index, "HomePlanet"] = imputed.HomePlanet

del needs_planet, lookup, imputed, df_sur

In [25]:
# Number of missing values AFTER
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.HomePlanet.isna().sum()}")

Training data: 12
Test data: 5


## Impute some missing values of `VIP`

In [26]:
# No VIP passenger is from Earth
# (checked on the training data, using rows with a known HomePlanet)
is_vip = df_train.VIP.notna() & df_train.VIP.eq(True)
vip_planets = df_train.loc[is_vip & df_train.HomePlanet.notna(), "HomePlanet"]
(vip_planets != "Earth").all()

True

In [27]:
# Use HomePlanet to fill some missing VIP values

# Number of missing values BEFORE
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.VIP.isna().sum()}")

Training data: 203
Test data: 93


In [28]:
# Passengers from Earth with an unknown VIP status are marked as non-VIP
for frame in (df_train, df_test):
    earth_vip_unknown = frame.VIP.isna() & frame.HomePlanet.notna() & frame.HomePlanet.eq("Earth")
    frame.loc[earth_vip_unknown, "VIP"] = False
del frame, earth_vip_unknown

In [29]:
# Number of missing values AFTER
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.VIP.isna().sum()}")

Training data: 86
Test data: 49


## Encode `HomePlanet` and `Destination`

In [30]:
# Convert HomePlanet to ordinal integers: the encoder is fitted on the
# training data only, and its learned categories are displayed.
enc = OrdinalEncoder()
enc.fit(df_train[["HomePlanet"]])
enc.categories_

[array(['Earth', 'Europa', 'Mars', nan], dtype=object)]

In [31]:
# Encode both frames with the encoder fitted on the training data;
# transform returns one column, which is flattened to 1-D.
df_train["HomePlanetOrd"] = enc.transform(df_train[["HomePlanet"]]).ravel()
df_test["HomePlanetOrd"] = enc.transform(df_test[["HomePlanet"]]).ravel()
del enc

In [32]:
# Show HomePlanet next to its ordinal code for three hand-picked passengers
df_train.loc[["0002_01", "0003_01", "0009_01"], ["HomePlanet", "HomePlanetOrd"]]

Unnamed: 0_level_0,HomePlanet,HomePlanetOrd
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
0002_01,Earth,0.0
0003_01,Europa,1.0
0009_01,Mars,2.0


In [33]:
# Consistency checks: HomePlanetOrd must be missing exactly on the rows where
# HomePlanet is missing.
assert (df_train.HomePlanetOrd.isna() == df_train.HomePlanet.isna()).all()

In [34]:
# Same check for the test data: HomePlanetOrd is missing exactly where
# HomePlanet is missing.
assert (df_test.HomePlanetOrd.isna() == df_test.HomePlanet.isna()).all()

In [35]:
# Convert Destination to ordinal integers: the encoder is fitted on the
# training data only, and its learned categories are displayed.
enc = OrdinalEncoder()
enc.fit(df_train[["Destination"]])
enc.categories_

[array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', nan], dtype=object)]

In [36]:
# Encode both frames with the encoder fitted on the training data;
# transform returns one column, which is flattened to 1-D.
df_train["DestinationOrd"] = enc.transform(df_train[["Destination"]]).ravel()
df_test["DestinationOrd"] = enc.transform(df_test[["Destination"]]).ravel()
del enc

In [37]:
# Show Destination next to its ordinal code for three hand-picked passengers
df_train.loc[["0008_01", "0005_01", "0001_01"], ["Destination", "DestinationOrd"]]

Unnamed: 0_level_0,Destination,DestinationOrd
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
0008_01,55 Cancri e,0.0
0005_01,PSO J318.5-22,1.0
0001_01,TRAPPIST-1e,2.0


In [38]:
# Consistency checks: DestinationOrd must be missing exactly on the rows where
# Destination is missing.
assert (df_train.DestinationOrd.isna() == df_train.Destination.isna()).all()

In [39]:
# Same check for the test data: DestinationOrd is missing exactly where
# Destination is missing.
assert (df_test.DestinationOrd.isna() == df_test.Destination.isna()).all()

## More simple data imputation

In [40]:
# The "money features" are dominated by zeros. Then it's reasonable to fill all
# of their missing values with zero.
cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Number of missing values BEFORE
for label, frame in (("Training data:", df_train), ("Test data:", df_test)):
    print(label)
    display(frame[cols].isna().sum())

Training data:


RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
dtype: int64

Test data:


RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [41]:
# Replace missing values with 0.0 in the money columns only
zero_fill = dict.fromkeys(cols, 0.0)
df_train = df_train.fillna(zero_fill)
df_test = df_test.fillna(zero_fill)
del zero_fill

In [42]:
# TotalSpent: row-wise sum of the money columns
df_train["TotalSpent"] = df_train[cols].sum(axis=1)
df_test["TotalSpent"] = df_test[cols].sum(axis=1)

In [43]:
# Fill some missing CryoSleep values based on TotalSpent

# Number of missing values BEFORE
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.CryoSleep.isna().sum()}")

Training data: 217
Test data: 93


In [44]:
# A passenger with unknown CryoSleep who spent money is marked as not asleep
for frame in (df_train, df_test):
    spent_but_unknown = frame.CryoSleep.isna() & (frame.TotalSpent > 0.0)
    frame.loc[spent_but_unknown, "CryoSleep"] = False
del frame, spent_but_unknown

In [45]:
# Number of missing values AFTER
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.CryoSleep.isna().sum()}")

Training data: 98
Test data: 38


In [46]:
# Passengers who were in cryo sleep spent NO MONEY
for frame in (df_train, df_test):
    asleep = frame.CryoSleep.notna() & frame.CryoSleep.eq(True)
    assert frame.loc[asleep, cols].eq(0.0).all(axis=None)
del frame, asleep

## New features from "money variables"

In [47]:
# Original money variables will be replaced with binary features. They indicate
# when the original variables were strictly positive. Each flag is named after
# its source column with a "Pos" prefix.
df_train = df_train.join((df_train[cols] > 0.0).add_prefix("Pos")).drop(columns=cols)
df_test = df_test.join((df_test[cols] > 0.0).add_prefix("Pos")).drop(columns=cols)
del cols

In [48]:
# Power transformation of TotalSpent: fitted on the training data only; the
# fitted lambda of the single input column is displayed.
transformer = PowerTransformer()
transformer.fit(df_train[["TotalSpent"]])
transformer.lambdas_[0]

0.06549240721181425

In [49]:
# Replace TotalSpent with its power-transformed version in both frames,
# using the transformer fitted on the training data.
df_train["PTTotalSpent"] = transformer.transform(df_train[["TotalSpent"]]).ravel()
df_train = df_train.drop(columns="TotalSpent")

df_test["PTTotalSpent"] = transformer.transform(df_test[["TotalSpent"]]).ravel()
df_test = df_test.drop(columns="TotalSpent")

del transformer

## New features from `Cabin`

In [50]:
# CabinDeck, CabinNum and CabinSide: the three "/"-separated parts of Cabin,
# which is dropped once it has been split.
cabin_parts = {0: "CabinDeck", 1: "CabinNum", 2: "CabinSide"}

df_train = df_train.drop(columns="Cabin").join(
    df_train.Cabin.str.split("/", expand=True).rename(columns=cabin_parts)
)
df_test = df_test.drop(columns="Cabin").join(
    df_test.Cabin.str.split("/", expand=True).rename(columns=cabin_parts)
)
del cabin_parts

In [51]:
# CabinDeck: Combine three categories into one
merged_decks = ["D", "A", "T"]
for frame in (df_train, df_test):
    frame.loc[frame.CabinDeck.isin(merged_decks) & frame.CabinDeck.notna(), "CabinDeck"] = "Other"
del frame, merged_decks

In [52]:
# Convert to ordinal integers: the encoder is fitted on the training data
# only, and its learned categories are displayed.
enc = OrdinalEncoder()
enc.fit(df_train[["CabinDeck"]])
enc.categories_

[array(['B', 'C', 'E', 'F', 'G', 'Other', nan], dtype=object)]

In [53]:
# Encode both frames with the encoder fitted on the training data;
# transform returns one column, which is flattened to 1-D.
df_train["CabinDeckOrd"] = enc.transform(df_train[["CabinDeck"]]).ravel()
df_test["CabinDeckOrd"] = enc.transform(df_test[["CabinDeck"]]).ravel()
del enc

In [54]:
# Show CabinDeck next to its ordinal code for six hand-picked passengers
df_train.loc[
    ["0001_01", "0024_01", "0020_01", "0002_01", "0006_02", "0003_01"],
    ["CabinDeck", "CabinDeckOrd"],
]

Unnamed: 0_level_0,CabinDeck,CabinDeckOrd
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
0001_01,B,0.0
0024_01,C,1.0
0020_01,E,2.0
0002_01,F,3.0
0006_02,G,4.0
0003_01,Other,5.0


In [55]:
# Consistency checks: CabinDeckOrd must be missing exactly on the rows where
# CabinDeck is missing.
assert (df_train.CabinDeckOrd.isna() == df_train.CabinDeck.isna()).all()

In [56]:
# Same check for the test data: CabinDeckOrd is missing exactly where
# CabinDeck is missing.
assert (df_test.CabinDeckOrd.isna() == df_test.CabinDeck.isna()).all()

In [57]:
# The string CabinDeck column is no longer needed once CabinDeckOrd exists
df_train, df_test = (frame.drop(columns="CabinDeck") for frame in (df_train, df_test))

In [58]:
# Fill some missing CabinSide values using group data
# Passengers that belong to the same group were on the same side of the spaceship

# Number of missing values BEFORE
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.CabinSide.isna().sum()}")

Training data: 199
Test data: 100


In [59]:
# Training data
# Take the first known CabinSide of each multi-passenger group and copy it to
# members of the same group where CabinSide is missing.
in_group = ~df_train.Alone
side_known = df_train.CabinSide.notna()

group_sides = (
    df_train.loc[in_group & side_known, ["CabinSide", "Group"]]
    .groupby("Group", observed=True)["CabinSide"]
    .first()
    .reset_index()
)
to_fill = df_train.loc[in_group & ~side_known, ["Group"]].reset_index(drop=False)

# Back to a PassengerId index so the values line up with df_train
filled = to_fill.merge(group_sides, on="Group").drop(columns="Group").set_index("PassengerId")
df_train.loc[filled.index, "CabinSide"] = filled.CabinSide

del in_group, side_known, group_sides, to_fill, filled

In [60]:
# Test data
# Take the first known CabinSide of each multi-passenger group and copy it to
# members of the same group where CabinSide is missing.
in_group = ~df_test.Alone
side_known = df_test.CabinSide.notna()

group_sides = (
    df_test.loc[in_group & side_known, ["CabinSide", "Group"]]
    .groupby("Group", observed=True)["CabinSide"]
    .first()
    .reset_index()
)
to_fill = df_test.loc[in_group & ~side_known, ["Group"]].reset_index(drop=False)

# Back to a PassengerId index so the values line up with df_test
filled = to_fill.merge(group_sides, on="Group").drop(columns="Group").set_index("PassengerId")
df_test.loc[filled.index, "CabinSide"] = filled.CabinSide

del in_group, side_known, group_sides, to_fill, filled

In [61]:
# Number of missing values AFTER
for label, frame in (("Training data", df_train), ("Test data", df_test)):
    print(f"{label}: {frame.CabinSide.isna().sum()}")

Training data: 99
Test data: 63


In [62]:
# Convert CabinSide to a boolean feature: True when the side is "P", False for
# any other known side, and missing where CabinSide is missing.
# The column is built in a single expression so that no boolean is written
# into a float (NaN-initialised) column — pandas deprecated that kind of
# silent upcast, and the FutureWarning filter at the top of the notebook
# would hide the warning.
df_train["CabinPort"] = df_train.CabinSide.eq("P").where(df_train.CabinSide.notna())
df_train = df_train.drop(columns="CabinSide")

df_test["CabinPort"] = df_test.CabinSide.eq("P").where(df_test.CabinSide.notna())
df_test = df_test.drop(columns="CabinSide")

## Discretize `Age`

In [63]:
# Discretize using quantiles and 4 bins: fitted on the known training ages
# only; the resulting bin edges are displayed.
discretizer = KBinsDiscretizer(n_bins=4, strategy="quantile", encode="ordinal", random_state=333)
discretizer.fit(df_train[["Age"]].dropna())
discretizer.bin_edges_

array([array([ 0., 19., 27., 38., 79.])], dtype=object)

In [64]:
# Bin the known ages of both frames with the 4-bin discretizer; rows with a
# missing Age keep NaN in the new column.
for frame in (df_train, df_test):
    has_age = frame.Age.notna()
    frame["DiscretizedAge4"] = np.nan
    frame.loc[has_age, "DiscretizedAge4"] = discretizer.transform(frame.loc[has_age, ["Age"]])

del frame, has_age, discretizer

In [65]:
# Discretize using quantiles and 5 bins: fitted on the known training ages
# only; the resulting bin edges are displayed.
discretizer = KBinsDiscretizer(n_bins=5, strategy="quantile", encode="ordinal", random_state=333)
discretizer.fit(df_train[["Age"]].dropna())
discretizer.bin_edges_

array([array([ 0., 18., 24., 31., 41., 79.])], dtype=object)

In [66]:
# Bin the known ages of both frames with the 5-bin discretizer; rows with a
# missing Age keep NaN in the new column.
for frame in (df_train, df_test):
    has_age = frame.Age.notna()
    frame["DiscretizedAge5"] = np.nan
    frame.loc[has_age, "DiscretizedAge5"] = discretizer.transform(frame.loc[has_age, ["Age"]])

del frame, has_age, discretizer

In [67]:
# The raw Age column is replaced by its two discretized versions
df_train, df_test = (frame.drop(columns="Age") for frame in (df_train, df_test))

## Organize DataFrames and Save

In [68]:
# Remove the intermediate columns from both frames
cols_rm = ["CabinNum", "Destination", "Group", "HomePlanet", "Surname"]
df_train, df_test = (frame.drop(columns=cols_rm) for frame in (df_train, df_test))
del cols_rm

In [69]:
# Final column order of the features shared by both frames
feature_cols = [
    "Alone",
    "CompCntReduced",
    "HomePlanetOrd",
    "CryoSleep",
    "CabinDeckOrd",
    "CabinPort",
    "DestinationOrd",
    "DiscretizedAge4",
    "DiscretizedAge5",
    "VIP",
    "PosRoomService",
    "PosFoodCourt",
    "PosShoppingMall",
    "PosSpa",
    "PosVRDeck",
    "PTTotalSpent",
]
# The training frame additionally keeps Transported as its last column
df_train = df_train[feature_cols + ["Transported"]]
df_test = df_test[feature_cols]
del feature_cols

In [70]:
# typing.cast has no runtime effect; these lines only tell static type
# checkers that both names refer to DataFrames.
df_train = cast(pd.DataFrame, df_train)
df_test = cast(pd.DataFrame, df_test)

In [71]:
# Dtypes and non-null counts of the prepared training data
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Alone            8693 non-null   bool   
 1   CompCntReduced   8693 non-null   object 
 2   HomePlanetOrd    8681 non-null   float64
 3   CryoSleep        8595 non-null   object 
 4   CabinDeckOrd     8494 non-null   float64
 5   CabinPort        8594 non-null   object 
 6   DestinationOrd   8511 non-null   float64
 7   DiscretizedAge4  8514 non-null   float64
 8   DiscretizedAge5  8514 non-null   float64
 9   VIP              8607 non-null   object 
 10  PosRoomService   8693 non-null   bool   
 11  PosFoodCourt     8693 non-null   bool   
 12  PosShoppingMall  8693 non-null   bool   
 13  PosSpa           8693 non-null   bool   
 14  PosVRDeck        8693 non-null   bool   
 15  PTTotalSpent     8693 non-null   float64
 16  Transported      8693 non-null   bool   
dtypes: bool(7)

In [72]:
# Remaining missing values per column in the prepared training data
df_train.isna().sum()

Alone                0
CompCntReduced       0
HomePlanetOrd       12
CryoSleep           98
CabinDeckOrd       199
CabinPort           99
DestinationOrd     182
DiscretizedAge4    179
DiscretizedAge5    179
VIP                 86
PosRoomService       0
PosFoodCourt         0
PosShoppingMall      0
PosSpa               0
PosVRDeck            0
PTTotalSpent         0
Transported          0
dtype: int64

In [73]:
# Dtypes and non-null counts of the prepared test data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4277 entries, 0013_01 to 9277_01
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Alone            4277 non-null   bool   
 1   CompCntReduced   4277 non-null   object 
 2   HomePlanetOrd    4272 non-null   float64
 3   CryoSleep        4239 non-null   object 
 4   CabinDeckOrd     4177 non-null   float64
 5   CabinPort        4214 non-null   object 
 6   DestinationOrd   4185 non-null   float64
 7   DiscretizedAge4  4186 non-null   float64
 8   DiscretizedAge5  4186 non-null   float64
 9   VIP              4228 non-null   object 
 10  PosRoomService   4277 non-null   bool   
 11  PosFoodCourt     4277 non-null   bool   
 12  PosShoppingMall  4277 non-null   bool   
 13  PosSpa           4277 non-null   bool   
 14  PosVRDeck        4277 non-null   bool   
 15  PTTotalSpent     4277 non-null   float64
dtypes: bool(6), float64(6), object(4)
memory usage: 521.7+ K

In [74]:
# Remaining missing values per column in the prepared test data
df_test.isna().sum()

Alone                0
CompCntReduced       0
HomePlanetOrd        5
CryoSleep           38
CabinDeckOrd       100
CabinPort           63
DestinationOrd      92
DiscretizedAge4     91
DiscretizedAge5     91
VIP                 49
PosRoomService       0
PosFoodCourt         0
PosShoppingMall      0
PosSpa               0
PosVRDeck            0
PTTotalSpent         0
dtype: int64

In [75]:
# Write both prepared frames next to the raw data, keeping the PassengerId index
for frame, filename in ((df_train, "train_prep.csv"), (df_test, "test_prep.csv")):
    frame.to_csv(data_dir / filename, index=True)