In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction - Spaceship Titanic
> Spaceship Titanic - Random Forest
> https://www.kaggle.com/competitions/spaceship-titanic

# EDA

In [None]:
# Loading the data for EDA
# `df` is the exploratory working frame: later cells add columns to it and
# overwrite some of its columns.
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df.head(3)

### PassengerId

In [None]:
# Extracting the group component of PassengerId
def extract_group(df, name="Group"):
    """Add a column `name` holding PassengerId without its last three characters.

    The frame is modified in place and the same object is returned.
    """
    passenger_ids = df["PassengerId"]
    df[name] = passenger_ids.str.slice(stop=-3)
    return df

In [None]:
# Add the "Group" column to the EDA frame and preview the result.
df = extract_group(df)
df.head(3)

In [None]:
# Checking if the group size could have an impact
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

# One row per (Transported, Group) pair with the number of passengers in it.
group_group = (
    df.groupby(["Transported", "Group"])
    .size()
    .reset_index(name="count")
)
sns.countplot(data=group_group, x="count", hue="Transported", ax=ax)
ax.set_xlabel("Group size")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Adding GroupSize to the dataframe
def category_size(df, category, size_name):
    """Return `df` with an extra `size_name` column counting rows per `category`.

    The counts come from a groupby on `category` and are joined back with the
    default (inner) merge, so a new frame is returned and the input is untouched.
    """
    counts = df.groupby(category).size().rename(size_name).reset_index()
    return pd.merge(df, counts, on=category)

In [None]:
# Attach the number of passengers per group as "GroupSize" and preview the result.
df = category_size(df, "Group", "GroupSize")
df.head(3)

### HomePlanet

In [None]:
# Link between HomePlanet and Transported
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# Missing planets get an explicit "Unknown" label so they appear as their own bar.
df["HomePlanet"] = df["HomePlanet"].where(df["HomePlanet"].notna(), "Unknown")
sns.histplot(data=df, x="HomePlanet", hue="Transported", ax=ax[0])
sns.countplot(data=df, x="HomePlanet", ax=ax[1])
plt.show()

In [None]:
# Maybe we can estimate HomePlanet Unknown using Group
# Number of distinct known HomePlanet values inside each group.
group_home_planet = df[df["HomePlanet"] != "Unknown"].groupby('Group')['HomePlanet'].nunique()

# (group_home_planet == 1).mean() is the share of groups whose known members all
# come from one planet, so the label states that instead of "not unique".
print("Share of groups with a single HomePlanet:", (group_home_planet == 1).mean(), " for not Unknown HomePlanet")

In [None]:
# Because 98,6% will be from same HomePlanet, we can estimate unknown using their group
def replace_unknown_with_group_value(df, group, category, null="Unknown"):
    """Fill missing `category` values with a known value from the same `group`.

    A value counts as missing when it is NaN or equal to the `null` sentinel,
    so the helper works both on frames where missing values were relabelled
    (the EDA frame) and on raw frames that still contain NaN.

    Each missing value receives the first known value, in row order, found in
    its group. Rows whose group has no known value are left unchanged.

    Parameters
    ----------
    df : DataFrame modified in place and returned.
    group : name of the column identifying the groups.
    category : name of the column to fill.
    null : sentinel string that marks a missing value.
    """
    values = df[category]
    missing = (values.isna() | (values == null)).fillna(False).astype(bool)

    known = values[~missing]
    if known.empty:
        # Nothing to copy from: every value is missing.
        return df

    # First known value per group, computed once instead of rescanning the
    # whole frame for every missing row.
    first_known = known.groupby(df.loc[~missing, group].to_numpy()).first()
    candidate = df[group].map(first_known)

    # Positional arrays keep the assignment independent of the index labels.
    fillable = (missing & candidate.notna()).to_numpy()
    df.loc[fillable, category] = candidate.to_numpy()[fillable]
    return df

# Count the "Unknown" rows before and after the group-based fill. The number of
# estimated rows is derived from the two counts instead of being typed in by hand.
shape_before = df[df["HomePlanet"] == "Unknown"].shape
df = replace_unknown_with_group_value(df, "Group", "HomePlanet")
shape_after = df[df["HomePlanet"] == "Unknown"].shape

print("Before",shape_before[0],"after", shape_after[0], "we have estimated", shape_before[0] - shape_after[0], "rows")

### Destination

In [None]:
# Link between Destination and Transported
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# Missing destinations get an explicit "Unknown" label so they appear as their own bar.
df["Destination"] = df["Destination"].where(df["Destination"].notna(), "Unknown")
sns.histplot(data=df, x="Destination", hue="Transported", ax=ax[0])
sns.countplot(data=df, x="Destination", ax=ax[1])
plt.show()

In [None]:
# Maybe we can estimate Destination Unknown using Group
# Number of distinct known Destination values inside each group.
group_destination = df[df["Destination"] != "Unknown"].groupby('Group')['Destination'].nunique()

# (group_destination == 1).mean() is the share of groups whose known members all
# share one destination, so the label states that instead of "not unique".
print("Share of groups with a single Destination:", (group_destination == 1).mean(), " for not Unknown Destination")

In [None]:
# Count the "Unknown" rows before and after the group-based fill. The number of
# estimated rows is derived from the two counts instead of being typed in by hand.
shape_before = df[df["Destination"] == "Unknown"].shape
df = replace_unknown_with_group_value(df, "Group", "Destination")
shape_after = df[df["Destination"] == "Unknown"].shape

print("Before",shape_before[0],"after", shape_after[0], "we have estimated", shape_before[0] - shape_after[0], "rows")

### Traject

In [None]:
# Traject is a category that explains from where to where the passengers travel
def create_traject(df, name="Traject", frm="HomePlanet", to="Destination"):
    df[name] = df[frm] + ">" + df[to]
    return df

In [None]:
# Add the "Traject" column (HomePlanet>Destination) to the EDA frame.
df = create_traject(df)

In [None]:
# Link between Traject and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 10))
sns.histplot(df, x="Traject", hue="Transported", ax=ax)
plt.show()

### RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

In [None]:
# Link between RoomService and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="RoomService", ax=ax)
plt.show()

# Mean and median of RoomService for each Transported value
avg = df.groupby("Transported")[["RoomService"]].mean()
med = df.groupby("Transported")[["RoomService"]].median()
print(avg)
print(med)

In [None]:
# Link between FoodCourt and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="FoodCourt", ax=ax)
plt.show()

# Mean and median of FoodCourt for each Transported value
avg = df.groupby("Transported")[["FoodCourt"]].mean()
med = df.groupby("Transported")[["FoodCourt"]].median()
print(avg)
print(med)

In [None]:
# Link between ShoppingMall and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="ShoppingMall", ax=ax)
plt.show()

# Mean and median of ShoppingMall for each Transported value
avg = df.groupby("Transported")[["ShoppingMall"]].mean()
med = df.groupby("Transported")[["ShoppingMall"]].median()
print(avg)
print(med)

In [None]:
# Link between Spa and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="Spa", ax=ax)
plt.show()

# Mean and median of Spa for each Transported value
avg = df.groupby("Transported")[["Spa"]].mean()
med = df.groupby("Transported")[["Spa"]].median()
print(avg)
print(med)

In [None]:
# Link between VRDeck and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="VRDeck", ax=ax)
plt.show()

# Mean and median of VRDeck for each Transported value
avg = df.groupby("Transported")[["VRDeck"]].mean()
med = df.groupby("Transported")[["VRDeck"]].median()
print(avg)
print(med)

In [None]:
# null values for costs variables
# Displays the number of NaN entries in each of the five spending columns.
df[["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]].isna().sum()

In [None]:
# Check if people in CryoSleep can pay some things, just to be sure
# Average of each spending column for every CryoSleep value.
group_costs = (
    df.groupby("CryoSleep")[["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]]
    .mean()
    .reset_index()
)
group_costs

In [None]:
def cryosleep_na_to_zero(df, col, val=0.0):
    """Set missing `col` values to `val` for rows whose CryoSleep is True.

    The frame is modified in place and returned. Rows where CryoSleep is False
    or missing keep their NaN.
    """
    asleep = df["CryoSleep"] == True
    not_recorded = df[col].isna()
    df.loc[asleep & not_recorded, col] = val
    return df

In [None]:
# Find FoodCourt, ShoppingMall, RoomService, Spa, VRDeck NaN where CryoSleep true
# Each spending column gets its NaN replaced by 0.0 on CryoSleep rows.
for i in ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]:
    df = cryosleep_na_to_zero(df, i)

# Remaining NaN count per spending column after the CryoSleep fill.
df[["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]].isna().sum()

### RoomService + Spa + VRDeck = RS_S_VRD

In [None]:
def add_columns(df, cols, name):
    """Add a column `name` holding the row-wise sum of the columns in `cols`.

    The frame is modified in place and returned. The sum uses pandas' default
    NaN handling, so missing values are skipped rather than propagated.
    """
    selected = df[cols]
    df[name] = selected.sum(axis="columns")
    return df

In [None]:
# RS_S_VRD = RoomService + Spa + VRDeck, added to the EDA frame.
df = add_columns(df, ["RoomService", "Spa", "VRDeck"], "RS_S_VRD")
df.head(3)

In [None]:
# Link between RS_S_VRD and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="RS_S_VRD", ax=ax)
plt.show()

# Mean and median of RS_S_VRD for each Transported value
avg = df.groupby("Transported")[["RS_S_VRD"]].mean()
med = df.groupby("Transported")[["RS_S_VRD"]].median()
print(avg)
print(med)

### FoodCourt + ShoppingMall = FC_SM

In [None]:
# FC_SM = FoodCourt + ShoppingMall, added to the EDA frame.
df = add_columns(df, ["FoodCourt", "ShoppingMall"], "FC_SM")
df.head(3)

In [None]:
# Link between FC_SM and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="FC_SM", ax=ax)
plt.show()

# Mean and median of FC_SM for each Transported value
avg = df.groupby("Transported")[["FC_SM"]].mean()
med = df.groupby("Transported")[["FC_SM"]].median()
print(avg)
print(med)

### FoodCourt + ShoppingMall + RoomService + Spa + VRDeck = TotalExpenses

In [None]:
# TotalExpenses = sum of the five spending columns, added to the EDA frame.
df = add_columns(df, ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"], "TotalExpenses")
df.head(3)

In [None]:
# Link between TotalExpenses and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 7))
sns.boxplot(data=df, x="Transported", y="TotalExpenses", ax=ax)
plt.show()

# Mean and median of TotalExpenses for each Transported value
avg = df.groupby("Transported")[["TotalExpenses"]].mean()
med = df.groupby("Transported")[["TotalExpenses"]].median()
print(avg)
print(med)

### VIP

In [None]:
# Link between VIP and Transported
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# Turn the flag into text first so that missing values can carry an "Unknown" label.
df["VIP"] = df["VIP"].astype("string")
df["VIP"] = df["VIP"].fillna("Unknown")
sns.histplot(data=df, x="VIP", hue="Transported", ax=ax[0])
sns.countplot(data=df, x="VIP", ax=ax[1])
plt.show()

In [None]:
# Try to estimate Unknown values
# Average of every spending column (and their total) for each VIP label.
df_ukw_vip = (
    df.groupby("VIP")[["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck", "TotalExpenses"]]
    .mean()
    .reset_index()
)
df_ukw_vip

In [None]:
def vip_cost(df, vip=4425.47, not_vip=1371.50):
    """Estimate the missing VIP flags from TotalExpenses.

    Only rows whose VIP is NaN or the "Unknown" label are estimated. A row
    becomes "True" when its TotalExpenses is at least as close to `vip` as to
    `not_vip`, and "False" otherwise. Known flags are kept and only converted
    to text, so the whole column holds "True"/"False" strings, as before.
    (Previously every row was overwritten, which discarded the known labels.)

    Parameters
    ----------
    df : DataFrame with "VIP" and "TotalExpenses" columns; modified in place
        and returned.
    vip, not_vip : reference spending levels for VIP and non-VIP passengers.
        NOTE(review): the defaults look like the per-VIP TotalExpenses means
        from the EDA table - confirm.
    """
    expenses = df["TotalExpenses"]
    gap_to_vip = (expenses - vip).abs()
    gap_to_not_vip = (expenses - not_vip).abs()

    status = df["VIP"].astype("string")
    unknown = (status.isna() | (status == "Unknown")).fillna(False).astype(bool)

    df["VIP"] = status
    # A NaN TotalExpenses fails both comparisons, which leaves the row untouched.
    df.loc[unknown & (gap_to_vip > gap_to_not_vip), "VIP"] = "False"
    df.loc[unknown & (gap_to_vip <= gap_to_not_vip), "VIP"] = "True"
    return df

In [None]:
# Apply the expense-based VIP estimate to the EDA frame.
df = vip_cost(df)

In [None]:
# Number of VIP values that are still missing.
df["VIP"].isna().sum()

### Cabin

In [None]:
def cabin_parts(df, cabin="Cabin", deck="Deck", num="Num", side="Side"):
    """Return `df` with the `cabin` column split on "/" into three new columns.

    The three parts are stored under the names given by `deck`, `num` and
    `side`. Rows with a missing cabin get missing parts. A new frame is
    returned; the input frame is not modified.

    The split is capped at three parts and padded to three columns, so the
    column assignment cannot fail on short or over-long values. Existing
    `deck`/`num`/`side` columns are replaced rather than duplicated, which
    makes the helper safe to run again on its own output.
    """
    parts = df[cabin].str.split("/", n=2, expand=True).reindex(columns=range(3))
    parts.columns = [deck, num, side]
    # Drop stale copies first so a second call does not create duplicate labels.
    base = df.drop(columns=[deck, num, side], errors="ignore")
    return pd.concat([base, parts], axis=1)

In [None]:
# Split Cabin into Deck / Num / Side columns on the EDA frame.
df = cabin_parts(df)

In [None]:
# Find impacts upon Transported
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 14))
# Give each cabin part a text dtype and an explicit "Unknown" label for missing values.
for _part in ("Side", "Deck", "Num"):
    df[_part] = df[_part].astype("string").fillna("Unknown")
sns.countplot(data=df, x="Side", hue="Transported", ax=ax[0])
sns.countplot(data=df, x="Deck", hue="Transported", ax=ax[1])
plt.show()

### Age

In [None]:
# Link between Age and Transported
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))
sns.histplot(df, x="Age", hue="Transported", ax=ax)
plt.show()

# Data preparation

In [None]:
# Loading the data for submission
# Fresh copies of both files: the EDA frame `df` is not reused from here on.
df_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [None]:
# Group creation
# Same "Group" extraction as in the EDA, applied to both frames.
df_train = extract_group(df_train)
df_test = extract_group(df_test)

In [None]:
# HomePlanet null reduction
# NOTE(review): unlike the EDA frame, these frames have not had HomePlanet NaN
# relabelled to "Unknown" before this call - confirm the helper matches NaN.
df_train = replace_unknown_with_group_value(df_train, "Group", "HomePlanet")
df_test = replace_unknown_with_group_value(df_test, "Group", "HomePlanet")

In [None]:
# Destination null reduction
# NOTE(review): unlike the EDA frame, these frames have not had Destination NaN
# relabelled to "Unknown" before this call - confirm the helper matches NaN.
df_train = replace_unknown_with_group_value(df_train, "Group", "Destination")
df_test = replace_unknown_with_group_value(df_test, "Group", "Destination")

In [None]:
# Traject creation
# Adds the HomePlanet>Destination column to both frames.
df_train = create_traject(df_train)
df_test = create_traject(df_test)

In [None]:
# Costs null reduction
# Two passes per spending column: zero for CryoSleep passengers, then the
# column mean for whatever is still missing.
for i in ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]:
    ## Via CryoSleep
    df_train = cryosleep_na_to_zero(df_train, i)
    df_test = cryosleep_na_to_zero(df_test, i)
    ## Via mean
    # NOTE(review): each frame is filled with its own mean, so train and test
    # use different fill values - confirm this is intended.
    df_train[i] = df_train[i].fillna(df_train[i].mean())
    df_test[i] = df_test[i].fillna(df_test[i].mean())

In [None]:
# RS_S_VRD creation
# RoomService + Spa + VRDeck, on both frames.
df_train = add_columns(df_train, ["RoomService", "Spa", "VRDeck"], "RS_S_VRD")
df_test = add_columns(df_test, ["RoomService", "Spa", "VRDeck"], "RS_S_VRD")

In [None]:
# FC_SM creation
# FoodCourt + ShoppingMall, on both frames.
df_train = add_columns(df_train, ["FoodCourt", "ShoppingMall"], "FC_SM")
df_test = add_columns(df_test, ["FoodCourt", "ShoppingMall"], "FC_SM")

In [None]:
# TotalExpenses creation
# Sum of the five spending columns, on both frames.
df_train = add_columns(df_train, ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"], "TotalExpenses")
df_test = add_columns(df_test, ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"], "TotalExpenses")

In [None]:
# VIP null reduction
# Must run after TotalExpenses exists, since vip_cost reads that column.
df_train = vip_cost(df_train)
df_test = vip_cost(df_test)

In [None]:
# Cabin tokenization
# Adds the Deck / Num / Side columns to both frames.
df_train = cabin_parts(df_train)
df_test = cabin_parts(df_test)

In [None]:
# Age null reduction
# NOTE(review): each frame is filled with its own mean age, so train and test
# use different fill values - confirm this is intended.
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())
df_test["Age"] = df_test["Age"].fillna(df_test["Age"].mean())

In [None]:
# Categorization
to_categories = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP",
                 "Group", "Traject", "Deck", "Num", "Side"]

# Encode train and test against ONE shared category list per column. Encoding
# each frame on its own gives the same integer a different meaning in train
# and test whenever their sets of values differ (Cabin, Group, Num, ...).
# Missing values still receive the code -1.
for c in to_categories:
    shared_categories = pd.Categorical(
        pd.concat([df_train[c], df_test[c]], ignore_index=True)
    ).categories
    df_train[c] = pd.Categorical(df_train[c], categories=shared_categories).codes
    df_test[c] = pd.Categorical(df_test[c], categories=shared_categories).codes

# The target only exists in the training frame.
df_train["Transported"] = df_train["Transported"].astype("category").cat.codes

# Model training

In [None]:
# Columns fed to the model: raw features plus the engineered ones.
X_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "Age", "VIP",
          "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Group", 
          "Traject", "RS_S_VRD", "FC_SM", "TotalExpenses", "Deck", "Num", "Side"]

X_train = df_train[X_cols]
y_train = df_train["Transported"]

X_test = df_test[X_cols]
# Despite its name, y_pred holds the test PassengerId values, not predictions.
y_pred = df_test["PassengerId"]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Basic model is 0.78863 I need to be > 0.8
# A fixed random_state makes the forest, its cross-validation score and the
# resulting submission reproducible from one run to the next.
model_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# cross_val_score refits clones of the estimator, so the seed also applies there.
print("model_rf:", cross_val_score(model_rf, X_train, y_train).mean())

In [None]:
# Predicted Transported codes for the test frame.
prediction = model_rf.predict(X_test)
prediction

In [None]:
# Pair each test PassengerId with its prediction.
df_pred = pd.DataFrame({"PassengerId":y_pred, "Transported":prediction})
# Turn the 0/1 codes back into False/True.
df_pred["Transported"] = df_pred["Transported"].map({0: False, 1: True})
df_pred.head(3)

In [None]:
# Write the submission file without the index column.
df_pred.to_csv("/kaggle/working/submission.csv",index=False)

# To go further

- Try to estimate more NaN values
- Try to tune random forest
- Try to use another model or ensemble