# Spaceship Titanic with FastAI
## Setup

In [None]:
from pathlib import Path
import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/spaceship-titanic')
    !pip install -Uqq fastai
else:
    import zipfile,kaggle
    path = Path('titanic')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

In [None]:
from fastai.tabular.all import *

# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format
# Fix the random seed for reproducibility (set_seed is presumably provided by
# the fastai star import above).
set_seed(42)

In [None]:
import torch, numpy as np, pandas as pd
# Widen numpy, torch and pandas console output to 140 characters; torch also
# gets scientific notation disabled and 7 edge items shown per dimension.
np.set_printoptions(linewidth=140)
torch.set_printoptions(linewidth=140, sci_mode=False, edgeitems=7)
pd.set_option('display.width', 140)

## Prep Data

In [None]:
# Load the training set from the data directory selected in the setup cell.
df = pd.read_csv(path/'train.csv')

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
def fill_num_na_zero(df):
    """Replace missing values in the five spending columns with 0, in place.

    The columns are RoomService, FoodCourt, ShoppingMall, Spa and VRDeck.
    ``df`` is mutated and nothing is returned.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    # Select all columns up front so a missing one raises before anything is written.
    zero_filled = df[spend_cols].fillna(0)
    for name in spend_cols:
        df[name] = zero_filled[name]

In [None]:
def convert_obj_to_bool(df):
    """Cast the CryoSleep, VIP and Transported columns of ``df`` to bool, in place.

    Missing entries are replaced by the column's most frequent value before
    the cast. A bare ``astype(bool)`` on an object column would turn every
    NaN into ``True`` (``bool(float('nan'))`` is truthy), silently marking
    passengers with unknown status as positive.

    Columns that are absent from ``df`` are skipped, so a frame without the
    ``Transported`` label can be passed as well.

    ``df`` is mutated and nothing is returned.
    """
    for col in ('CryoSleep', 'VIP', 'Transported'):
        if col not in df.columns:
            continue
        values = df[col]
        # mode() ignores NaN; it is empty only when the whole column is missing.
        modes = values.mode()
        fill = modes.iloc[0] if len(modes) else False
        df[col] = values.where(values.notna(), fill).astype(bool)

In [None]:
def add_features(df):
    """Add the engineered feature columns to ``df`` in place.

    New columns, in creation order:
      * ``Log<col>`` for each of the five spending columns: ``log1p`` of the amount.
      * ``LogTotalSpending``: ``log1p`` of the row-wise sum of those columns.
      * ``CabinDeck`` / ``CabinSide``: first and last character of ``Cabin``.
      * ``GroupId``: the part of ``PassengerId`` before the first underscore.
      * ``GroupSize``: number of rows sharing the same ``GroupId``.
      * ``Alone``: True where ``GroupSize`` equals 1.

    ``df`` is mutated and nothing is returned.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    for name in spend_cols:
        df[f'Log{name}'] = np.log1p(df[name])
    total_spend = df[spend_cols].sum(axis=1)
    df['LogTotalSpending'] = np.log1p(total_spend)

    cabin_text = df['Cabin'].str
    df['CabinDeck'] = cabin_text.get(0)
    df['CabinSide'] = cabin_text.get(-1)

    id_parts = df['PassengerId'].str.split('_', expand=True)
    df['GroupId'] = id_parts[0]
    group_sizes = df.groupby('GroupId')['GroupId'].transform('count')
    df['GroupSize'] = group_sizes
    df['Alone'] = df['GroupSize'].eq(1)

In [None]:
# Apply the three in-place preprocessing helpers to the training frame.
# add_features reads the spending columns, so it runs after the NaN fill.
fill_num_na_zero(df)
convert_obj_to_bool(df)
add_features(df)

In [None]:
# Bare expression: display the frame with the new columns.
df

In [None]:
# Random train/validation split of the rows, seeded for reproducibility.
splits = RandomSplitter(seed=42)(df)

In [None]:
# Wrap the frame in a TabularPandas with the Categorify, FillMissing and
# Normalize procs, declaring which columns are categorical, which are
# continuous and that "Transported" is a categorical target, then turn it
# into DataLoaders.
dls = TabularPandas(
    df, splits=splits,
    procs = [Categorify, FillMissing, Normalize],
    cat_names=["HomePlanet","Destination","CabinDeck","CabinSide"],
    cont_names=['Age', 'CryoSleep', 'VIP', 'LogRoomService', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck', 'LogTotalSpending', 'GroupSize', 'Alone'],
    y_names="Transported", y_block = CategoryBlock(),
).dataloaders(path=".")

## Train the model

In [None]:
# Create a tabular learner on dls with layers=[10, 10], reporting accuracy.
learn = tabular_learner(dls, metrics=accuracy, layers=[10, 10])

In [None]:
# Run the learning-rate finder with the `slide` and `valley` suggestion functions.
learn.lr_find(suggest_funcs=(slide, valley))

In [None]:
# Train for 50 epochs at a learning rate of 0.03.
learn.fit(50, lr=0.03)

## Submit to Kaggle

In [None]:
tst_df = pd.read_csv(path/'test.csv')
fill_num_na_zero(tst_df)
# Reuse the boolean conversion applied to the training frame so CryoSleep and
# VIP are always encoded the same way in both frames. The earlier hand-written
# version cast with astype(bool) first, which already maps NaN to True, so its
# mode-based fillna afterwards could never change a value (and it was a chained
# inplace call on an attribute access, which pandas does not guarantee to
# write back).
# convert_obj_to_bool also touches Transported, which the test frame is not
# expected to contain, hence the temporary placeholder column dropped right
# after the call.
tst_df['Transported'] = False
convert_obj_to_bool(tst_df)
tst_df.drop(columns='Transported', inplace=True)
add_features(tst_df)

In [None]:
# Build a test DataLoader for the prepared test frame from the learner's DataLoaders.
tst_dl = learn.dls.test_dl(tst_df)

In [None]:
# get_preds returns a tuple; only its first element (the predictions) is kept.
preds,_ = learn.get_preds(dl=tst_dl)

In [None]:
# Threshold column 1 of the predictions at 0.5 to obtain the boolean label
# (assumes column 1 is the probability of Transported == True -- TODO confirm
# against the class order in dls), then write the two-column submission file.
tst_df['Transported'] = (preds[:,1]>0.5)
sub_df = tst_df[['PassengerId','Transported']]
sub_df.to_csv('sub_fastai.csv', index=False)

In [None]:
# Shell escape: show the first lines of the submission file.
!head sub_fastai.csv

## Ensembling Models

In [None]:
def ensemble():
    """Train one fresh tabular learner and return its test-set predictions.

    Uses the notebook-level ``dls`` for training and ``tst_dl`` for inference.
    The learner is fitted for 35 epochs at lr=0.03 with the progress bar and
    logging suppressed. Only the first element of ``get_preds`` (the
    predictions) is returned.
    """
    model = tabular_learner(dls, metrics=accuracy, layers=[10,10])
    with model.no_bar():
        with model.no_logging():
            model.fit(35, lr=0.03)
    outputs = model.get_preds(dl=tst_dl)
    return outputs[0]

In [None]:
# Train five independent models. Despite its name, `learns` holds the five
# prediction tensors returned by ensemble(), not learner objects.
learns = [ensemble() for _ in range(5)]

In [None]:
# Stack the predictions along a new leading dimension and average over it.
ens_preds = torch.stack(learns).mean(0)

In [None]:
# Threshold column 1 of the averaged predictions at 0.5 (same class-order
# assumption as the single-model submission -- TODO confirm) and write the
# ensemble submission file.
tst_df['Transported'] = ens_preds[:,1]>0.5
sub_df = tst_df[['PassengerId','Transported']]
sub_df.to_csv('ens_sub.csv', index=False)