### Spaceship Titanic with fastai

Competition [Link](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In [1]:
#|default_exp app

In [2]:
#The Following cell of code is used everytime FASTAI library is used.
#They tell the notebook to reload any changes made to any libraries used.
#They also ensure that any graphs are plotted are shown in this notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from IPython.display import Image, display_svg, SVG

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [6]:
#| export
path = Path('spaceship-titanic')

In [7]:
#| export
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))    
    zipfile.ZipFile(f'{path}.zip').extractall(path)

In [8]:
#| export
if iskaggle:
    path = Path('../input/spaceship-titanic')
    ! pip install -q dataset

Import CSV's as Pandas Dataframes

In [9]:
#| export
df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sample_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)

In [10]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [11]:
df['CryoSleep'].isnull().count

<bound method Series.count of 0       False
1       False
2       False
3       False
4       False
        ...  
8688    False
8689    False
8690    False
8691    False
8692    False
Name: CryoSleep, Length: 8693, dtype: bool>

In [12]:
cryo_true_df = df[df['CryoSleep'] == True]

In [13]:
cryo_true_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
18,0016_01,Mars,True,F/5/P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0,Alus Upead,True
21,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Almary Brantuarez,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8679,9267_02,Europa,True,E/607/S,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,Sabi Opshaft,True
8680,9268_01,Earth,True,G/1505/P,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Agnesa Baldson,True
8681,9270_01,Earth,True,G/1497/S,55 Cancri e,33.0,False,0.0,0.0,0.0,0.0,0.0,Lan Mckinsond,True
8684,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True


In [16]:
cryo_amenities_df = ['CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [17]:
null_cryo_df = df.loc[df['CryoSleep'].isnull(), cryo_amenities_df]

In [19]:
df['CryoSleep'].isnull().sum()

217

In [20]:
null_cryo_df

Unnamed: 0,CryoSleep,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
92,,0.0,0.0,0.0,0.0,0.0
98,,0.0,0.0,570.0,2.0,131.0
104,,0.0,331.0,0.0,0.0,1687.0
111,,0.0,0.0,0.0,0.0,
152,,0.0,985.0,0.0,5.0,0.0
...,...,...,...,...,...,...
8620,,0.0,0.0,0.0,0.0,0.0
8651,,0.0,0.0,0.0,0.0,0.0
8664,,0.0,0.0,0.0,0.0,0.0
8675,,1030.0,1015.0,0.0,11.0,


here

In [36]:
amenities_mask =(null_cryo_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] > 0).any(axis=1)
amenities_mask

92      False
98       True
104      True
111     False
152      True
        ...  
8620    False
8651    False
8664    False
8675     True
8687     True
Length: 217, dtype: bool

In [None]:
# null_cryo_df['CyroSleep'] = np.where(amenities_mask, False, True)

In [41]:
df.loc[null_cryo_df.index, cryo_amenities_df].head(), df.loc[null_cryo_df.index, cryo_amenities_df].shape

(    CryoSleep  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
 92        NaN          0.0        0.0           0.0  0.0     0.0
 98        NaN          0.0        0.0         570.0  2.0   131.0
 104       NaN          0.0      331.0           0.0  0.0  1687.0
 111       NaN          0.0        0.0           0.0  0.0     NaN
 152       NaN          0.0      985.0           0.0  5.0     0.0,
 (217, 6))

In [44]:
df.loc[null_cryo_df.index, 'CryoSleep']

92      NaN
98      NaN
104     NaN
111     NaN
152     NaN
       ... 
8620    NaN
8651    NaN
8664    NaN
8675    NaN
8687    NaN
Name: CryoSleep, Length: 217, dtype: object

In [45]:
df.loc[null_cryo_df.index, 'CryoSleep'] = np.where(amenities_mask, False, True)

In [46]:
df['CryoSleep'].isnull().sum()

0

In [47]:
df['RoomService'].isnull().sum()

181

In [None]:
null_cryo_df

In [None]:
test_df.shape

In [None]:
test_df.head()

This name on such a small dataset could cause overfitting, so I'm going to try splitting into families by last name. I'll also be dropping `Name` as a category afterward

In [None]:
df['surname'] = df['Name'].str.split(' ').str[1]

In [None]:
test_df['surname'] = test_df['Name'].str.split(' ').str[1]

In [None]:
df = df.drop(['Name'], axis=1)

In [None]:
test_df = test_df.drop(['Name'], axis=1)

Looking at this there's some extra data we can extract to new columns, *PassengerId* looks like it's by group number and the number in a particular group, split with an underscore `_` 

In [None]:
df['group_num'] = df['PassengerId'].str.split('_').str[0]
df['num_w_in_group'] = df['PassengerId'].str.split('_').str[1]

In [None]:
test_df['group_num'] =test_df['PassengerId'].str.split('_').str[0]
test_df['num_w_in_group'] =test_df['PassengerId'].str.split('_').str[1]

The same is true for the Cabin, there are 3 different values to analyze here, so let's split them up as well

In [None]:
df['deck'] = df['Cabin'].str.split('/').str[0]
df['room_num'] = df['Cabin'].str.split('/').str[1]
df['side'] = df['Cabin'].str.split('/').str[2]

In [None]:
test_df['deck'] = test_df['Cabin'].str.split('/').str[0]
test_df['room_num'] = test_df['Cabin'].str.split('/').str[1]
test_df['side'] = test_df['Cabin'].str.split('/').str[2]

Take the max number of the last two digits in PassengerId given that the first four numbers are the same

In [None]:
df['num_w_in_group'] = df['num_w_in_group'].astype(int)

In [None]:
test_df['num_w_in_group'] = test_df['num_w_in_group'].astype(int)

In [None]:
max_num_in_group = df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [None]:
test_max_num_in_group = test_df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [None]:
len(max_num_in_group)

In [None]:
max_num_in_group.head()

In [None]:
df['num_in_group'] = df['group_num'].map(max_num_in_group)

In [None]:
test_df['num_in_group'] = test_df['group_num'].map(test_max_num_in_group)

In [None]:
df.head()

Finding if everyone boarding is a family group or not by last name:
- Find if the count of family group == num_in_group

In [None]:
boarded_together = df.groupby('surname')['num_in_group'].nunique() == 1

In [None]:
test_boarded_together = test_df.groupby('surname')['num_in_group'].nunique() == 1

In [None]:
df['fam_board_together'] = df['surname'].map(boarded_together)

In [None]:
test_df['fam_board_together'] = test_df['surname'].map(test_boarded_together)

In [None]:
df.head()

In [None]:
test_df.head()

In [None]:
def get_null_counts(df):
    null_count_df = pd.DataFrame((df.isnull().sum()).sort_values(ascending=False).reset_index())
    null_count_df.columns = ['column_name','null_counts']
    null_count_df = null_count_df.query("column_name!='Transported'")
    return null_count_df

null_= get_null_counts(df)
null_.style.background_gradient(cmap='summer')

In [None]:
test_null = get_null_counts(test_df)
null_.style.background_gradient(cmap='winter')

Boolean values converted to 0's and 1's

In [None]:
def bool_switch(df, col_name):
    encoder = LabelEncoder()
    df[f'{col_name}'] = encoder.fit_transform(df[f'{col_name}']) 

In [None]:
bool_switch(df, col_name='Transported')
bool_switch(df, col_name='VIP')
bool_switch(df, col_name='CryoSleep')
bool_switch(df, col_name="fam_board_together")

In [None]:
df.shape

In [None]:
bool_switch(test_df, col_name='VIP')
bool_switch(test_df, col_name='CryoSleep')
bool_switch(test_df, col_name="fam_board_together")

In [None]:
test_df.shape

### Preparing Data

Declare dependant variable(y-axis)

In [None]:
#| export
dep_var = 'Transported'

In [None]:
df['Destination'].sample

Add tabular processes to transform categorical variables to something similar to `pd.Categorical`, and fill in missing/na values

In [None]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [None]:
#| export
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
test_cont, test_cat = cont_cat_split(test_df, 1, dep_var=dep_var)

In [None]:
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, 
                    y_block=CategoryBlock(), 
                    splits=RandomSplitter(valid_pct=0.2, seed=42)(df)).dataloaders(bs=128)

In [None]:
to.fill_strategy

In [None]:
test_to = TabularPandas(test_df, procs, test_cat, test_cont, y_names=None, 
                    y_block=CategoryBlock(), 
                    splits=None).dataloaders(bs=128)

In [None]:
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

In [None]:
test_xs = test_to.train.xs

In [None]:
xs.head()

In [None]:
xs.shape

In [None]:
test_xs.head()

In [None]:
test_xs.shape

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)

In [None]:
def rf(xs, y, n_estimators=40, max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_features=max_features, min_samples_leaf=min_samples_leaf, 
        oob_score=True).fit(xs,y)

In [None]:
m = rf(xs,y)

In [None]:
def pred_acc(m, valid_xs=valid_xs):
    y_pred = m.predict(valid_xs)
    accuracy = accuracy_score(valid_y, y_pred)
    return accuracy

In [None]:
pred_acc(m)

In [None]:
def rf_feat_importances(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}).sort_values('imp', ascending=False)

In [None]:
fi = rf_feat_importances(m, xs)
fi

In [None]:
def plot_fi(fi):
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

In [None]:
plot_fi(fi)

In [None]:
cluster_columns(xs)

In [None]:
to_keep = fi[fi.imp > 0.005].cols
len(to_keep), len(fi)

In [None]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
test_xs_imp = test_xs[to_keep]

In [None]:
len(test_xs_imp)

In [None]:
m = rf(xs_imp, y)

In [None]:
pred_acc(m, valid_xs=valid_xs_imp)

In [None]:
plot_fi(rf_feat_importances(m, xs_imp))

In [None]:
cluster_columns(xs_imp)

In [None]:
def get_oob(df):
    m = RandomForestClassifier(n_estimators=40, min_samples_leaf=15, max_features=0.5, n_jobs=-1, oob_score=True)
    m.fit(df, y)
    return m.oob_score_

In [None]:
get_oob(xs_imp)


In [None]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in xs_imp.columns}

In [None]:
{c:get_oob(xs.drop(c, axis=1)) for c in xs.columns}

In [None]:
xs_final = xs_imp
valid_xs_final = valid_xs_imp

In [None]:
test_xs_final = test_xs_imp

In [None]:
xs_final.head()

In [None]:
pred_acc(m, valid_xs=valid_xs_final)

In [None]:
valid_xs_final.columns

In [None]:
# test_xs = [test_xs.drop([x], axis=1) for x in test_xs if x not in valid_xs_final]

In [None]:
# for x in test_xs.columns:
#     if x not in valid_xs_final.columns:
#         test_xs = test_xs.drop([x], axis=1)

In [None]:
len(test_xs_final)

In [None]:
len(test_xs.columns), len(valid_xs_final.columns)

In [None]:
test_xs.columns, valid_xs_final.columns

In [None]:
preds = m.predict(test_xs_final)

In [None]:
preds

In [None]:
sample_df.head()

In [None]:
sample_df['Transported'] = preds.astype(bool)

In [None]:
sample_df.value_counts()

In [None]:
sub_df = sample_df

In [None]:
sub_df

In [None]:
sub_df.to_csv('submission.csv', index=False)