### Spaceship Titanic with fastai

Competition [Link](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In [None]:
#|default_exp app

In [None]:
# This cell is run at the top of every notebook that uses the fastai library.
# `autoreload` reloads modules that have changed before each cell is executed,
# and `%matplotlib inline` renders plots directly in this notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from IPython.display import Image, display_svg, SVG

In [None]:
#| export
# Empty string (falsy) when KAGGLE_KERNEL_RUN_TYPE is not set; presumably the variable is only set inside Kaggle kernels.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
# Text written to ~/.kaggle/kaggle.json when that file does not exist yet. Keep real API credentials out of the notebook.
creds = ''

In [None]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only create the credentials file when there is something to write: an empty
# kaggle.json is not a usable credentials file and would stop a later run from
# writing real credentials, because the file would then already exist.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    cred_path.write_text(creds)
    # credentials should be readable by the owner only
    cred_path.chmod(0o600)

In [None]:
#| export
path = Path('spaceship-titanic')

In [None]:
#| export
# Outside Kaggle, download and unpack the competition data once; skipped when
# the target directory already exists.
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    # Use a context manager so the archive handle is closed after extraction.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [None]:
#| export
# Inside a Kaggle kernel, read the competition files from the kernel's input directory.
if iskaggle:
    path = Path('../input/spaceship-titanic')
    # NOTE(review): no visible cell in this notebook imports `dataset` — confirm this install is still needed.
    ! pip install -q dataset

Import the CSVs as pandas DataFrames

In [None]:
#| export
# Load the train, test and sample-submission CSV files found under `path`.
df, test_df, sample_df = (
    pd.read_csv(path/csv_name, low_memory=False)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
df.head()

#### Infer Cryosleep NaN by Amenities Use

In [None]:
# Number of passengers with a missing CryoSleep value. `.sum()` counts the True
# entries of the null mask; `.count` (even when called) would return the row total.
df['CryoSleep'].isnull().sum()

In [None]:
# First rows of the passengers whose CryoSleep value is True (missing values compare as False).
df[df['CryoSleep'].eq(True)].head()

In [None]:
# (rows, columns) of the subset of passengers whose CryoSleep value is True.
df[df['CryoSleep'].eq(True)].shape

In [None]:
cryo_amenities_df = ['CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
# Rows whose CryoSleep value is missing, restricted to CryoSleep plus the amenity spend columns.
null_cryo_df = df.loc[df['CryoSleep'].isnull(), cryo_amenities_df]
# The test frame has to be filtered with its own null mask; using the training
# frame's mask selects test rows by the position of unrelated training rows.
test_null_cryo_df = test_df.loc[test_df['CryoSleep'].isnull(), cryo_amenities_df]

In [None]:
df['CryoSleep'].isnull().sum()

In [None]:
null_cryo_df

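Next, flag the passengers with a missing `CryoSleep` value who spent money on at least one amenity. They are treated as awake; every other passenger with a missing value is assumed to be in cryosleep.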

In [None]:
# True for each passenger with a missing CryoSleep value who has a positive
# amount in at least one amenity column (NaN amounts compare as False).
amenities_mask = (
    null_cryo_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]
    .gt(0)
    .any(axis=1)
)
test_amenities_mask = (
    test_null_cryo_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]
    .gt(0)
    .any(axis=1)
)
amenities_mask

In [None]:
df.loc[null_cryo_df.index, cryo_amenities_df].head(), df.loc[null_cryo_df.index, cryo_amenities_df].shape

In [None]:
df.loc[null_cryo_df.index, 'CryoSleep']

In [None]:
# Passengers who spent on any amenity get CryoSleep = False; every other
# passenger with a missing value gets CryoSleep = True.
df.loc[null_cryo_df.index, 'CryoSleep'] = (~amenities_mask).to_numpy()
test_df.loc[test_null_cryo_df.index, 'CryoSleep'] = (~test_amenities_mask).to_numpy()

In [None]:
df['CryoSleep'].isnull().sum()

#### Replace NaN Amenities Values with 0

In [None]:
df['RoomService'].isnull().sum()

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].isnull().sum()

In [None]:
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
df[amenities] = df[amenities].fillna(0)
test_df[amenities] = test_df[amenities].fillna(0)

#### Names to Surnames

Using the full name as a feature on such a small dataset could cause overfitting, so I'm going to try grouping passengers into families by last name instead. I'll also drop the `Name` column afterward.

In [None]:
df['surname'] = df['Name'].str.split(' ').str[1]

In [None]:
test_df['surname'] = test_df['Name'].str.split(' ').str[1]

In [None]:
df = df.drop(['Name'], axis=1)

In [None]:
test_df = test_df.drop(['Name'], axis=1)

#### Split PassengerId and Room

Looking at the data, there is some extra information we can extract into new columns. *PassengerId* appears to consist of a group number and the passenger's number within that group, separated by an underscore `_`.

In [None]:
# Split PassengerId on '_' once: the first part is the group number, the second
# the passenger's number within that group.
id_parts = df['PassengerId'].str.split('_')
df['group_num'] = id_parts.str[0]
df['num_w_in_group'] = id_parts.str[1]

In [None]:
# Same PassengerId split as for the training frame, applied to the test frame.
test_id_parts = test_df['PassengerId'].str.split('_')
test_df['group_num'] = test_id_parts.str[0]
test_df['num_w_in_group'] = test_id_parts.str[1]

The same is true for `Cabin`: it holds three different values separated by `/`, so let's split them up as well.

In [None]:
# Split Cabin on '/' once and store its three parts as deck, room number and side.
cabin_parts = df['Cabin'].str.split('/')
df['deck'] = cabin_parts.str[0]
df['room_num'] = cabin_parts.str[1]
df['side'] = cabin_parts.str[2]

In [None]:
# Same Cabin split as for the training frame, applied to the test frame.
test_cabin_parts = test_df['Cabin'].str.split('/')
test_df['deck'] = test_cabin_parts.str[0]
test_df['room_num'] = test_cabin_parts.str[1]
test_df['side'] = test_cabin_parts.str[2]

For each group (passengers whose `PassengerId` shares the same first four digits), take the maximum of the last two digits of `PassengerId`; this is used as the number of people in the group.

In [None]:
df['num_w_in_group'] = df['num_w_in_group'].astype(int)

In [None]:
test_df['num_w_in_group'] = test_df['num_w_in_group'].astype(int)

In [None]:
max_num_in_group = df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [None]:
test_max_num_in_group = test_df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [None]:
len(max_num_in_group)

In [None]:
max_num_in_group.head()

In [None]:
df['num_in_group'] = df['group_num'].map(max_num_in_group)

In [None]:
test_df['num_in_group'] = test_df['group_num'].map(test_max_num_in_group)

In [None]:
df.head()

Finding if everyone boarding is a family group or not by last name:
- Find if the count of family group == num_in_group

In [None]:
boarded_together = df.groupby('surname')['num_in_group'].nunique() == 1

In [None]:
test_boarded_together = test_df.groupby('surname')['num_in_group'].nunique() == 1

In [None]:
df['fam_board_together'] = df['surname'].map(boarded_together)

In [None]:
test_df['fam_board_together'] = test_df['surname'].map(test_boarded_together)

In [None]:
df.head()

In [None]:
test_df.head()

#### Replace NaN Home and Destination with Group Values

In [None]:
# Rows with a missing HomePlanet. Index with the boolean mask directly:
# `np.where` returns a tuple of integer positions, which `.loc` would treat as
# index labels and which only match the rows while the index is the default 0..n-1.
df.loc[df['HomePlanet'].isnull()]

In [None]:
df.groupby('group_num')['HomePlanet'].value_counts()

In [None]:
mode_planet = df.groupby('group_num')['HomePlanet'].apply(lambda x: x.mode(dropna=True))

In [None]:
mode_planet

In [None]:
np.where(df['HomePlanet'].isnull())

In [None]:
df.groupby('group_num')['HomePlanet'].size()

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts')

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index()

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index().sort_values('counts')

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index().sort_values('counts', ascending=False)

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates()

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates(subset='group_num')

In [None]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates(subset='group_num').drop(columns='counts')

In [None]:
mode_planet.head()

In [None]:
# create a function to replace NaN values with mode values
def fillna_mode(group):
    """Fill missing `HomePlanet` values in one group with that group's most common planet.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to a single group; must contain a `HomePlanet` column.

    Returns
    -------
    pd.DataFrame
        `group` itself when it holds no non-null `HomePlanet` value to impute
        from, otherwise a copy of `group` with the missing values filled in.
        The input frame is never modified.
    """
    mode_series = group['HomePlanet'].mode()
    if mode_series.empty:
        # every HomePlanet in this group is missing, so there is nothing to impute from
        return group
    # Assign the filled column on a copy instead of calling
    # `fillna(..., inplace=True)` on a column selection: the chained in-place
    # form is not guaranteed to write back to the frame, and it mutates the
    # caller's data when it does.
    filled = group.copy()
    # `mode()` returns ties in sorted order; take the first by position
    filled['HomePlanet'] = filled['HomePlanet'].fillna(mode_series.iloc[0])
    return filled


In [None]:
# Apply the function to each group. The test frame gets the same imputation as
# the training frame so that both are preprocessed identically.
df = df.groupby('group_num', group_keys=False).apply(fillna_mode)
test_df = test_df.groupby('group_num', group_keys=False).apply(fillna_mode)

In [None]:
df.head(n=5)

In [None]:
df['HomePlanet'].isnull().sum()

#### Check Nulls

In [None]:
def get_null_counts(df):
    """Return a table of null counts per column, largest count first.

    The result has the columns `column_name` and `null_counts`; the row for
    the `Transported` column is left out.
    """
    per_column = df.isnull().sum().sort_values(ascending=False)
    table = per_column.reset_index()
    table.columns = ['column_name', 'null_counts']
    keep = table['column_name'] != 'Transported'
    return table[keep]

null_= get_null_counts(df)
null_.style.background_gradient(cmap='summer')

In [None]:
# Null counts for the test frame. Display the table that was just computed
# (`test_null`), not the training-frame table `null_`.
test_null = get_null_counts(test_df)
test_null.style.background_gradient(cmap='winter')

Convert Boolean values to 0s and 1s

In [None]:
def bool_switch(df, col_name):
    """Replace column `col_name` of `df`, in place, with integer codes from a freshly fitted `LabelEncoder`.

    Nothing is returned; the encoder is discarded after use.
    """
    # NOTE(review): missing values in the column are presumably encoded as a class of their own rather than kept as NaN — confirm against the installed scikit-learn version.
    column = f'{col_name}'
    codes = LabelEncoder().fit_transform(df[column])
    df[column] = codes

In [None]:
# Encode the boolean-valued columns of the training frame, target included.
for bool_col in ('Transported', 'VIP', 'CryoSleep', 'fam_board_together'):
    bool_switch(df, col_name=bool_col)

In [None]:
df.shape

In [None]:
# Encode the same boolean-valued columns of the test frame (it has no target column).
for bool_col in ('VIP', 'CryoSleep', 'fam_board_together'):
    bool_switch(test_df, col_name=bool_col)

In [None]:
test_df.shape

### Preparing Data

Declare the dependent variable (the target we want to predict)

In [None]:
#| export
dep_var = 'Transported'

In [None]:
# Show a few Destination values. `sample` has to be called to return rows;
# the fixed seed keeps the displayed rows stable across runs.
df['Destination'].sample(5, random_state=42)

Add tabular processes to transform categorical variables to something similar to `pd.Categorical`, and fill in missing/na values

In [None]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [None]:
#| export
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
test_cont, test_cat = cont_cat_split(test_df, 1, dep_var=dep_var)

In [None]:
# Process the training frame with `procs`, using a seeded random split that holds out 20% for validation.
# Because `.dataloaders(bs=128)` is chained on the end, `to` holds the value
# returned by that call, not the TabularPandas object itself.
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, 
                    y_block=CategoryBlock(), 
                    splits=RandomSplitter(valid_pct=0.2, seed=42)(df)).dataloaders(bs=128)

In [None]:
to.fill_strategy

In [None]:
# Process the test frame with its own column split and no target or train/validation split.
# NOTE(review): this builds a second TabularPandas from the test frame alone, so
# category codes, fill values and normalisation statistics are presumably
# derived from the test data rather than reused from the training data — confirm,
# and consider applying the processors fitted on the training frame instead.
test_to = TabularPandas(test_df, procs, test_cat, test_cont, y_names=None, 
                    y_block=CategoryBlock(), 
                    splits=None).dataloaders(bs=128)

In [None]:
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

In [None]:
test_xs = test_to.train.xs

In [None]:
xs.head()

In [None]:
xs.shape

In [None]:
test_xs.head()

In [None]:
test_xs.shape

In [None]:
# Correlation heatmap of the numeric columns. The frame still holds object
# columns (e.g. PassengerId, Cabin), and `DataFrame.corr()` raises on those in
# pandas >= 2.0, so select the numeric/boolean columns explicitly first.
plt.figure(figsize=(10,10))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
def rf(xs, y, n_estimators=40, max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit a `RandomForestClassifier` on `xs` / `y` and return the fitted model.

    Parameters
    ----------
    xs, y
        Training features and target, passed straight to `fit`.
    n_estimators, max_features, min_samples_leaf
        Forwarded to `RandomForestClassifier`.
    **kwargs
        Any further `RandomForestClassifier` arguments (for example
        `random_state`). They were previously accepted but silently dropped;
        they are now forwarded and may override the `n_jobs=-1` and
        `oob_score=True` defaults.
    """
    # defaults first, so an explicit keyword from the caller wins
    forest_params = {'n_jobs': -1, 'oob_score': True, **kwargs}
    return RandomForestClassifier(n_estimators=n_estimators,
        max_features=max_features, min_samples_leaf=min_samples_leaf,
        **forest_params).fit(xs, y)

In [None]:
m = rf(xs,y)

In [None]:
def pred_acc(m, valid_xs=valid_xs):
    """Return the accuracy of `m`'s predictions on `valid_xs`, scored against the notebook-level `valid_y`.

    The default for `valid_xs` is the value the notebook-level `valid_xs` had
    when this cell was run.
    """
    return accuracy_score(valid_y, m.predict(valid_xs))

In [None]:
pred_acc(m)

In [None]:
def rf_feat_importances(m, df):
    """Pair each column of `df` with the matching entry of `m.feature_importances_`.

    Returns a DataFrame with the columns `cols` and `imp`, sorted by `imp`
    from largest to smallest.
    """
    importance_table = pd.DataFrame({
        'cols': df.columns,
        'imp': m.feature_importances_,
    })
    return importance_table.sort_values('imp', ascending=False)

In [None]:
fi = rf_feat_importances(m, xs)
fi

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of `imp` against `cols` for `fi` and return the plot object."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)

In [None]:
plot_fi(fi)

In [None]:
cluster_columns(xs)

In [None]:
to_keep = fi[fi.imp > 0.005].cols
len(to_keep), len(fi)

In [None]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
test_xs_imp = test_xs[to_keep]

In [None]:
len(test_xs_imp)

In [None]:
m = rf(xs_imp, y)

In [None]:
pred_acc(m, valid_xs=valid_xs_imp)

In [None]:
plot_fi(rf_feat_importances(m, xs_imp))

In [None]:
cluster_columns(xs_imp)

In [None]:
def get_oob(df):
    """Fit a 40-tree random forest on `df` against the notebook-level `y` and return its out-of-bag score."""
    forest_params = dict(n_estimators=40, min_samples_leaf=15, max_features=0.5,
                         n_jobs=-1, oob_score=True)
    forest = RandomForestClassifier(**forest_params)
    forest.fit(df, y)
    return forest.oob_score_

In [None]:
get_oob(xs_imp)


In [None]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in xs_imp.columns}

In [None]:
{c:get_oob(xs.drop(c, axis=1)) for c in xs.columns}

In [None]:
xs_final = xs_imp
valid_xs_final = valid_xs_imp

In [None]:
test_xs_final = test_xs_imp

In [None]:
xs_final.head()

In [None]:
pred_acc(m, valid_xs=valid_xs_final)

In [None]:
valid_xs_final.columns

In [None]:
# test_xs = [test_xs.drop([x], axis=1) for x in test_xs if x not in valid_xs_final]

In [None]:
# for x in test_xs.columns:
#     if x not in valid_xs_final.columns:
#         test_xs = test_xs.drop([x], axis=1)

In [None]:
len(test_xs_final)

In [None]:
len(test_xs.columns), len(valid_xs_final.columns)

In [None]:
test_xs.columns, valid_xs_final.columns

In [None]:
preds = m.predict(test_xs_final)

In [None]:
preds

In [None]:
sample_df.head()

In [None]:
sample_df['Transported'] = preds.astype(bool)

In [None]:
sample_df.value_counts()

In [None]:
sub_df = sample_df

In [None]:
sub_df

In [None]:
sub_df.to_csv('submission.csv', index=False)