### Spaceship Titanic with fastai

Competition [Link](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In [1]:
#|default_exp app

In [2]:
# The following cell is used every time the fastai library is used.
# The autoreload magics tell the notebook to reload any changes made to the libraries in use.
# `%matplotlib inline` ensures that any graphs that are plotted are shown in this notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

### Imports

In [3]:
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from IPython.display import Image, display_svg, SVG


### Import Datasets and Create Dataframe

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only write the credentials file when `creds` has actually been filled in.
# Writing the empty default would leave an empty (non-JSON) kaggle.json behind,
# and because the file would then exist, credentials pasted into `creds` later
# would never be written by this cell.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)  # readable/writable by the owner only

In [6]:
#| export
path = Path('spaceship-titanic')

In [7]:
#| export
# Outside Kaggle, download and unpack the competition archive once;
# the existence of the `path` directory marks the data as already fetched.
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    # Use a context manager so the archive's file handle is closed after extraction.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [8]:
#| export
if iskaggle:
    path = Path('../input/spaceship-titanic')
    ! pip install -q dataset

Import the CSV files as pandas DataFrames

In [9]:
#| export
df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sample_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)

In [10]:
df.head(n=15)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [12]:
df['Name'].head()

0      Maham Ofracculy
1         Juanna Vines
2        Altark Susent
3         Solam Susent
4    Willy Santantines
Name: Name, dtype: object

In [13]:
df['family_group'] = df['Name'].str.split(' ').str[1]

In [14]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,family_group
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Santantines


We drop the name because, on a small dataset, it could cause overfitting. Maybe we could use it to create *family groups*, but for now we'll just go with the similar-but-not-quite-the-same *boarding_groups*.

In [None]:
# `errors='ignore'` keeps this cell re-runnable: once `Name` is gone, a second
# run is a no-op instead of a KeyError.
df = df.drop(columns=['Name'], errors='ignore')

In [None]:
# `Name` was dropped in the previous cell, so `df['Name']` would raise a KeyError.
# Confirm that the column is gone instead (expected output: False).
'Name' in df.columns

In [None]:
df.index

In [None]:
df['Destination'].hist()

In [None]:
df['Age'].hist()

In [None]:
df['FoodCourt'].hist()

Looking at this, there's some extra data we can extract into new columns: *PassengerId* looks like it consists of a group number and the passenger's number within that group, separated by an underscore `_`.

In [None]:
# Call the method; without the parentheses the cell only displays the bound method object.
df['PassengerId'].value_counts()

In [None]:
# Split every PassengerId on '_' once and reuse the pieces for both new columns.
# `group_num` gets the part before the underscore, `group_size` the part after it
# (both stay strings).
passenger_id_parts = df['PassengerId'].str.split('_')
df['group_num'] = passenger_id_parts.str[0]
df['group_size'] = passenger_id_parts.str[1]

The same is true for the Cabin, there are 3 different values to analyze here, so let's split them up as well

In [None]:
# Split every Cabin value on '/' once and fan the three pieces out into
# their own columns: deck / room number / side.
cabin_parts = df['Cabin'].str.split('/')
df['deck'] = cabin_parts.str[0]
df['room_num'] = cabin_parts.str[1]
df['side'] = cabin_parts.str[2]

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
def get_null_counts(df):
    """Summarise the number of missing values per column of `df`.

    Returns a DataFrame with the columns `column_name` and `null_counts`,
    ordered from most to fewest missing values. The row for the
    `Transported` column is left out.
    """
    counts = df.isnull().sum().sort_values(ascending=False)
    summary = counts.reset_index()
    summary.columns = ['column_name', 'null_counts']
    # Keep every row except the one describing the `Transported` column.
    return summary[summary['column_name'] != 'Transported']

null_= get_null_counts(df)
null_.style.background_gradient(cmap='summer')

In [None]:
df['Transported'].value_counts()

Add a function to convert boolean values to 0 and 1 using `LabelEncoder` 

In [None]:
def bool_switch(df, col_name):
    """Convert the boolean column `col_name` of `df` to 0/1 in place.

    Missing values stay missing (NaN) so that a later imputation step can
    deal with them. A plain pandas cast is used here rather than
    `LabelEncoder`, because a label encoder would either reject NaN or turn
    it into a third class of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    col_name : str
        Name of a column holding True/False (or already 0/1) values,
        optionally with NaN.

    Returns
    -------
    None
        The column becomes `int64` when it has no missing values, and
        `float64` (1.0 / 0.0 / NaN) otherwise. Calling the function again
        on an already converted column leaves it unchanged.
    """
    values = df[col_name]
    missing = values.isna()
    # Temporarily treat the gaps as False so the whole column can be cast to bool.
    flags = values.where(~missing, False).astype(bool)
    if missing.any():
        # Integer columns cannot hold NaN, so use floats and put the gaps back.
        df[col_name] = flags.astype('float64').mask(missing)
    else:
        df[col_name] = flags.astype('int64')

In [None]:
bool_switch(df, col_name='Transported')
bool_switch(df, col_name='VIP')
bool_switch(df, col_name='CryoSleep')


Check to see how it worked

In [None]:
df.head()

In [None]:
df['Transported'].value_counts()

In [None]:
df.info()

In [None]:
df.columns

### Preparing Data

Declare the dependent variable (the target we want to predict)

In [None]:
#| export
dep_var = 'Transported'

In [None]:
# Call the method; without the parentheses the cell only displays the bound method object.
df['Destination'].sample(5)

Add tabular processes to transform categorical variables to something similar to `pd.Categorical`, and fill in missing/na values

In [None]:
#| export
procs = [Categorify, FillMissing, Normalize]

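Split our data set into a training (0.8) and a validation (0.2) set, and set `random_state` to a fixed seed for consistency. The manual split in the next cell is commented out; the split that is actually used comes from `RandomSplitter(valid_pct=0.2, seed=42)` when the `TabularPandas` object is built further down.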

In [None]:
# #| export
# train, valid = train_test_split(df, test_size=.2, random_state=42)

Use `cont_cat_split` to separate continuous and categorical variables

In [None]:
#| export
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

Load our dataframe into a `TabularPandas` object; `procs` from above is applied here. Set the dependent variable as the target.

In [None]:
# Keep the processed table (`to`) and its DataLoaders (`dls`) as two separate objects.
# The following cells treat `to` as a TabularPandas: they pickle it as "our tabular
# object", index it by column name (`to['FoodCourt']`) and compare the row counts of
# `to.train` and `to.valid`. A DataLoaders object is only needed by the neural-net learner.
splits = RandomSplitter(valid_pct=0.2, seed=42)(df)
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, y_block=CategoryBlock(), splits=splits)
dls = to.dataloaders(bs=128)

In [None]:
len(to.train), len(to.valid)

Pickle our tabular object so that we can skip a few steps in the future

In [None]:
save_pickle(path/'space_titan_to.pkl', to)

Load our Pickle file

In [None]:
to = load_pickle(path/'space_titan_to.pkl')

The `TabularPandas` object has taken care of a few things, such as converting all our categories to numbers, filling in missing values, and creating corresponding columns that indicate whether a missing value was filled in.

In [None]:
to.items.head()

In [None]:
to.items.isna().sum()

In [None]:
to.items.describe()

In [None]:
to.train.xs

In [None]:
to['FoodCourt'].hist()

In [None]:
to.items.describe()

Assign our X values and Y value for training and validation

In [None]:
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

In [None]:
plt.figure(figsize=(10,10))
# Correlate numeric/boolean columns only. `df` still contains text columns such as
# `HomePlanet` and `Destination`, and `DataFrame.corr()` raises on those in
# pandas >= 2.0 instead of silently skipping them.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

### Decision Tree Classifier

Test a small tree with 4 leaf nodes

In [None]:
m = DecisionTreeClassifier(max_leaf_nodes=4)
m.fit(xs, y);

In [None]:
draw_tree(m, xs, size=6, leaves_parallel=True, precision=2)

Have the model run predictions on the validation set for who would be transported

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
y_pred

Check how accurate our predictions were against the actual validation data

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy

In [None]:
samp_idx = np.random.permutation(len(y))[:500]

In [None]:
viz_model = dtreeviz.model(m, X_train=xs.iloc[samp_idx], y_train=y.iloc[samp_idx], target_name=dep_var)

In [None]:
viz_model.view(fontname="DejaVu Sans", scale=1.8, label_fontsize=10)

In [None]:
viz_model = dtreeviz.model(m, X_train=xs, y_train=y, target_name=dep_var)

In [None]:
viz_model.view(fontname="DejaVu Sans", scale=1.8, label_fontsize=10)

### Logistic Regression

Run the same test with a quick logistic regression model

In [None]:
m = LogisticRegression(max_iter=1000)
m.fit(xs, y);

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy

### All Leaves Decision Tree 

Run the same decision tree model, but with no limit on the number of leaves, so it keeps splitting until it can split no further

In [None]:
m = DecisionTreeClassifier()
m.fit(xs, y);

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy

In [None]:
m.get_n_leaves(), len(xs)

### Decision Tree with at Least 25 Samples per Leaf

Scale back by requiring at least 25 samples in every leaf (`min_samples_leaf=25`) as a happy medium

In [None]:
m = DecisionTreeClassifier(min_samples_leaf=25)
m.fit(xs, y);

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy

### Random Forest

Create a function to make it easier to try several variations in succession

In [None]:
def rf(xs, y, n_estimators=40, max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit and return a `RandomForestClassifier` on `xs` / `y`.

    Parameters
    ----------
    xs, y
        Training features and target, passed straight to `.fit`.
    n_estimators, max_features, min_samples_leaf
        Forwarded to `RandomForestClassifier`.
    **kwargs
        Any further `RandomForestClassifier` options (for example
        `random_state=42` for reproducible forests). They override the
        defaults `n_jobs=-1` and `oob_score=True` used by this helper.

    Returns
    -------
    The fitted classifier.
    """
    # Previously `**kwargs` was accepted but never used; merge it over the defaults
    # so callers can both add options and override `n_jobs` / `oob_score`.
    options = dict(n_jobs=-1, oob_score=True)
    options.update(kwargs)
    return RandomForestClassifier(n_estimators=n_estimators,
        max_features=max_features, min_samples_leaf=min_samples_leaf,
        **options).fit(xs, y)

In [None]:
m = rf(xs,y)

In [None]:
def pred_acc(m, valid_xs=valid_xs):
    """Return the accuracy of model `m` on a validation feature set.

    `valid_xs` defaults to the notebook's `valid_xs` as it was when this
    cell was run; pass a different frame (e.g. a column subset) to score
    against that instead. Predictions are always compared with the
    notebook-level `valid_y`.
    """
    return accuracy_score(valid_y, m.predict(valid_xs))
    

In [None]:
pred_acc(m)

In [None]:
preds = np.stack([t.predict(valid_xs.values) for t in m.estimators_])

In [None]:
valid_y.values

In [None]:
preds = np.stack([t.predict(valid_xs.values) for t in m.estimators_])

In [None]:
preds.shape

In [None]:
preds_std = preds.std(0)

In [None]:
preds_std[:5]

In [None]:
def rf_feat_importances(m, df):
    """Pair each column of `df` with the matching entry of `m.feature_importances_`.

    Returns a DataFrame with the columns `cols` and `imp`, sorted so that
    the largest `imp` comes first.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values(by='imp', ascending=False)

In [None]:
fi = rf_feat_importances(m, xs)
fi

In [None]:
def plot_fi(fi):
    """Plot `imp` against `cols` as a horizontal bar chart and return the plot object.

    `fi` is expected to offer a DataFrame-style `.plot` with `cols` and
    `imp` columns, such as the output of `rf_feat_importances`.
    """
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

In [None]:
plot_fi(fi)

In [None]:
cluster_columns(xs)

### Removing Low Importance Variables

In [None]:
to_keep = fi[fi.imp > 0.005].cols
len(to_keep), len(fi)

In [None]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
m = rf(xs_imp, y)

In [None]:
pred_acc(m, valid_xs=valid_xs_imp)

In [None]:
plot_fi(rf_feat_importances(m, xs_imp))

In [None]:
cluster_columns(xs_imp)

In [None]:
def get_oob(df, random_state=42):
    """Fit a 40-tree random forest on `df` and return its out-of-bag score.

    The target is the notebook-level `y`. This helper is used to compare
    feature subsets, so the forest is seeded by default: with a fixed
    `random_state`, a difference between two scores comes from the columns
    that were dropped and not from random variation between forests.
    Pass `random_state=None` for an unseeded forest.
    """
    m = RandomForestClassifier(n_estimators=40, min_samples_leaf=15, max_features=0.5,
                               n_jobs=-1, oob_score=True, random_state=random_state)
    m.fit(df, y)
    return m.oob_score_

In [None]:
get_oob(xs_imp)

In [None]:
xs_imp.columns

In [None]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in xs_imp.columns}

In [None]:
{c:get_oob(xs.drop(c, axis=1)) for c in xs.columns}

In [None]:
to_drop = ['group_num', 'deck', 'Name', 'group_size']

In [None]:
# `to_drop` lists `Name`, which was removed from `df` before the model data was built,
# and other entries may have been filtered out of `xs_imp` by the importance threshold.
# Ignore names that are not present instead of raising a KeyError.
get_oob(xs_imp.drop(to_drop, axis=1, errors='ignore'))

In [None]:
to_drop = ['Name']

In [None]:
# `Name` was removed from `df` before the model data was built, so it may not be a
# column of `xs_imp`; ignore missing names instead of raising a KeyError.
get_oob(xs_imp.drop(to_drop, axis=1, errors='ignore'))

In [None]:
get_oob(xs)

In [None]:
get_oob(xs_imp)

In [None]:
xs_final = xs_imp
valid_xs_final = valid_xs_imp

In [None]:
save_pickle(path/'xs_final.pkl', xs_final)
save_pickle(path/'valid_xs_final.pkl', valid_xs_final)

In [None]:
xs_final = load_pickle(path/'xs_final.pkl')
valid_xs_final = load_pickle(path/'valid_xs_final.pkl')

In [None]:
xs_final.head()

In [None]:
pred_acc(m, valid_xs=valid_xs_final)

### Tabular Learner Test

In [None]:
# The name `accuracy` was rebound to a float by the earlier
# `accuracy = accuracy_score(valid_y, y_pred)` cells, so import the fastai metric
# explicitly under a name that those cells do not overwrite.
from fastai.metrics import accuracy as accuracy_metric, error_rate
# `tabular_learner` needs a DataLoaders object; build one if `to` is a TabularPandas.
dls = to if isinstance(to, DataLoaders) else to.dataloaders(bs=128)
learn = tabular_learner(dls, metrics=[accuracy_metric, error_rate], layers=[170,100])

In [None]:
learn.lr_find(suggest_funcs=(slide, valley))

In [None]:
to