### Spaceship Titanic with fastai

Competition [Link](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In [1]:
#|default_exp app

In [2]:
#The Following cell of code is used everytime FASTAI library is used.
#They tell the notebook to reload any changes made to any libraries used.
#They also ensure that any graphs are plotted are shown in this notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline

### Imports

In [3]:
from fastai.tabular.all import *
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sbs
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG
import dtreeviz

### Import Datasets and Create Dataframe

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [6]:
#| export
path = Path('spaceship-titanic')

In [7]:
#| export
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))    
    zipfile.ZipFile(f'{path}.zip').extractall(path)

In [8]:
#| export
if iskaggle:
    path = Path('../input/spaceship-titanic')
    ! pip install -q dataset

In [9]:
#| export
df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sample_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)

In [10]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [11]:
df['CryoSleep'] = df['CryoSleep'].map({True: '1', False: '0'}).astype(bool)
df['VIP'] = df['VIP'].map({True: '1', False: '0'}).astype(bool)
df['Transported'] = df['Transported'].map({True: '1', False: '0'}).astype(bool)

In [12]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,True,B/0/P,TRAPPIST-1e,39.0,True,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,True
1,0002_01,Earth,True,F/0/S,TRAPPIST-1e,24.0,True,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,True,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,True
3,0003_02,Europa,True,A/0/S,TRAPPIST-1e,33.0,True,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,True
4,0004_01,Earth,True,F/1/S,TRAPPIST-1e,16.0,True,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(5)
memory usage: 772.6+ KB


In [None]:
df.columns

### Preparing Data

In [None]:
#| export
dep_var = 'Transported'

In [None]:
df['Destination'].sample

In [None]:
#| export
procs = [BooleanCategorify, Categorify, FillMissing]

In [None]:
#| export
train, valid = train_test_split(df, test_size=.2, random_state=42)

In [None]:
len(train), len(valid)

In [None]:
#| export
train_idx = np.array(train.index)
valid_idx = np.array(valid.index)
train_idx, valid_idx

In [None]:
u, c = np.unique(valid_idx, return_counts=True)
dup = u[c > 1]
dup

In [None]:
#| export
splits = (list(train_idx), list(valid_idx))

In [None]:
#| export
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)

In [None]:
len(to.train), len(to.valid)

In [None]:
to.items.head()

In [None]:
save_pickle(path/'space_titan_to.pkl', to)

In [None]:
to = load_pickle(path/'space_titan_to.pkl')

In [None]:
to.items['PassengerId']

In [None]:
zero_idx_check = to.items.loc[to.items['PassengerId'] == 1]

In [None]:
zero_idx_check

In [None]:
df.head()

In [None]:
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

### Decision Tree Classifier

In [None]:
m = DecisionTreeClassifier(max_leaf_nodes=4)
m.fit(xs, y);

In [None]:
draw_tree(m, xs, size=6, leaves_parallel=True, precision=2)

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy

In [None]:
samp_idx = np.random.permutation(len(y))[:500]

In [None]:
viz_model = dtreeviz.model(m, X_train=xs.iloc[samp_idx], y_train=y.iloc[samp_idx], target_name=dep_var)

In [None]:
viz_model.view(fontname="DejaVu Sans", scale=1.8, label_fontsize=10)

### Logistic Regression

In [None]:
m = LogisticRegression(max_iter=1000)
m.fit(xs, y);

In [None]:
y_pred = m.predict(valid_xs)

In [None]:
accuracy = accuracy_score(valid_y, y_pred)
accuracy