## Introduction and set up

In [1]:
import os
from pathlib import Path

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle: path = Path('../input/titanic')
else:
    competition = 'titanic'
    path = Path(f'/root/{competition}')
    if not path.exists():
        import zipfile,kaggle
        kaggle.api.competition_download_cli(competition, path=path)
        zipfile.ZipFile(f'{path}/titanic.zip').extractall(path)

Downloading titanic.zip to /root/titanic


100%|██████████| 34.1k/34.1k [00:00<00:00, 4.05MB/s]







In [2]:
from fastai.tabular.all import *

pd.options.display.float_format = '{:.2f}'.format
set_seed(42)

## Prep the data

In [3]:
df = pd.read_csv(path/'train.csv')

In [4]:
df['Title'] = df.Name.str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [5]:
dict(Mr="Mr",Miss="Miss",Mrs="Mrs",Master="Master")

{'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master'}

In [6]:
df.Title.map(dict(Mr="Mr",Miss="Miss",Mrs="Mrs",Master="Master"))

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     NaN
887    Miss
888    Miss
889      Mr
890      Mr
Name: Title, Length: 891, dtype: object

In [7]:
def add_features(df):
    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df.Cabin.str[0].map(dict(A="ABC", B="ABC", C="ABC", D="DE", E="DE", F="FG", G="FG"))
    df['Family'] = df.SibSp+df.Parch
    df['Alone'] = df.Family==1
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')
    df['Title'] = df.Name.str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    df['Title'] = df.Title.map(dict(Mr="Mr",Miss="Miss",Mrs="Mrs",Master="Master")).value_counts(dropna=False)

add_features(df)

In [8]:
df[~df["Title"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,LogFare,Deck,Family,Alone,TicketFreq


In [9]:
splits = RandomSplitter(seed=42)(df)

In [10]:
df['Title'].unique()

array([nan])

In [11]:
dls = TabularPandas(
    df, splits=splits,
    procs = [Categorify, FillMissing, Normalize],
    cat_names=["Sex","Pclass","Embarked","Deck", "Title"],
    cont_names=['Age', 'SibSp', 'Parch', 'LogFare', 'Alone', 'TicketFreq', 'Family'],
    y_names="Survived", y_block = CategoryBlock(),
).dataloaders(path=".")

## Train the model

In [13]:
learn = tabular_learner(dls, metrics=accuracy, layers=[10,10])

In [14]:
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(3, 3)
    (1): Embedding(4, 3)
    (2): Embedding(4, 3)
    (3): Embedding(4, 3)
    (4): Embedding(1, 2)
    (5): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=24, out_features=10, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=10, out_features=10, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=10, out_features=2, bias=True)
    )
  )
)

In [15]:
learn.n_emb

17

In [16]:
len(dls.cont_names) + 5*3 + 2

24

In [21]:
doc(get_emb_sz)

In [22]:
dls.cat_names

(#6) ['Sex','Pclass','Embarked','Deck','Title','Age_na']

In [23]:
dls.classes

{'Sex': ['#na#', 'female', 'male'],
 'Pclass': ['#na#', 1, 2, 3],
 'Embarked': ['#na#', 'C', 'Q', 'S'],
 'Deck': ['#na#', 'ABC', 'DE', 'FG'],
 'Title': ['#na#'],
 'Age_na': ['#na#', False, True]}

In [20]:
get_emb_sz(dls.train_ds)

[(3, 3), (4, 3), (4, 3), (4, 3), (1, 2), (3, 3)]

In [None]:
learn.lr_find(suggest_funcs=(slide, valley))

In [None]:
learn.fit(16, lr=0.03)

## Submit to Kaggle

In [None]:
tst_df = pd.read_csv(path/'test.csv')
tst_df['Fare'] = tst_df.Fare.fillna(0)
add_features(tst_df)

In [None]:
tst_dl = learn.dls.test_dl(tst_df)

In [None]:
preds,_ = learn.get_preds(dl=tst_dl)

In [None]:
tst_df['Survived'] = (preds[:,1]>0.5).int()
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('sub.csv', index=False)

In [None]:
!head sub.csv

## Ensembling

In [None]:
def ensemble():
    learn = tabular_learner(dls, metrics=accuracy, layers=[10,10])
    with learn.no_bar(),learn.no_logging(): learn.fit(16, lr=0.03)
    return learn.get_preds(dl=tst_dl)[0]

In [None]:
learns = [ensemble() for _ in range(5)]

In [None]:
ens_preds = torch.stack(learns).mean(0)

In [None]:
tst_df['Survived'] = (ens_preds[:,1]>0.5).int()
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('ens_sub.csv', index=False)

## Final thoughts