In [3]:
#|export

# Use the fastai tabular library for the Titanic dataset
from fastai.tabular.all import *
import pandas as pd

In [8]:
#|export
# Load the Titanic training CSV (relative path) into a DataFrame.
df = pd.read_csv("../datasets/train.csv")
# Quick sanity check of the available columns.
print(df.columns)
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the row count.
print(df.size)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
10692


In [9]:
#|export

# Find honorifics of each passenger, and create categorical column
def get_honorific(name: str) -> str:
    """Extract the honorific (title) from a passenger name.

    Names are expected to look like ``"Surname, Title. Given names"``. The text
    after the last comma is split on whitespace and the first token ending in a
    period is returned, so ``"Rothes, the Countess. of (...)"`` yields
    ``"Countess."`` rather than the leading article ``"the"``.

    Args:
        name: Raw passenger name.

    Returns:
        The honorific including its trailing period. If no token ends in a
        period, the first token after the last comma is returned; an empty
        string is returned when nothing follows the comma.
    """
    # rpartition keeps the whole string when there is no comma, matching split(",")[-1].
    after_comma = name.rpartition(",")[2]
    # Whitespace split tolerates tabs and repeated spaces between tokens.
    tokens = after_comma.split()
    for token in tokens:
        if token.endswith("."):
            return token
    return tokens[0] if tokens else ""

In [12]:
# Replace each full passenger name with its honorific and rename the column to match.
# Guarded and non-mutating, so re-running this cell is a no-op instead of raising
# KeyError once "Name" has already been renamed. assign() overwrites the existing
# column in place, so "Honorific" keeps the position "Name" had.
if "Name" in df.columns:
    df = df.assign(Name=df["Name"].apply(get_honorific)).rename(
        columns={"Name": "Honorific"}
    )
df.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Honorific,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr.,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs.,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss.,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs.,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr.,male,35.0,0,0,373450,8.05,,S
5,6,0,3,Mr.,male,,0,0,330877,8.4583,,Q
6,7,0,1,Mr.,male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,Master.,male,2.0,3,1,349909,21.075,,S
8,9,1,3,Mrs.,female,27.0,0,2,347742,11.1333,,S
9,10,1,2,Mrs.,female,14.0,1,0,237736,30.0708,,C


In [13]:
# Input columns grouped by how they are passed to TabularPandas, plus the target column.
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
    "Honorific",
]
continuous_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
y_column = "Survived"

In [14]:
# Randomly hold out 20% of the row indices for validation; the fixed seed keeps the split reproducible.
data_split = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

In [15]:
# Build the preprocessed tabular dataset: the procs encode categorical columns,
# fill missing values, and normalize the continuous columns; "Survived" is the
# categorical target and data_split supplies the train/validation indices.
tabular_obj = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=categorical_features,
    cont_names=continuous_features,
    y_names=y_column,
    y_block=CategoryBlock,
    splits=data_split,
)

In [16]:
# Interactive documentation lookup for TabularPandas; its result is not used by later cells.
doc(TabularPandas)

In [17]:
# Peek at the first five rows of the processed feature matrix (encoded categoricals, normalized continuous values).
tabular_obj.xs.iloc[:5]

Unnamed: 0,Pclass,Sex,Embarked,Honorific,Age_na,Age,SibSp,Parch,Fare
788,3,2,3,8,1,-2.182087,0.461591,2.034995,-0.226999
525,3,2,2,12,1,0.86192,-0.462887,-0.464491,-0.477002
821,3,2,3,12,1,-0.178437,-0.462887,-0.464491,-0.459214
253,3,2,3,12,1,0.052753,0.461591,-0.464491,-0.314232
374,3,1,3,9,1,-2.02796,2.310547,0.785252,-0.217253


In [88]:
# Wrap the processed data in DataLoaders with a batch size of 64.
dls = tabular_obj.dataloaders(bs=64)
# NOTE(review): the saved output of this cell labels the honorific column "Name" and its
# execution count (88) is out of sequence — it looks stale; confirm with Restart & Run All.
dls.show_batch()

Unnamed: 0,Pclass,Sex,Embarked,Name,Age_na,Age,SibSp,Parch,Fare,Survived
0,2,male,S,Mr.,True,28.0,1.689237e-09,-9.897945e-09,7.312175e-07,0
1,1,male,S,Mr.,False,31.0,1.0,-9.897945e-09,52.0,0
2,2,male,S,Mr.,False,33.0,1.689237e-09,-9.897945e-09,12.275,0
3,2,female,S,Miss.,False,8.0,1.689237e-09,2.0,26.25,1
4,1,female,S,Mrs.,False,35.0,1.0,-9.897945e-09,90.0,1
5,1,female,S,Mrs.,False,45.000001,1.0,1.0,164.8667,1
6,3,male,S,Mr.,True,28.0,1.689237e-09,-9.897945e-09,8.049999,0
7,3,male,S,Mr.,False,30.0,1.0,-9.897945e-09,16.1,0
8,1,male,S,Mr.,True,28.0,1.689237e-09,-9.897945e-09,50.0,0
9,3,male,S,Mr.,True,28.0,1.689237e-09,-9.897945e-09,7.775,1


In [89]:
# Create a tabular learner on the DataLoaders that reports accuracy on the validation set.
learn = tabular_learner(dls, metrics=accuracy)
# Train for 10 epochs using the one-cycle schedule.
learn.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,time
0,0.760395,0.699174,0.404494,00:00
1,0.690975,0.651279,0.488764,00:00
2,0.644063,0.601125,0.741573,00:00
3,0.604146,0.505785,0.825843,00:00
4,0.568561,0.443161,0.842697,00:00
5,0.538467,0.422935,0.831461,00:00
6,0.512241,0.435957,0.831461,00:00
7,0.492779,0.42675,0.820225,00:00
8,0.475763,0.422791,0.820225,00:00
9,0.462337,0.421396,0.820225,00:00


In [18]:
# Interactive documentation lookup for tabular_learner; its result is not used elsewhere.
doc(tabular_learner)