In [61]:
#|export

# Use the FastAi tabular library for the Titanic dataset
from fastai.tabular.all import *
import pandas as pd

In [62]:
#|export
df = pd.read_csv("../data/raw/train.csv")
print(df.columns)
print(df.size)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
10692


In [63]:
#|export

# Find honorifics of each passenger, and create categorical column
def get_honorific(name: str) -> str:
    first_name = name.strip().split(",")[-1]
    return first_name.strip().split(" ")[0]

In [29]:
df["Name"] = df["Name"].apply(get_honorific)
df.rename(columns={"Name": "Honorific"}, inplace=True)
df.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Honorific,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr.,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs.,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss.,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs.,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr.,male,35.0,0,0,373450,8.05,,S
5,6,0,3,Mr.,male,,0,0,330877,8.4583,,Q
6,7,0,1,Mr.,male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,Master.,male,2.0,3,1,349909,21.075,,S
8,9,1,3,Mrs.,female,27.0,0,2,347742,11.1333,,S
9,10,1,2,Mrs.,female,14.0,1,0,237736,30.0708,,C


In [64]:
# Find cabin types for passengers

def get_cabin_number(cabins: str):
    if pd.isna(cabins):
        return "NC"
    else:
        return cabins.split(" ")[0][0]

In [51]:
df["Cabin"] = df["Cabin"].apply(get_cabin_number)
df.rename(columns={"Cabin": "CabinType"}, inplace=True)

NC
C
NC
C
NC
NC
E
NC
NC
NC
G
C
NC
NC
NC
NC
NC
NC
NC
NC
NC
D
NC
A
NC
NC
NC
C
NC
NC
NC
B
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
D
NC
B
C
NC
NC
NC
NC
NC
B
C
NC
NC
NC
F
NC
NC
NC
NC
NC
NC
NC
NC
F
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
C
NC
NC
NC
E
NC
NC
NC
A
D
NC
NC
NC
NC
D
NC
NC
NC
NC
NC
NC
NC
C
NC
NC
NC
NC
NC
NC
NC
B
NC
NC
NC
NC
E
D
NC
NC
NC
F
NC
NC
NC
NC
NC
NC
NC
D
C
NC
B
NC
NC
NC
NC
NC
NC
NC
NC
F
NC
NC
C
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
E
NC
NC
NC
B
NC
NC
NC
A
NC
NC
C
NC
NC
NC
NC
NC
F
NC
A
NC
NC
NC
NC
NC
NC
NC
F
B
B
NC
NC
NC
NC
NC
NC
NC
NC
NC
G
NC
NC
NC
A
NC
NC
NC
NC
NC
D
NC
NC
D
NC
NC
NC
NC
NC
C
NC
NC
NC
NC
NC
C
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
NC
C
NC
NC
D
NC
NC
G
C
NC
NC
NC
NC
B
NC
NC
NC
NC
E
B
NC
NC
NC
NC
C
C
NC
NC
NC
C
NC
D
NC
NC
NC
NC
NC
NC
NC
NC
A
NC
NC
NC
NC
NC
NC
B
D
NC
NC
NC
NC
C
C
B
NC
NC
NC
E
NC
C
NC
C
NC
E
C
B
NC
NC
NC
NC
NC
NC
C
E
NC
NC
NC
NC
NC
C
NC
D
NC
B
NC
C
C
NC
NC
NC
C
E
NC
T
F
C
NC
NC
NC
F
NC
NC
NC
NC
NC
C
NC
NC
NC
NC
E
NC
NC
NC
NC
N

In [71]:
def apply_dataframe_transformations(path_to_csv: str):
    df = pd.read_csv(path_to_csv)
    
    # Replace names with honorifics
    df["Name"] = df["Name"].apply(get_honorific)
    df.rename(columns={"Name": "Honorific"}, inplace=True)
    
    # Replace cabin numbers with cabin types
    df["Cabin"] = df["Cabin"].apply(get_cabin_number)
    df.rename(columns={"Cabin": "CabinType"}, inplace=True)
    
    return df

In [72]:
df = apply_dataframe_transformations("../data/raw/train.csv")

In [73]:
categorical_features = ["Pclass", "Sex", "Honorific", "CabinType"]
continuous_features = ["Age", "SibSp", "Parch"]
y_column = "Survived"

In [74]:
data_split = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

In [75]:
tabular_obj = TabularPandas(df, procs=[Categorify, FillMissing, Normalize],
                   cat_names=categorical_features,
                   cont_names=continuous_features,
                   y_names=y_column,
                   y_block=CategoryBlock,
                   splits=data_split)

In [55]:
doc(TabularPandas)

In [76]:
tabular_obj.xs.iloc[:5]

Unnamed: 0,Pclass,Sex,Honorific,CabinType,Age_na,Age,SibSp,Parch
788,3,2,8,8,1,-2.182087,0.461591,2.034995
525,3,2,12,8,1,0.86192,-0.462887,-0.464491
821,3,2,12,8,1,-0.178437,-0.462887,-0.464491
253,3,2,12,8,1,0.052753,0.461591,-0.464491
374,3,1,9,8,1,-2.02796,2.310547,0.785252


In [77]:
dls = tabular_obj.dataloaders(bs=64)
dls.show_batch()

Unnamed: 0,Pclass,Sex,Honorific,CabinType,Age_na,Age,SibSp,Parch,Survived
0,2,male,Mr.,NC,True,28.0,1.689237e-09,-9.897945e-09,1
1,3,male,Mr.,NC,True,28.0,1.689237e-09,-9.897945e-09,0
2,2,female,Mrs.,NC,False,50.0,1.689237e-09,1.0,1
3,3,female,Miss.,NC,False,27.0,1.689237e-09,-9.897945e-09,1
4,2,female,Mrs.,NC,False,55.0,1.689237e-09,-9.897945e-09,1
5,3,female,Miss.,NC,True,28.0,1.689237e-09,-9.897945e-09,0
6,3,female,Miss.,NC,False,23.0,1.689237e-09,-9.897945e-09,1
7,3,male,Mr.,NC,True,28.0,1.689237e-09,-9.897945e-09,0
8,1,male,Mr.,A,False,31.0,1.689237e-09,-9.897945e-09,0
9,3,female,Miss.,NC,False,14.499999,1.0,-9.897945e-09,0


In [78]:
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,time
0,0.682359,0.699733,0.44382,00:00
1,0.653146,0.682764,0.494382,00:00
2,0.626231,0.616174,0.780899,00:00
3,0.583666,0.500262,0.820225,00:00
4,0.547216,0.492122,0.820225,00:00
5,0.521429,0.43508,0.808989,00:00
6,0.494242,0.435866,0.797753,00:00
7,0.477464,0.440392,0.820225,00:00
8,0.460227,0.436112,0.814607,00:00
9,0.446364,0.436278,0.814607,00:00


In [85]:
doc(Learner.get_preds)

In [97]:
test_df = apply_dataframe_transformations("../data/raw/test.csv")
passenger_ids = test_df["PassengerId"]

In [98]:
test_dls = learn.dls.test_dl(test_df)

In [99]:
_, idx = torch.max(learn.get_preds(dl=test_dls)[0], dim=1)
idx

tensor([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
        1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,

In [110]:
predicted_results = pd.DataFrame({"PassengerId": passenger_ids, "Survived": idx})
predicted_results.to_csv("submission.csv", index=False)