# Homework 3: Recreating Titanic Survival Model in Python

In [1]:
import fastai
from fastai.vision.all import *
import pandas as pd
import numpy as np

## Loading in the data

In [2]:
df_train = pd.read_csv("./titanic/train.csv")
df_test = pd.read_csv("./titanic/test.csv")

Let's get a picture of what the data looks like

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Get descriptive statistics of the numeric fields

In [4]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Clean the data

In [5]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

For the purposes of this exercise, we don't care about the cabin, name, ticket number, or id of the passenger so we will drop those columns

In [6]:
df_train.drop(["Name", "PassengerId", "Cabin", "Ticket"], axis=1, inplace=True)
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


To handle the NAs found in the Age and Embarked columns, we will just drop the rows with missing values. I acknowledge that there are other methods to sample rows or add median values to avoid dropping data but the purpose of this notebook is simply to practice creating neural networks to make predictions so we will just drop the missing data.

In [7]:
df_train.dropna(inplace=True)

In [8]:
df_train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [9]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


We will now normalize the age by dividing it by the max age value in the dataset to get age as a percentile.

In [10]:
df_train["Age_N"] = df_train["Age"] / df_train["Age"].max()

Since fare doesn't increase linearly, we will take the log of Fare to create a more even distribution. If fare is 0, this will cause issues with log so we will remove these columns

In [11]:
df_train.drop(df_train[df_train["Fare"]==0].index, inplace=True)

In [12]:
df_train["logFare"] = np.log10(df_train["Fare"])

Lastly, we need to create dummy variables for our categorical variables embarked, sex, and Pclass

In [13]:
df_train["isMale"] = (df_train["Sex"] == "male").astype(int)

In [14]:
pclass_dummies = pd.get_dummies(df_train["Pclass"], drop_first=True, prefix="Pclass")
embarked_dummies = pd.get_dummies(df_train["Embarked"], prefix="Embarked", drop_first=True)
df_train = pd.concat([df_train, pclass_dummies, embarked_dummies], axis=1)

Now, let's drop all the rows of the old columns that are no longer in use.

In [15]:
df_train.drop(["Pclass", "Sex", "Age", "Fare", "Embarked"], axis=1, inplace=True)

We need to add a column of ones as a trick for adding the constant value.

In [16]:
df_train["Ones"] = 1

Let's pull out survived from the main df

In [17]:
y_train = df_train.pop("Survived")

In [18]:
df_train

Unnamed: 0,SibSp,Parch,Age_N,logFare,isMale,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Ones
0,1,0,0.2750,0.860338,1,0,1,0,1,1
1,1,0,0.4750,1.852988,0,0,0,0,0,1
2,0,0,0.3250,0.898999,0,0,1,0,1,1
3,1,0,0.4375,1.725095,0,0,0,0,1,1
4,0,0,0.4375,0.905796,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...
885,0,5,0.4875,1.464266,0,0,1,1,0,1
886,0,0,0.3375,1.113943,1,1,0,0,1,1
887,0,0,0.2375,1.477121,0,0,0,0,1,1
889,0,0,0.3250,1.477121,1,0,0,0,0,1


Our data looks ready for modeling. Let's begin

## Modeling: Regression
We will start simple with a linear regression

We need to define our initial parameters

In [252]:
params = torch.randn(df_train.shape[1]).reshape(-1,1).requires_grad_()
params, params.shape

(tensor([[-0.8599],
         [-0.5173],
         [-1.0287],
         [-0.3223],
         [-0.4777],
         [ 0.4437],
         [-0.9848],
         [-0.3214],
         [ 0.8510],
         [ 0.4730]], requires_grad=True),
 torch.Size([10, 1]))

In [253]:
train = torch.tensor(df_train.values).float()
train.shape

torch.Size([705, 10])

In [254]:
y = torch.tensor(y_train.values).reshape(-1,1).float()
y.shape

torch.Size([705, 1])

In [255]:
preds = train.matmul(params)

In [256]:
loss = F.mse_loss(preds, y)
loss

tensor(3.7665, grad_fn=<MseLossBackward0>)

In [257]:
loss.backward()
params.grad

tensor([[-3.3026],
        [-2.5616],
        [-0.9630],
        [-4.1213],
        [-1.6446],
        [-0.2181],
        [-1.8191],
        [-0.2211],
        [-1.8807],
        [-2.8865]])

In [258]:
params.data -= params.grad.data*.1
params.grad = None
params

tensor([[-0.5296],
        [-0.2612],
        [-0.9324],
        [ 0.0898],
        [-0.3132],
        [ 0.4655],
        [-0.8029],
        [-0.2993],
        [ 1.0391],
        [ 0.7617]], requires_grad=True)

In [259]:
def validate_epoch(x,y,params):
    preds = x.matmul(params).sigmoid()
    correct = (preds>0.5) == y
    return correct.float().mean()

Now that we've completed one step of gradient descent, let's create a function that we can loop to train more epochs.

In [264]:
def one_step(data, y, params, lr, prn=True):
    pred = data.matmul(params)
    loss = F.mse_loss(pred, y)
    loss.backward()
    with torch.no_grad():
        params -= params.grad * lr
        params.grad = None
    if prn: print(loss.item())
    return pred

Now let's loop this for some training epochs

In [373]:
n_epochs = 10
for i in range(n_epochs):
    one_step(train,y,params,0.01)

0.14759643375873566
0.14759184420108795
0.14758725464344025
0.14758270978927612
0.1475781500339508
0.14757360517978668
0.14756907522678375
0.14756454527378082
0.1475600153207779
0.14755551517009735


Let's now calculate the accuracy of our model.

In [384]:
preds = train.matmul(params)
((preds < 0) == y).float().mean()

tensor(0.5390)

Now let's try creating a neural net instead. The difference is that above we are simply doing a regression and with a neural net there need to be multiple layers. We also need an activation function between the layers, otherwise it is still just one more complicated linear function and thus regression.

### Neural Net

In [390]:
nn_params1 = torch.randn(train.shape[1]).requires_grad_()
nn_params2 = torch.randn(train.shape[1]).requires_grad_()
nn_params1, nn_params2

(tensor([ 0.1044, -0.0250,  0.5872,  0.5903, -0.3698, -0.4453,  0.7028,  1.5119,
          0.4524, -0.2436], requires_grad=True),
 tensor([-2.2570, -0.0250, -1.3394, -0.1588, -1.3354,  1.3935,  0.7422,  0.0720,
         -0.6119, -0.3452], requires_grad=True))

In [397]:
def step_nn(data,y,p1,p2,lr):
    relu1 = data.matmul(p1).clip(min=0)
    relu2 = data.matmul(p2).clip(min=0)
    pred = (relu1 + relu2).reshape(-1,1)
    loss = F.mse_loss(pred, y)
    loss.backward()
    with torch.no_grad():
        p1 -= p1.grad * lr
        p1.grad = None
        p2 -= p2.grad * lr
        p2.grad = None
    print(loss.item())
    return pred

In [400]:
nn_epochs = 20
for i in range(nn_epochs):
    step_nn(train,y,nn_params1,nn_params2,0.01)

0.350899875164032
0.3499789834022522
0.34905850887298584
0.34813618659973145
0.3472136855125427
0.3463018238544464
0.34540051221847534
0.3445095121860504
0.34362417459487915
0.3427416980266571
0.34185707569122314
0.34096217155456543
0.34007716178894043
0.3392018973827362
0.3383362293243408
0.3374800384044647
0.33663323521614075
0.33579567074775696
0.334964781999588
0.334123432636261


Check accuracy of neural net

In [402]:
preds = train.matmul(nn_params1).clip(0) + train.matmul(nn_params2).clip(0)
((preds < 0) == y).float().mean()

tensor(0.5929)

This accuracy is slightly better than our linear regression above which is as expected. However, we are still less than 60% accurate. Note: we are running these accuracy calculations on the training set which is a bad practice. This is simply because this notebook is just for practice and I decided to skip reperforming the data mutations on the test set.