Titanic - Machine Learning from Disaster

Start here! Predict survival on the Titanic and get familiar with ML basics

https://www.kaggle.com/competitions/titanic/data?select=train.csv



In [52]:
# Import panda dataframe library
import pandas as pd
import numpy as np
import torch as torch
import torch.nn.functional as F

In [2]:
# Open the train and test datasets to determine which features are available in each dataset
train_df = pd.read_csv('datasets/titanic/train.csv')
test_df = pd.read_csv('datasets/titanic/test.csv')

In [3]:
# print the first 7 rows of the train dataset
train_df.head(7)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [155]:
# create target and features numpy arrays
target = train_df['Survived'].values
target = torch.from_numpy(target)

pclass = train_df['Pclass'].values

num_train_examples = len(train_df)

pgenders = train_df ['Sex'].values
genders = []

# convert the sex type to a tensor
for pgen in pgenders:
    if pgen == 'male':
        genders.append(0)
    elif pgen == 'female':
        genders.append(1)

genders = torch.tensor(genders)
pclass = torch.tensor(pclass)
features = torch.vstack((pclass, genders))
features = features.view(num_train_examples, -1)


train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
ages = train_df['Age'].values
ages = torch.tensor(ages)
ages = ages.view(num_train_examples, -1)
features = torch.hstack((features, ages))

sibsp = train_df['SibSp'].values
sibsp = torch.tensor(sibsp)
sibsp = sibsp.view(num_train_examples, -1)
features = torch.hstack((features, sibsp))

parch = train_df['Parch'].values
parch = torch.tensor(parch)
parch = parch.view(num_train_examples, -1)
features = torch.hstack((features, parch))

fare = train_df['Fare'].values
fare = torch.tensor(fare)
fare = fare.view(num_train_examples, -1)
features = torch.hstack((features, fare))

train_df['Embarked'].fillna('S', inplace=True)
pembarked = train_df['Embarked'].values
embarked = []

for embark in pembarked:
    if embark == 'S':
        embarked.append(0)
    elif embark == 'C':
        embarked.append(1)
    elif embark == 'Q':
        embarked.append(2)

# plot the embarked data
# import seaborn as sns
# import matplotlib.pyplot as plt
# sns.countplot(x='Embarked', data=train_df, palette='Set2')
# plt.show()

embarked = torch.tensor(embarked)
embarked = embarked.view(num_train_examples, -1)
features = torch.hstack((features, embarked))


features.shape

# train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
# train_df['Fare'].fillna(train_df['Fare'].median(), inplace=True)



torch.Size([891, 7])

In [160]:
# create a neural network with 1 input layer, 1 hidden layer, and 1 output layer
num_neurons = 10
num_features = features.shape[1]
input_size = features.shape[0]
output_size = 1
model = torch.nn.Sequential(
    torch.nn.Linear(num_features, num_neurons),
    torch.nn.Sigmoid(),
    torch.nn.Linear(num_neurons, output_size),
    torch.nn.Sigmoid(),
)

x = torch.tensor(features, dtype=torch.float32)
x = x.view(-1, num_features)
y = torch.tensor(target, dtype=torch.float32)
y = y.view(-1, 1)


  x = torch.tensor(features, dtype=torch.float32)
  y = torch.tensor(target, dtype=torch.float32)


In [161]:
# create a loss function
loss_fn = F.binary_cross_entropy

# create an optimizer
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# get a random batch of size 32 from the training data and labels
batch_size = 64

# train the model
for t in range(5000):
    batch_indices = np.random.choice(len(x), batch_size)
    x_batch = x[batch_indices]
    y_batch = y[batch_indices]
    lossi = []
    
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x_batch)

    # Compute and print loss.
    loss = loss_fn(y_pred, y_batch)
    print(t, loss.item())
    lossi.append(loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

print(f'Iter: {t}, Loss: {loss.item()}')


0 0.8314547538757324
1 0.8111908435821533
2 0.8433237075805664
3 0.7981516122817993
4 0.7770329117774963
5 0.8031013011932373
6 0.7839615941047668
7 0.7370577454566956
8 0.8025498986244202
9 0.8365551233291626
10 0.776642918586731
11 0.7743354439735413
12 0.7951095700263977
13 0.8296754360198975
14 0.7518041729927063
15 0.7571723461151123
16 0.7918925285339355
17 0.7587400078773499
18 0.752140998840332
19 0.7912794947624207
20 0.7490035891532898
21 0.7683532238006592
22 0.7623165845870972
23 0.7598690986633301
24 0.7414995431900024
25 0.756745457649231
26 0.7729028463363647
27 0.7649188041687012
28 0.7482271790504456
29 0.7333580851554871
30 0.7359834313392639
31 0.7374987006187439
32 0.745201826095581
33 0.7684834003448486
34 0.740414023399353
35 0.7267317771911621
36 0.7498468160629272
37 0.7603021860122681
38 0.7175865769386292
39 0.7195658087730408
40 0.7276151776313782
41 0.7582566142082214
42 0.7469895482063293
43 0.7273945212364197
44 0.7274839878082275
45 0.7412418723106384
46 

In [162]:
# Calculate the accuracy of the model
with torch.no_grad():
    y_pred = model(x)
    y_pred_class = y_pred.round()
    acc = y_pred_class.eq(y).sum() / float(y.shape[0])

print(f'Accuracy: {acc.item()}')


Accuracy: 0.7306397557258606


In [159]:
len(np.where(y_pred_class == 1)[0])

258

Baseline results:
epochs = 100 - a few thousand
features: {pclass, sex}: 0.61 accuracy with no predicted survivors

features: {pclass, sex, age}: 0.63 accuracy with 40 predicted survivors (some ones found in y_pred_class)

features: {pclass, sex, age  ... fare}: 0.69 accuracy with 265 predicted survivors.  It seems that fare has a lot of influence on survival as it raised the accuracy by 6% and the number of predicted survivors by 225.

Increasing the batch size also helped.  I went from 32 to 64 and got a 1% increase in accuracy.

In [None]:
# Evaluate model on test data
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
ages = test_df['Age'].values
ages = torch.tensor(ages)
ages = ages.view(len(test_df), -1)
