Titanic - Machine Learning from Disaster

Start here! Predict survival on the Titanic and get familiar with ML basics

https://www.kaggle.com/competitions/titanic/data?select=train.csv



In [352]:
# Import panda dataframe library
import pandas as pd
import numpy as np
import torch as torch
import torch.nn.functional as F

In [353]:
# Open the train and test datasets to determine which features are available in each dataset
train_df = pd.read_csv('datasets/titanic/train.csv')
test_df = pd.read_csv('datasets/titanic/test.csv')

In [354]:
# print the first 7 rows of the train dataset
train_df.head(7)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [355]:
# create target tensors
target = train_df['Survived'].values
target = torch.from_numpy(target)

In [356]:
def get_features(df):
    pclass = df['Pclass'].values
    num_train_examples = len(df)

    pgenders = df ['Sex'].values
    genders = []

    for pgen in pgenders:
        if pgen == 'male':
            genders.append(0)
        elif pgen == 'female':
            genders.append(1)

    genders = torch.tensor(genders)
    pclass = torch.tensor(pclass)
    features = torch.vstack((pclass, genders))
    features = features.view(num_train_examples, -1)

    df['Age'].fillna(df['Age'].median(), inplace=True)
    ages = df['Age'].values
    ages = torch.tensor(ages)
    ages = ages.view(num_train_examples, -1)
    features = torch.hstack((features, ages))

    sibsp = df['SibSp'].values
    sibsp = torch.tensor(sibsp)
    sibsp = sibsp.view(num_train_examples, -1)
    features = torch.hstack((features, sibsp))

    parch = df['Parch'].values
    parch = torch.tensor(parch)
    parch = parch.view(num_train_examples, -1)
    features = torch.hstack((features, parch))

    fare = df['Fare'].values
    fare = torch.tensor(fare)
    fare = fare.view(num_train_examples, -1)
    features = torch.hstack((features, fare))

    df['Embarked'].fillna('S', inplace=True)
    pembarked = df['Embarked'].values
    embarked = []

    for embark in pembarked:
        if embark == 'S':
            embarked.append(0)
        elif embark == 'C':
            embarked.append(1)
        elif embark == 'Q':
            embarked.append(2)

    embarked = torch.tensor(embarked)
    embarked = embarked.view(num_train_examples, -1)
    features = torch.hstack((features, embarked))

    return features

features = get_features(train_df)

In [383]:
# create a neural network with 1 input layer, 1 hidden layer, and 1 output layer
num_neurons = 128
num_features = features.shape[1]
output_size = 1
# add a batchnorm layer
model = torch.nn.Sequential(
    torch.nn.Linear(num_features, num_neurons),
    torch.nn.BatchNorm1d(num_neurons),
    torch.nn.Sigmoid(),
    torch.nn.Linear(num_neurons, num_neurons),
    torch.nn.BatchNorm1d(num_neurons),
    torch.nn.Sigmoid(),
    torch.nn.Linear(num_neurons, output_size),
    torch.nn.BatchNorm1d(output_size),
    torch.nn.Sigmoid(),
)

x = torch.tensor(features, dtype=torch.float32)
x = x.view(-1, num_features)
y = torch.tensor(target, dtype=torch.float32)
y = y.view(-1, 1)


  x = torch.tensor(features, dtype=torch.float32)
  y = torch.tensor(target, dtype=torch.float32)


In [385]:
# create a loss function
loss_fn = F.binary_cross_entropy

# create an optimizer
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# get a random batch of size 32 from the training data and labels
batch_size = 128

# train the model
for t in range(3000):
    # batch_indices = np.random.choice(len(x), batch_size)
    # x_batch = x[batch_indices]
    # y_batch = y[batch_indices]
    # lossi = []
    
    # Forward pass: compute predicted y by passing x to the model.
    # y_pred = model(x_batch)
    y_pred = model(x)

    # Compute and print loss.
    # loss = loss_fn(y_pred, y_batch)
    loss = loss_fn(y_pred, y)

    print(t, loss.item())
    # lossi.append(loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Iter: {t}, Loss: {loss.item()}')


0 0.5119378566741943
1 0.7735791802406311
2 0.6904968023300171
3 0.6167393326759338
4 0.5871138572692871
5 0.5825685262680054
6 0.5869064331054688
7 0.5928470492362976
8 0.5979551076889038
9 0.6016548275947571
10 0.6039842963218689
11 0.6051516532897949
12 0.6053930521011353
13 0.6049279570579529
14 0.6039498448371887
15 0.6026267409324646
16 0.6011017560958862
17 0.5994947552680969
18 0.5979025959968567
19 0.5963982939720154
20 0.5950303077697754
21 0.5938242673873901
22 0.5927823185920715
23 0.5918864011764526
24 0.5911020636558533
25 0.5903810262680054
26 0.5896672606468201
27 0.5889009833335876
28 0.5880224108695984
29 0.5869786143302917
30 0.5857258439064026
31 0.5842368602752686
32 0.5825046896934509
33 0.5805477499961853
34 0.5784153938293457
35 0.5761865377426147
36 0.5739653706550598
37 0.5718625783920288
38 0.5699670314788818
39 0.5683057904243469
40 0.5668174624443054
41 0.5653751492500305
42 0.5638894438743591
43 0.5624517798423767
44 0.5613003373146057
45 0.560363829135894

In [386]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model:
  layer.training = False

In [387]:
print(x.shape)

# Calculate the accuracy of the model
with torch.no_grad():
    y_pred = model(x)
    y_pred_class = y_pred.round()
    acc = y_pred_class.eq(y).sum() / float(y.shape[0])

print(f'Accuracy: {acc.item()}')


torch.Size([891, 7])
Accuracy: 0.8092031478881836


In [388]:
len(np.where(y_pred_class == 1)[0])

316

Baseline results:
epochs = 100 - a few thousand
features: {pclass, sex}: 0.61 accuracy with no predicted survivors

features: {pclass, sex, age}: 0.63 accuracy with 40 predicted survivors (some ones found in y_pred_class)

features: {pclass, sex, age  ... fare}: 0.69 accuracy with 265 predicted survivors.  It seems that fare has a lot of influence on survival as it raised the accuracy by 6% and the number of predicted survivors by 225.

Increasing the batch size also helped.  I went from 32 to 64 and got a 1% increase in accuracy.

In [389]:
# Evaluate model on test data

# Calculate the accuracy of the model

with torch.no_grad():
    test_df_features = get_features(test_df)
    test_df_features = torch.tensor(test_df_features, dtype=torch.float32)
    test_df_features = test_df_features.view(-1, test_df_features.shape[1])

    y_pred_test = model(test_df_features)
    y_pred_test_class = y_pred_test.round()
    y_pred_test_class = y_pred_test_class.view(-1)

print("# of dead: ",len(np.where(y_pred_test_class == 0)[0]))
print("# of survivors: ",len(np.where(y_pred_test_class == 1)[0]))

# of dead:  263
# of survivors:  154


  test_df_features = torch.tensor(test_df_features, dtype=torch.float32)


In [390]:
test_predictions = torch.tensor(y_pred_test_class, dtype=torch.int32)
test_predictions.shape

# Write predictions to file
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': test_predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


  test_predictions = torch.tensor(y_pred_test_class, dtype=torch.int32)


Ideas for improving 

- create a validation set
- add dropout
- add regularization
- add more layers
- add more neurons
- add more epochs
- add more features
- add more data
- use a transformer model
