Titanic - Machine Learning from Disaster

Start here! Predict survival on the Titanic and get familiar with ML basics

https://www.kaggle.com/competitions/titanic/data?select=train.csv



In [97]:
# Import panda dataframe library
import pandas as pd
import numpy as np
import torch as torch
import torch.nn.functional as F

In [98]:
# Open the train and test datasets to determine which features are available in each dataset
train_df = pd.read_csv('datasets/titanic/train.csv')
test_df = pd.read_csv('datasets/titanic/test.csv')

In [99]:
# print the first 7 rows of the train dataset
train_df.head(7)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [100]:
# create target tensors
target = train_df['Survived'].values
target = torch.from_numpy(target)

In [101]:
def get_features(df):
    pclass = df['Pclass'].values
    num_train_examples = len(df)

    pgenders = df ['Sex'].values
    genders = []

    for pgen in pgenders:
        if pgen == 'male':
            genders.append(0)
        elif pgen == 'female':
            genders.append(1)

    genders = torch.tensor(genders)
    genders = genders.view(num_train_examples, -1)
    pclass = torch.tensor(pclass)
    pclass = pclass.view(num_train_examples, -1)
    features = torch.hstack((pclass, genders))
    features = features.view(num_train_examples, -1)

    df['Age'].fillna(df['Age'].median(), inplace=True)
    ages = df['Age'].values
    ages = torch.tensor(ages)
    ages = ages.view(num_train_examples, -1)
    features = torch.hstack((features, ages))

    sibsp = df['SibSp'].values
    sibsp = torch.tensor(sibsp)
    sibsp = sibsp.view(num_train_examples, -1)
    features = torch.hstack((features, sibsp))

    parch = df['Parch'].values
    parch = torch.tensor(parch)
    parch = parch.view(num_train_examples, -1)
    features = torch.hstack((features, parch))

    fare = df['Fare'].values
    fare = torch.tensor(fare)
    fare = fare.view(num_train_examples, -1)
    features = torch.hstack((features, fare))

    df['Embarked'].fillna('S', inplace=True)
    pembarked = df['Embarked'].values
    embarked = []

    for embark in pembarked:
        if embark == 'S':
            embarked.append(0)
        elif embark == 'C':
            embarked.append(1)
        elif embark == 'Q':
            embarked.append(2)

    embarked = torch.tensor(embarked)
    embarked = embarked.view(num_train_examples, -1)
    features = torch.hstack((features, embarked))

    print(ages.shape, genders.shape, fare.shape)
    important_features = torch.hstack((ages, genders, fare))
    
    return important_features

    return features

features = get_features(train_df)



# create a train and validation set
train_features = features[:700]
train_target = target[:700]

val_features = features[700:]
val_target = target[700:]


torch.Size([891, 1]) torch.Size([891, 1]) torch.Size([891, 1])


In [102]:
# create a neural network with 1 input layer, 1 hidden layer, and 1 output layer
num_neurons = 32
num_features = features.shape[1]
output_size = 1

model = torch.nn.Sequential(
    torch.nn.Linear(num_features, num_neurons),
    torch.nn.BatchNorm1d(num_neurons),
    torch.nn.Sigmoid(),
    torch.nn.Linear(num_neurons, output_size),
    torch.nn.BatchNorm1d(output_size),
    torch.nn.Sigmoid(),
)

# model = torch.nn.Sequential(
#     torch.nn.Linear(num_features, output_size),
#     # torch.nn.BatchNorm1d(output_size),
#     torch.nn.Sigmoid(),
# )

# train the model on the entire dataset for production
x = torch.tensor(features, dtype=torch.float32)
x = x.view(-1, num_features)
y = torch.tensor(target, dtype=torch.float32)
y = y.view(-1, 1)

# training set
# x = torch.tensor(train_features, dtype=torch.float32)
# x = x.view(-1, num_features)
# y = torch.tensor(train_target, dtype=torch.float32)
# y = y.view(-1, 1)


  x = torch.tensor(features, dtype=torch.float32)
  y = torch.tensor(target, dtype=torch.float32)


In [103]:
# create a loss function
loss_fn = F.binary_cross_entropy

# create an optimizer
learning_rate = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

num_epochs = 10000

# train the model
for t in range(num_epochs):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if int(t == num_epochs * .75):
        learning_rate = 1e-4
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        print(f'Adjusted Learning rate: {learning_rate}')
    if t % 1000 == 0:
        print(f'Iter: {t}, Loss: {loss.item()}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Iter: {t}, Loss: {loss.item()}')


Iter: 0, Loss: 0.6529161334037781
Iter: 1000, Loss: 0.46219661831855774
Iter: 2000, Loss: 0.4548664689064026
Iter: 3000, Loss: 0.4528019428253174
Iter: 4000, Loss: 0.44922128319740295
Iter: 5000, Loss: 0.4462409019470215
Iter: 6000, Loss: 0.4439260959625244
Iter: 7000, Loss: 0.44369086623191833
Adjusted Learning rate: 0.0001
Iter: 8000, Loss: 0.4408860206604004
Iter: 9000, Loss: 0.4402076005935669
Iter: 9999, Loss: 0.4395637512207031


In [104]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model:
  layer.training = False

In [105]:
# Evaluate the model on the validation set
x = torch.tensor(val_features, dtype=torch.float32)
x = x.view(-1, num_features)
y = torch.tensor(val_target, dtype=torch.float32)
y = y.view(-1, 1)

# calculate the accuracy of the model on the validation set
with torch.no_grad():
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    y_pred_class = y_pred.round()
    acc = y_pred_class.eq(y).sum() / float(y.shape[0])

print(f'Validation accuracy: {acc:.2f}')


Validation accuracy: 0.82


  x = torch.tensor(val_features, dtype=torch.float32)
  y = torch.tensor(val_target, dtype=torch.float32)


In [106]:
# Calculate the accuracy of the model
with torch.no_grad():
    y_pred = model(x)
    y_pred_class = y_pred.round()
    acc = y_pred_class.eq(y).sum() / float(y.shape[0])

print(f'Training Accuracy: {acc.item()}')


Training Accuracy: 0.8219895362854004


In [107]:
len(np.where(y_pred_class == 1)[0])

69

Baseline results:
epochs = 100 - a few thousand
features: {pclass, sex}: 0.61 accuracy with no predicted survivors

features: {pclass, sex, age}: 0.63 accuracy with 40 predicted survivors (some ones found in y_pred_class)

features: {pclass, sex, age  ... fare}: 0.69 accuracy with 265 predicted survivors.  It seems that fare has a lot of influence on survival as it raised the accuracy by 6% and the number of predicted survivors by 225.

Increasing the batch size also helped.  I went from 32 to 64 and got a 1% increase in accuracy.

In [108]:
# Evaluate model on test data

# Calculate the accuracy of the model

with torch.no_grad():
    test_df_features = get_features(test_df)
    test_df_features = torch.tensor(test_df_features, dtype=torch.float32)
    test_df_features = test_df_features.view(-1, test_df_features.shape[1])

    y_pred_test = model(test_df_features)
    y_pred_test_class = y_pred_test.round()
    y_pred_test_class = y_pred_test_class.view(-1)

print("# of dead: ",len(np.where(y_pred_test_class == 0)[0]))
print("# of survivors: ",len(np.where(y_pred_test_class == 1)[0]))

torch.Size([418, 1]) torch.Size([418, 1]) torch.Size([418, 1])
# of dead:  262
# of survivors:  155


  test_df_features = torch.tensor(test_df_features, dtype=torch.float32)


In [109]:
test_predictions = torch.tensor(y_pred_test_class, dtype=torch.int32)
test_predictions.shape

# Write predictions to file
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': test_predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


  test_predictions = torch.tensor(y_pred_test_class, dtype=torch.int32)


Ideas for improving 
- add dropout
- add regularization
- add more layers
- add more neurons
- add more epochs
- add more features
- add more data
- use a transformer model

Done:
- created a validation set
