In [28]:
import os

# Kaggle dataset: https://www.kaggle.com/c/titanic
# Creds to pull kaggle dataset
os.environ['KAGGLE_USERNAME'] = 'xxxxxxx'
os.environ['KAGGLE_KEY'] = 'xxxxxxxx'

In [29]:
!kaggle competitions download -c titanic
!unzip titanic.zip -d titanic_dataset

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  titanic.zip
replace titanic_dataset/gender_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train_data = pd.read_csv('titanic_dataset/train.csv')
test_data = pd.read_csv('titanic_dataset/test.csv')

# One-hot encoding for the categorical data
def one_hot(data_set, columns):
    return pd.get_dummies(data_set, columns=columns, dtype=np.float32)

# Generate data about Mr, Mrs, Miss and no title
def extract_title(name):
    if 'Mr.' in name:
        return 'Mr'
    elif 'Mrs.' in name:
        return 'Mrs'
    elif 'Miss.' in name:
        return 'Miss'
    else:
        return 'Other'

train_data['Fare*Class'] = train_data['Fare'] * train_data['Pclass']
test_data['Fare*Class'] = test_data['Fare'] * test_data['Pclass']

train_data = one_hot(train_data, ['Sex', 'Pclass'])
test_data = one_hot(test_data, ['Sex', 'Pclass'])

train_data['Title'] = train_data['Name'].apply(extract_title)
test_data['Title'] = test_data['Name'].apply(extract_title)

# Generate family size by adding siblings, spouces and parent children
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Cabin letter
train_data['CabinLetter'] = train_data['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')
test_data['CabinLetter'] = test_data['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')

# Per person fare
train_data['FarePerPerson'] = train_data['Fare'] / train_data['FamilySize']
test_data['FarePerPerson'] = test_data['Fare'] / test_data['FamilySize']

# Age bins to turn age number to buckets
train_data['AgeBin'] = pd.cut(train_data['Age'], bins=[0, 10, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior'])
test_data['AgeBin'] = pd.cut(test_data['Age'], bins=[0, 10, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior'])

train_data = one_hot(train_data, ['Title', 'CabinLetter', 'AgeBin'])
test_data = one_hot(test_data, ['Title', 'CabinLetter', 'AgeBin'])

train_data['IsAlone'] = (train_data['FamilySize'] == 1).astype(int)
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)

y = train_data['Survived']
t_id = test_data['PassengerId']

train_data = train_data.drop(columns=['PassengerId', 'Age', 'Name', 'SibSp', 'Parch', 'Cabin', 'Ticket', 'Survived', 'Embarked'])
test_data = test_data.drop(columns=['PassengerId', 'Age', 'Name', 'SibSp', 'Parch', 'Cabin', 'Ticket', 'Embarked'])
# This column never shows up in the test data
test_data['CabinLetter_T'] = 0

train_data.columns


Index(['Fare', 'Fare*Class', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'FamilySize', 'FarePerPerson', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Other', 'CabinLetter_A', 'CabinLetter_B',
       'CabinLetter_C', 'CabinLetter_D', 'CabinLetter_E', 'CabinLetter_F',
       'CabinLetter_G', 'CabinLetter_T', 'CabinLetter_U', 'AgeBin_Child',
       'AgeBin_Teenager', 'AgeBin_Adult', 'AgeBin_Middle-aged',
       'AgeBin_Senior', 'IsAlone'],
      dtype='object')

In [29]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()

# Finally, we need to normalize the non-binary columns
columns_to_normalize = ['Fare', 'FamilySize', 'FarePerPerson', 'Fare*Class']
train_data[columns_to_normalize] = scaler.fit_transform(train_data[columns_to_normalize])
test_data[columns_to_normalize] = scaler.fit_transform(test_data[columns_to_normalize])

train_data.head()

Unnamed: 0,Fare,Fare*Class,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,FamilySize,FarePerPerson,Title_Miss,...,CabinLetter_F,CabinLetter_G,CabinLetter_T,CabinLetter_U,AgeBin_Child,AgeBin_Teenager,AgeBin_Adult,AgeBin_Middle-aged,AgeBin_Senior,IsAlone
0,0.014151,0.042453,0.0,1.0,0.0,0.0,1.0,0.1,0.007076,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,0.139136,0.139136,1.0,0.0,1.0,0.0,0.0,0.1,0.069568,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,0.015469,0.046406,1.0,0.0,0.0,0.0,1.0,0.0,0.015469,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
3,0.103644,0.103644,1.0,0.0,1.0,0.0,0.0,0.1,0.051822,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,0.015713,0.047138,0.0,1.0,0.0,0.0,1.0,0.0,0.015713,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1


In [30]:
import torch

train_size = 550

X_train, y_train = torch.tensor(train_data[:train_size].values, dtype=torch.float32), \
  torch.tensor(y[:train_size].values, dtype=torch.float32).unsqueeze(1)
X_val, y_val = torch.tensor(train_data[train_size:].values, dtype=torch.float32), \
  torch.tensor(y[train_size:].values, dtype=torch.float32).unsqueeze(1)

X_test = torch.tensor(test_data.values, dtype=torch.float32)

In [43]:
import torch
import torch.nn as nn

n = len(X_train[0])

model = nn.Linear(n, 1, bias=True)
lr = 0.0001

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for t in range(50000):
    y_pred = torch.sigmoid(model(X_train))

    loss = loss_fn(y_pred, y_train)
    if t % 1000 == 0:
        with torch.no_grad():
            y_pred_test = torch.sigmoid(model(X_val))
            test_loss = loss_fn(y_pred_test, y_val)
            print(f"Epoch {t}, Training Loss: {loss.item()}, Test Loss: {test_loss}")

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 0, Training Loss: 0.7992701530456543, Test Loss: 0.8099290728569031
Epoch 1000, Training Loss: 0.7577621340751648, Test Loss: 0.7661786079406738
Epoch 2000, Training Loss: 0.7267162799835205, Test Loss: 0.7328916192054749
Epoch 3000, Training Loss: 0.7055034041404724, Test Loss: 0.7099096775054932
Epoch 4000, Training Loss: 0.6904851198196411, Test Loss: 0.6940154433250427
Epoch 5000, Training Loss: 0.6791715025901794, Test Loss: 0.6823642253875732
Epoch 6000, Training Loss: 0.67035973072052, Test Loss: 0.6734592318534851
Epoch 7000, Training Loss: 0.6632843613624573, Test Loss: 0.666422426700592
Epoch 8000, Training Loss: 0.6574632525444031, Test Loss: 0.6607682704925537
Epoch 9000, Training Loss: 0.6526130437850952, Test Loss: 0.6561776399612427
Epoch 10000, Training Loss: 0.6485416293144226, Test Loss: 0.6523760557174683
Epoch 11000, Training Loss: 0.6450958251953125, Test Loss: 0.6491701006889343
Epoch 12000, Training Loss: 0.6421522498130798, Test Loss: 0.646439790725708
Epo

In [5]:
# Let's try this on the test set and try submitting it

predictions = torch.sigmoid(model(X_test))  # Assuming 'model' is your trained model and 'X_test' is your test data
predictions = (predictions.squeeze() > 0.5).int()  # Convert to binary output

# Create a DataFrame with the required structure
submission = pd.DataFrame({
    'PassengerId': t_id,  # Replace with actual Passenger IDs from the test dataset
    'Survived': predictions.numpy()
})

# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)