In [69]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn   # neural network modules
import torch.optim as optim   # optimization algorithms
import torch.nn.functional as F   # functions without parameters like activation functions
from torch.utils.data import TensorDataset, DataLoader   # dataset management, create batches
import torchvision.datasets as datasets   # standard datasets on pytorch
import torchvision.transforms as transforms   #transform datasets

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [70]:
train_dataset = pd.read_csv("train.csv")

print("Full train dataset shape is {}".format(train_dataset.shape))

Full train dataset shape is (8693, 14)


In [71]:
#examine dataset
train_dataset.head()
train_dataset.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [72]:
#drop id and name columns as they are not important
train_dataset = train_dataset.drop(['PassengerId', 'Name'], axis=1)

In [73]:
#replace nan values with 0
train_dataset.isnull().sum().sort_values(ascending=False)
train_dataset[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','HomePlanet','Destination']] = train_dataset[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','HomePlanet','Destination']].fillna(value=0)


In [74]:
#replace strings with numbers
train_dataset['HomePlanet'].unique()
train_dataset['HomePlanet'] = train_dataset['HomePlanet'].replace({'Earth':1, 'Europa':2, 'Mars':3})

train_dataset['Destination'].unique()
train_dataset['Destination'] = train_dataset['Destination'].replace({'TRAPPIST-1e':1, '55 Cancri e':2, 'PSO J318.5-22':3})

In [75]:
#replace booleans with int
target = 'Transported'
train_dataset[target] = train_dataset[target].astype(int)
train_dataset['VIP'] = train_dataset['VIP'].astype(int)
train_dataset['CryoSleep'] = train_dataset['CryoSleep'].astype(int)

In [76]:
#replace cabin number with 3 different columns
train_dataset[['Deck', 'Cabin_num', 'Side']] = train_dataset['Cabin'].str.split("/", expand=True)
train_dataset['Deck'].unique()
train_dataset['Cabin_num'].unique()
train_dataset['Cabin_num'].isnull().values.any()
train_dataset['Side'].unique()

train_dataset[['Deck', 'Cabin_num', 'Side']] = train_dataset[['Deck', 'Cabin_num', 'Side']].fillna(value=0)
train_dataset['Deck'] = train_dataset['Deck'].replace({'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8})
train_dataset['Side'] = train_dataset['Side'].replace({'P':1, 'S':2})
train_dataset['Cabin_num'] = train_dataset['Cabin_num'].astype(int)
train_dataset = train_dataset.drop('Cabin', axis=1)

In [77]:
train_dataset.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,2,0,1,39.0,0,0.0,0.0,0.0,0.0,0.0,0,2,0,1
1,1,0,1,24.0,0,109.0,9.0,25.0,549.0,44.0,1,6,0,2
2,2,0,1,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,0,2
3,2,0,1,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,2
4,1,0,1,16.0,0,303.0,70.0,151.0,565.0,2.0,1,6,1,2


In [78]:
#train and val split
train_df = train_dataset.sample(frac=0.8, random_state=123)
val_df = train_dataset.drop(train_df.index)


In [79]:
#convert to tensors
train_x = torch.tensor(train_df.drop("Transported", axis=1).values, dtype=torch.float32)
train_y = torch.tensor(train_df["Transported"].values, dtype=torch.float32)
val_x = torch.tensor(val_df.drop("Transported", axis=1).values, dtype=torch.float32)
val_y = torch.tensor(val_df["Transported"].values, dtype=torch.float32)

train_dataset = TensorDataset(train_x, train_y)
val_dataset = TensorDataset(val_x, val_y)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [80]:
# hyperparameters
input_size = train_x.shape[1]
num_classes = 1 #if probability greater than 0.5, then True, if less then False
learning_rate = 0.001  
num_epochs = 300

In [81]:
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NN,self).__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.fc2 = nn.Linear(100,50)
        self.fc3 = nn.Linear(50, num_classes)
        
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NN(input_size=input_size, num_classes=num_classes).to(device)

criterion = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)  

In [83]:
for epoch in range(num_epochs):   

    for batch_x, batch_y in train_loader:   
        batch_x = batch_x.to(device=device)
        batch_y = batch_y.to(device=device) 
        outputs = model(batch_x) 
        loss = criterion(outputs.squeeze(),batch_y) 
        optimizer.zero_grad()   
        loss.backward()
        optimizer.step()

Epoch 1/300, Loss: nan, Accuracy: 0.4961
Epoch 2/300, Loss: nan, Accuracy: 0.4957
Epoch 3/300, Loss: nan, Accuracy: 0.4957
Epoch 4/300, Loss: nan, Accuracy: 0.4957
Epoch 5/300, Loss: nan, Accuracy: 0.4957
Epoch 6/300, Loss: nan, Accuracy: 0.4957
Epoch 7/300, Loss: nan, Accuracy: 0.4957
Epoch 8/300, Loss: nan, Accuracy: 0.4957
Epoch 9/300, Loss: nan, Accuracy: 0.4957
Epoch 10/300, Loss: nan, Accuracy: 0.4957
Epoch 11/300, Loss: nan, Accuracy: 0.4957
Epoch 12/300, Loss: nan, Accuracy: 0.4957
Epoch 13/300, Loss: nan, Accuracy: 0.4957
Epoch 14/300, Loss: nan, Accuracy: 0.4957
Epoch 15/300, Loss: nan, Accuracy: 0.4957
Epoch 16/300, Loss: nan, Accuracy: 0.4957
Epoch 17/300, Loss: nan, Accuracy: 0.4957
Epoch 18/300, Loss: nan, Accuracy: 0.4957
Epoch 19/300, Loss: nan, Accuracy: 0.4957
Epoch 20/300, Loss: nan, Accuracy: 0.4957
Epoch 21/300, Loss: nan, Accuracy: 0.4957
Epoch 22/300, Loss: nan, Accuracy: 0.4957
Epoch 23/300, Loss: nan, Accuracy: 0.4957
Epoch 24/300, Loss: nan, Accuracy: 0.4957
E

In [84]:
def check_accuracy(loader,model): 
    if loader == train_loader: 
        print("checking accuracy on training data")
    else:
        print("checking accuracy on test data")
    num_correct = 0
    num_samples = 0
    model.eval() 
    
    with torch.no_grad(): 
        for x,y in loader:
            x= x.to(device=device)
            y= y.to(device=device)
            
            outputs = model(x)
            probs = torch.sigmoid(outputs).squeeze() #maps any input value to a probability value between 0 and 1
            preds = (probs > 0.5).to(torch.float32) #applies a threshold of 0.5 to the predicted probabilities, then converts boolean to float
            
            num_correct += (preds == y).sum()
            num_samples += x.shape[0]
            
        print(f'{num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')   #2dp
        
    model.train()

In [85]:
check_accuracy(train_loader,model)
check_accuracy(val_loader,model)

checking accuracy on training data
3447 / 6954 with accuracy 49.57
checking accuracy on test data
868 / 1739 with accuracy 49.91
