In [150]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn   # neural network modules
import torch.optim as optim   # optimization algorithms
import torch.nn.functional as F   # functions without parameters like activation functions
from torch.utils.data import TensorDataset, DataLoader   # dataset management, create batches
import torchvision.datasets as datasets   # standard datasets on pytorch
import torchvision.transforms as transforms   #transform datasets

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [151]:
train_dataset = pd.read_csv("train.csv")

print("Full train dataset shape is {}".format(train_dataset.shape))

Full train dataset shape is (8693, 14)


In [152]:
#examine dataset
train_dataset.head()
train_dataset.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [153]:
#drop id and name columns as they are not important
train_dataset = train_dataset.drop(['PassengerId', 'Name'], axis=1)

In [154]:
#replace nan values with 0
train_dataset.isnull().sum().sort_values(ascending=False)
train_dataset[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','HomePlanet','Destination']] = train_dataset[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','HomePlanet','Destination']].fillna(value=0)


In [155]:
#replace strings with numbers
train_dataset['HomePlanet'].unique()
train_dataset['HomePlanet'] = train_dataset['HomePlanet'].replace({'Earth':1, 'Europa':2, 'Mars':3})

train_dataset['Destination'].unique()
train_dataset['Destination'] = train_dataset['Destination'].replace({'TRAPPIST-1e':1, '55 Cancri e':2, 'PSO J318.5-22':3})

In [156]:
#replace booleans with int
target = 'Transported'
train_dataset[target] = train_dataset[target].astype(int)
train_dataset['VIP'] = train_dataset['VIP'].astype(int)
train_dataset['CryoSleep'] = train_dataset['CryoSleep'].astype(int)

In [157]:
#replace cabin number with 3 different columns
train_dataset[['Deck', 'Cabin_num', 'Side']] = train_dataset['Cabin'].str.split("/", expand=True)
train_dataset['Deck'].unique()
train_dataset['Cabin_num'].unique()
train_dataset['Cabin_num'].isnull().values.any()
train_dataset['Side'].unique()

train_dataset[['Deck', 'Cabin_num', 'Side']] = train_dataset[['Deck', 'Cabin_num', 'Side']].fillna(value=0)
train_dataset['Deck'] = train_dataset['Deck'].replace({'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8})
train_dataset['Side'] = train_dataset['Side'].replace({'P':1, 'S':2})
train_dataset['Cabin_num'] = train_dataset['Cabin_num'].astype(int)
train_dataset = train_dataset.drop('Cabin', axis=1)

In [158]:
train_dataset.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,2,0,1,39.0,0,0.0,0.0,0.0,0.0,0.0,0,2,0,1
1,1,0,1,24.0,0,109.0,9.0,25.0,549.0,44.0,1,6,0,2
2,2,0,1,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,0,2
3,2,0,1,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,2
4,1,0,1,16.0,0,303.0,70.0,151.0,565.0,2.0,1,6,1,2


In [159]:
#train and val split
train_df = train_dataset.sample(frac=0.8, random_state=123)
val_df = train_dataset.drop(train_df.index)


In [160]:
#convert to tensors
train_x = torch.tensor(train_df.drop("Transported", axis=1).values, dtype=torch.float32)
train_y = torch.tensor(train_df["Transported"].values, dtype=torch.float32)
val_x = torch.tensor(val_df.drop("Transported", axis=1).values, dtype=torch.float32)
val_y = torch.tensor(val_df["Transported"].values, dtype=torch.float32)

train_dataset = TensorDataset(train_x, train_y)
val_dataset = TensorDataset(val_x, val_y)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [161]:
# hyperparameters
input_size = train_x.shape[1]
num_classes = 1 #if probability greater than 0.5, then True, if less then False
learning_rate = 0.001  
num_epochs = 300

In [162]:
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NN,self).__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.fc2 = nn.Linear(100,50)
        self.fc3 = nn.Linear(50, num_classes)
        
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [163]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NN(input_size=input_size, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)  

In [164]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of iter(training_loader) so that we can track the batch index and do some intra-epoch reporting
    for index, data in enumerate(train_loader):

        # Every data instance is an input + label pair
        batch_x, batch_y = data
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(batch_x)

        # Compute the loss and its gradients
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item() #retrieves the scalar value of the loss function for the current batch
        if index % 10 == 9: #reports on loss every 10 batches
            last_loss = running_loss / 10 # loss per batch
            print('  batch {} loss: {}'.format(index + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + index + 1  #variable is used to set the x-axis value for the scalar summary in TensorBoard, based on the current epoch and batch index
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0

    return last_loss #last calculated batch loss

In [165]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/spaceship_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for index, vdata in enumerate(val_loader):
        vbatch_x, vbatch_y = vdata
        voutputs = model(vbatch_x)
        vloss = criterion(voutputs.squeeze(), vbatch_y)
        running_vloss += vloss

    avg_vloss = running_vloss / (index + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 10 loss: nan
  batch 20 loss: nan
  batch 30 loss: nan
  batch 40 loss: nan
  batch 50 loss: nan
  batch 60 loss: nan
  batch 70 loss: nan
  batch 80 loss: nan
  batch 90 loss: nan
  batch 100 loss: nan
LOSS train nan valid nan
EPOCH 2:
  batch 10 loss: nan
  batch 20 loss: nan
  batch 30 loss: nan
  batch 40 loss: nan
  batch 50 loss: nan
  batch 60 loss: nan
  batch 70 loss: nan
  batch 80 loss: nan
  batch 90 loss: nan
  batch 100 loss: nan
LOSS train nan valid nan
EPOCH 3:
  batch 10 loss: nan
  batch 20 loss: nan
  batch 30 loss: nan
  batch 40 loss: nan
  batch 50 loss: nan
  batch 60 loss: nan
  batch 70 loss: nan
  batch 80 loss: nan
  batch 90 loss: nan
  batch 100 loss: nan
LOSS train nan valid nan
EPOCH 4:
  batch 10 loss: nan
  batch 20 loss: nan
  batch 30 loss: nan
  batch 40 loss: nan
  batch 50 loss: nan
  batch 60 loss: nan
  batch 70 loss: nan
  batch 80 loss: nan
  batch 90 loss: nan
  batch 100 loss: nan
LOSS train nan valid nan
EPOCH 5:
  batch 10 

In [166]:
def check_accuracy(loader,model): 
    if loader == train_loader: 
        print("checking accuracy on training data")
    else:
        print("checking accuracy on test data")
    num_correct = 0
    num_samples = 0
    model.eval() 
    
    with torch.no_grad(): 
        for x,y in loader:
            x= x.to(device=device)
            y= y.to(device=device)
            
            outputs = model(x)
            probs = torch.sigmoid(outputs).squeeze() #maps any input value to a probability value between 0 and 1
            preds = (probs > 0.5).to(torch.float32) #applies a threshold of 0.5 to the predicted probabilities, then converts boolean to float
            
            num_correct += (preds == y).sum()
            num_samples += x.shape[0]
            
        print(f'{num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')   #2dp
        
    model.train()

In [167]:
check_accuracy(train_loader,model)
check_accuracy(val_loader,model)

checking accuracy on training data
3447 / 6954 with accuracy 49.57
checking accuracy on test data
868 / 1739 with accuracy 49.91
