In [1]:
import pandas as pd

In [2]:
# Read the Titanic training CSV into a DataFrame and preview its first rows.
train_csv_path = 'datasets/titanic/train.csv'
titanic_data = pd.read_csv(train_csv_path)
titanic_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Discard the columns that are not used as model inputs.
# NOTE: this cell reassigns `titanic_data`, so re-running it on its own raises a
# KeyError because the listed columns are already gone.
unwanted_features = [
    'PassengerId', 'Name', 'Ticket', 'Cabin',
    'SibSp', 'Parch', 'Embarked',
]

titanic_data = titanic_data.drop(columns=unwanted_features)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [4]:
# Get rid of missing data: drop every row that has a missing value in any remaining column
titanic_data = titanic_data.dropna(axis=0, how='any')

In [5]:
from sklearn import preprocessing

In [6]:
# Use a LabelEncoder (instead of one-hot encoding) to map categorical values to
# integer labels; it is fitted on the 'Sex' column in the following cell.
le = preprocessing.LabelEncoder()

In [7]:
# Encode gender as an integer label: 0 = female, 1 = male
# (LabelEncoder numbers the classes in sorted order).
sex_codes = le.fit_transform(titanic_data['Sex'])
titanic_data['Sex'] = sex_codes

titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [8]:
# Columns of titanic_data used as model inputs ('Survived' is handled separately as the target).
features = ['Pclass', 'Sex', 'Age', 'Fare']

In [9]:
# Keep only the model-input columns, in the order given by `features`.
titanic_features = titanic_data.loc[:, features]

titanic_features.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,22.0,7.25
1,1,0,38.0,71.2833
2,3,0,26.0,7.925
3,1,0,35.0,53.1
4,3,1,35.0,8.05


In [10]:
# We typically use one-hot encoding for categorical values with more than 2 values.
# The dtype is pinned so the indicator columns stay numeric 0/1: pandas 2.0 changed
# the get_dummies default to bool, and a frame mixing bool with int/float columns
# exposes an object-dtype `.values` array that torch.from_numpy cannot convert.
titanic_features = pd.get_dummies(titanic_features, columns=['Pclass'], dtype='uint8')
titanic_features.head()

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,7.25,0,0,1
1,0,38.0,71.2833,1,0,0
2,0,26.0,7.925,0,0,1
3,0,35.0,53.1,1,0,0
4,1,35.0,8.05,0,0,1


In [11]:
# Target kept as a single-column DataFrame: 0 = didn't survive, 1 = survived
titanic_target = titanic_data.loc[:, ['Survived']]
titanic_target.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, x_test, Y_train, y_test = train_test_split(
    titanic_features,
    titanic_target,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Sanity check: (rows, columns) of the training features and of the training labels
(X_train.shape, Y_train.shape)

((571, 6), (571, 1))

In [16]:
import torch
import numpy as np

In [17]:
# Convert the feature frames to float32 tensors. The cast happens in NumPy so the
# conversion also works when `.values` is an object array (e.g. bool indicator
# columns mixed with numeric ones), which torch.from_numpy would otherwise reject.
Xtrain_ = torch.from_numpy(X_train.values.astype(np.float32))
Xtest_ = torch.from_numpy(x_test.values.astype(np.float32))

In [18]:
# Sanity check: the training tensor should be (number of samples, number of features)
Xtrain_.size()

torch.Size([571, 6])

In [19]:
# The NLL loss used below takes the labels as a 1-D tensor of class indices,
# so flatten the single-column label frames into one row containing all labels.
Ytrain_ = torch.from_numpy(Y_train.values).view(-1)
Ytest_ = torch.from_numpy(y_test.values).view(-1)

In [20]:
# Sanity check: both label tensors should now be one-dimensional
(Ytrain_.shape, Ytest_.shape)

(torch.Size([571]), torch.Size([143]))

In [23]:
# For log Softmax function
import torch.nn as nn
import torch.nn.functional as F

In [24]:
# Network dimensions: 6 input features, a width of 10 used by both hidden
# layers, and 2 output classes (survived or not).
input_size, hidden_size, output_size = 6, 10, 2

In [25]:
# Build our own NN modules by subclassing the nn.Module class
class Net(nn.Module):
    """Feed-forward classifier: two sigmoid hidden layers and a log-softmax output.

    Args:
        in_features: Number of input features. When omitted, the notebook-level
            ``input_size`` is used.
        hidden_features: Width of both hidden layers. When omitted, the
            notebook-level ``hidden_size`` is used.
        out_features: Number of output classes. When omitted, the notebook-level
            ``output_size`` is used.
    """

    def __init__(self, in_features=None, hidden_features=None, out_features=None):
        super(Net, self).__init__()

        # Fall back to the notebook globals only for sizes that were not passed,
        # so a plain Net() builds exactly the same layers as before.
        if in_features is None:
            in_features = input_size
        if hidden_features is None:
            hidden_features = hidden_size
        if out_features is None:
            out_features = output_size

        # Fully connected (linear) layers
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of feature rows."""
        # torch.sigmoid is used instead of F.sigmoid, which emitted a deprecation
        # warning in older PyTorch releases; the computed values are the same.
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        # Last layer is a linear layer with no activation
        x = self.fc3(x)

        # dim=-1 normalises over the last dimension, i.e. across the classes
        return F.log_softmax(x, dim=-1)

In [26]:
# Instantiate the network defined above.
# NOTE(review): no random seed is set in the visible cells, so the initial weights
# (and therefore the accuracies printed during training) presumably vary between runs.
model = Net()

In [28]:
import torch.optim as optim

# Adam optimizer over all model parameters; no hyperparameters are passed, so the library defaults apply.
optimizer = optim.Adam(model.parameters())

# Negative log-likelihood loss, paired with the log_softmax output returned by Net.forward.
loss_fn = nn.NLLLoss()

In [30]:
# Training history: the loop below appends one
# [epoch, train loss, test loss, test accuracy in %] row per epoch.
epoch_data = []

In [34]:
# Full-batch training for 1000 epochs; the held-out set is evaluated after every update.
for epoch in range(1, 1001):
    optimizer.zero_grad()
    Ypred = model(Xtrain_)

    # Calculate the loss on the predictions and backpropagate to compute gradients
    loss = loss_fn(Ypred, Ytrain_)
    loss.backward()

    # Update parameters by applying gradients
    optimizer.step()

    # Evaluation only: weights are never updated from the test data, so run the
    # forward pass under no_grad() instead of building an autograd graph that is
    # never used.
    with torch.no_grad():
        Ypred_test = model(Xtest_)
        loss_test = loss_fn(Ypred_test, Ytest_)

        # Predicted class = index of the highest log-probability in each row
        _, pred = Ypred_test.max(1)

    # Percentage of test samples whose predicted class matches the label
    accuracy = (pred.eq(Ytest_).sum().item() / Ytest_.numel()) * 100
    epoch_data.append([epoch, loss.item(), loss_test.item(), accuracy])

    # Report progress every 100 epochs
    if epoch % 100 == 0:
        print(accuracy)



55.24475524475524
69.23076923076923
67.83216783216784
69.23076923076923
72.02797202797203
78.32167832167832
83.21678321678321
84.61538461538461
82.51748251748252
83.21678321678321
