# Titanic - Machine Learning from Disaster


Kaggle link: https://www.kaggle.com/code/lorenzozanolin/logistic-regression

W&B link: https://wandb.ai/lorenzozanolin-52/logistic_regression/table?workspace=user-lorenzozanolin-52

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#!pip install wandb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):    # ''
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


Import all the needed library and init Weights and Biases

In [2]:
import numpy as np

import torch
import torch.nn as nn
torch.manual_seed(0)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
import pandas as pd

import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("a")
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

We first need to read the datasets

In [3]:
titanic_training_data = pd.read_csv('/kaggle/input/titanic/train.csv')    #/kaggle/input/titanic/train.csv './titanic/train.csv'
titanic_test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
titanic_training_data.shape
titanic_training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Dataframe needs to be cleaned, knowing if some informations are unknown can be very important to determine if someone survived

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor  
def remove_multicollinearity(df,features):   #we will use it to remove features whom VIF is higher than 20
    vif_data=pd.DataFrame()
    vif_data["Feature"]=features # names of the features
    vif_data["VIF"]=[variance_inflation_factor(df[features].values,i) for i in range(len(features))] # VIF score for each features, higher VIF means higher correlation
    return vif_data.sort_values(by=["VIF"]).reset_index(drop=True)
        

In [5]:
def clean_titanic(df, train=True):
    df["Cabin"] = df["Cabin"].apply(lambda x: pd.isna(x)).astype(bool)  # will set True for each missing Cabin value, False for each cabin whom value was known
    df["Embarked"] = df["Embarked"].apply(lambda x: pd.isna(x)).astype(bool) # same as before
    df["AgeNan"] = df["Age"].apply(lambda x: pd.isna(x)).astype(bool) # same as before
    df = pd.concat([df, pd.get_dummies(df['Sex'], dtype='bool', prefix='sex_'), pd.get_dummies(df['Pclass'], dtype='bool', prefix='pclass_')], axis=1) # adds new columns to the pre-existing dataframe. pd.get_dummies() encodes categorical variables into one-hot encoded dummy variables
    df = df.drop(['PassengerId', 'Name','Ticket','Sex','Pclass'], axis=1) # removes useless features
    if train:
        df = df.drop(['Survived'], axis=1) # removes last column since we are considering the training set
    numeric_features = df.dtypes[(df.dtypes != 'object') & (df.dtypes != 'bool')].index # This results in a list of column names corresponding to the numeric features
    df[numeric_features] = df[numeric_features].apply(lambda x: (x - x.mean()) / (x.std())) #mean normalization
    df["Age"] = df["Age"].fillna(df["Age"].mean()) # fills empty values with the mean
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean()) # same
    
    return df

y_data = torch.tensor(titanic_training_data["Survived"].values, dtype=torch.float32)
X_data = clean_titanic(titanic_training_data)
X_data.head()


Unnamed: 0,Age,SibSp,Parch,Fare,Cabin,Embarked,AgeNan,sex__female,sex__male,pclass__1,pclass__2,pclass__3
0,-0.530005,0.43255,-0.473408,-0.502163,True,False,False,False,True,False,False,True
1,0.57143,0.43255,-0.473408,0.786404,False,False,False,True,False,True,False,False
2,-0.254646,-0.474279,-0.473408,-0.48858,True,False,False,True,False,False,False,True
3,0.364911,0.43255,-0.473408,0.420494,False,False,False,True,False,True,False,False
4,0.364911,-0.474279,-0.473408,-0.486064,True,False,False,False,True,False,False,True


We then transform the data from numpy (pandas representation) into torch's `Tensor`

In [6]:
X_data = torch.tensor(X_data.astype('float').values, dtype=torch.float32)    # create a tensor where each value is a FLOAT
X_data


tensor([[-5.3001e-01,  4.3255e-01, -4.7341e-01,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 5.7143e-01,  4.3255e-01, -4.7341e-01,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.5465e-01, -4.7428e-01, -4.7341e-01,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00],
        ...,
        [ 2.3386e-16,  4.3255e-01,  2.0078e+00,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00],
        [-2.5465e-01, -4.7428e-01, -4.7341e-01,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.5839e-01, -4.7428e-01, -4.7341e-01,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00]])

Create a `TensorDataset` to get tuple of data and label

In [7]:
dataset = torch.utils.data.TensorDataset(X_data, y_data)

We then split between the training and validation set

In [8]:
training_size = int(0.7 * len(dataset))
validation_size = len(dataset) - training_size
train, val = torch.utils.data.random_split(dataset, [training_size, validation_size], generator=torch.Generator().manual_seed(0))
data_loader_train = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True) #prima era a 32
data_loader_val = torch.utils.data.DataLoader(val, batch_size=32, shuffle=True) #prima era a 10

Layer initialization using Xavier Uniform on the weight and a constant 0 value on the bias

In [9]:
import torch.nn.functional as F    

def init_my_layer(m, gain=1):
    torch.nn.init.xavier_normal_(m.weight, gain)
    torch.nn.init.constant_(m.bias, 0)
    return m

class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__() 
        #self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.ln = init_my_layer(nn.Linear(12, 1), nn.init.calculate_gain('sigmoid'))
        #self.ln1 = init_my_layer(nn.Linear(12, 5), nn.init.calculate_gain('tanh'))
        #self.ln2 = init_my_layer(nn.Linear(5, 1), nn.init.calculate_gain('sigmoid'))
        
    def forward(self, x):
        #x = self.tanh(self.ln1(x))
        #x = self.sigmoid(self.ln2(x))
        x = self.sigmoid(self.ln(x))
        #return F.sigmoid(x) 
        return x

Create the LinearModel with one Linear layer and Sigmoid applied to the output

In [10]:
net = MyNetwork() 
print(list(net.parameters()))

[Parameter containing:
tensor([[-0.1316,  0.1790, -0.2453,  0.1780, -0.5389,  1.1169, -0.2872,  0.1503,
          0.4314, -0.0344,  0.1629,  0.2520]], requires_grad=True), Parameter containing:
tensor([0.], requires_grad=True)]


Initialize the network (call it `net`, it would makes things easier later), the loss, the optimizer and write the training loop

Don't forget to check the validation loss and save your model at the end of each epoch!

In [11]:
from torch.autograd import Variable
num_epochs = 400 
lr = 3e-3 
wandb.init(project="logistic_regression",config={"lr": lr, "epochs": num_epochs}) 
criterion = nn.BCELoss()    #binary cross entropy loss

o = 's'

if o == 's':
    optimizer = torch.optim.SGD(net.parameters(), lr)
    #optimizer = torch.optim.SGD(net.parameters(), lr, weight_decay=1e-4)
    wandb.log({'optimizer':'SGD'})
elif o == 'sg':
    #optimizer = torch.optim.SGD(net.parameters(), lr, momentum=0.9)
    optimizer = torch.optim.SGD(net.parameters(), lr, momentum=0.9, weight_decay=1e-4)
    wandb.log({'optimizer':'SGD-M'})
elif o == 'r':
    optimizer = torch.optim.RMSprop(net.parameters(), lr)
    wandb.log({'optimizer':'RMS'})
elif o == 'a':
    optimizer = torch.optim.Adam(net.parameters(), lr)
    wandb.log({'optimizer':'Adam'})
    
for epoch in range(num_epochs):
    training_loss = 0
    #TRAINING LOOP
    for X,y in data_loader_train:
        optimizer.zero_grad()
        y_pred=net(X)
        loss=criterion(y_pred,y.reshape(-1, 1))
        training_loss += loss
        loss.sum().backward()
        optimizer.step()
    validation_loss = 0
    with torch.no_grad():
        #VALIDATION LOOP
        for X,y in data_loader_val:
            y_pred=net(X)
            loss=criterion(y_pred,y.reshape(-1, 1))
            validation_loss+=loss

    print({'epoch':(epoch), 'training_loss': (training_loss/32).item(), 'validation_loss': (validation_loss/32).item()})
    wandb.log({'training loss': (training_loss/32).item()}, step=epoch)
    wandb.log({'validation loss': (validation_loss/32).item()}, step=epoch)

[34m[1mwandb[0m: Currently logged in as: [33mlorenzozanolin-52[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231114_150155-3gwpzlny[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33musual-capybara-68[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/lorenzozanolin-52/logistic_regression[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/lorenzozanolin-52/logistic_regression/runs/3gwpzlny[0m


{'epoch': 0, 'training_loss': 0.44058147072792053, 'validation_loss': 0.201175719499588}
{'epoch': 1, 'training_loss': 0.43604183197021484, 'validation_loss': 0.20046254992485046}
{'epoch': 2, 'training_loss': 0.43021202087402344, 'validation_loss': 0.19862404465675354}
{'epoch': 3, 'training_loss': 0.42582178115844727, 'validation_loss': 0.19708026945590973}
{'epoch': 4, 'training_loss': 0.42320096492767334, 'validation_loss': 0.19482684135437012}
{'epoch': 5, 'training_loss': 0.4176844358444214, 'validation_loss': 0.19338904321193695}
{'epoch': 6, 'training_loss': 0.41406551003456116, 'validation_loss': 0.19135496020317078}
{'epoch': 7, 'training_loss': 0.41195860505104065, 'validation_loss': 0.1904674470424652}
{'epoch': 8, 'training_loss': 0.4071662127971649, 'validation_loss': 0.19002071022987366}
{'epoch': 9, 'training_loss': 0.4059814214706421, 'validation_loss': 0.1888469159603119}
{'epoch': 10, 'training_loss': 0.4034539461135864, 'validation_loss': 0.1875779628753662}
{'epoch

Now let's see the accuracy on the predictions, then we will create the submission file.

This loop computes the prediction on the test dataset and create a submission file


In [12]:
titanic_test_data_cleaned = clean_titanic(titanic_test_data, train=False)
titanic_data_tensor = torch.tensor(titanic_test_data_cleaned.astype('float').values, dtype=torch.float32)

test = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

with torch.no_grad():
    net.eval()
    test_pred = torch.LongTensor()
    for i, data in enumerate(titanic_data_tensor):
        output = net(data)
        predicted = torch.ge(output, 0.5)
        test_pred = torch.cat((test_pred, predicted), dim=0)
    out_df = pd.DataFrame(np.c_[titanic_test_data['PassengerId'].values, test_pred.numpy()], columns=['PassengerId', 'Survived'])
    out_df.to_csv('submission.csv', index=False)