# Logistic Regression with the Titanic Survival Dataset
**Date:** January 25, 2025  
**Author:** Dario Piga  

In this notebook, we will implement a logistic regression model using PyTorch to predict survival on the Titanic. Our goal is to understand how various factors such as passenger class, gender, age, and fare contribute to the likelihood of survival. By applying logistic regression, we will model these relationships to predict a binary outcome: whether a passenger survived or not.

## The Titanic Survival Dataset

The Titanic dataset is a historical dataset that contains data on the passengers aboard the RMS Titanic, which famously sank on its maiden voyage in 1912. This dataset includes the following features:

- `survived`: Survival (0 = No, 1 = Yes)
- `pclass`: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- `sex`: Gender (male or female)
- `age`: Age in years
- `fare`: Passenger fare
- and others not used in this exercise
  
We will explore the data, perform necessary preprocessing steps, and build a logistic regression model to predict whether a passenger survived based on their features. We will also evaluate our model's performance using various metrics to understand its effectiveness.


In [None]:
import seaborn as sns
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch.nn as nn

In [None]:
# Model inputs used throughout the notebook
features = ['pclass', 'sex', 'age', 'fare']

# Load the Titanic data, keep only the inputs plus the target column,
# and discard every passenger with a missing value in any of them
df = sns.load_dataset('titanic')[features + ['survived']].dropna()

# Show the cleaned DataFrame
print(df)

In [None]:
# Encode the categorical `sex` column numerically: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes)

# Separate the input columns (X) from the target column (y)
X, y = df[features], df['survived']


In [None]:
# Plot your data (TBD)
# Exercise: draw a histogram of each column listed in `features` and of the target `y`.
...

In [None]:
# Plot your data (solution)

# One small histogram per input feature
for f in features:
    plt.figure(figsize=(3, 3))
    plt.hist(df[f], bins=20)
    plt.title(f'Histogram of feature {f}')
    plt.xlabel(f)
    plt.ylabel('Frequency')

# Histogram of the binary target; the two bars show the class balance
# (the title previously named an unrelated variable, "MEDV")
plt.figure()
plt.hist(y, bins=20)
plt.title("Histogram of target variable: survived")
plt.xlabel('Survived')
plt.ylabel('Frequency')

In [None]:
# Splitting the dataset into training and testing sets and normalize (TBD)
# Exercise: split X and y into train/test parts, then standardize the features
# using the mean and std computed on the training part only.
...

In [None]:
# Splitting the dataset into training and testing sets and normalize (solution)
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features to zero mean and unit std. The statistics are
# estimated on the training split only and reused unchanged for the test
# split. The binary target is left as-is.
X_mean, X_std = X_train.mean(axis=0), X_train.std(axis=0)
X_train, X_test = ((split - X_mean) / X_std for split in (X_train, X_test))

# sanity check: training features should now show mean ~0 and std ~1
print(f"Training features: \n Mean:\n {X_train.mean(axis = 0)} \n Std:\n {X_train.std(axis = 0)}\n")
print(f"Training target: \n Mean:\n {y_train.mean():.2f} \n Std:\n {y_train.std():.2f}")


In [None]:
# Convert the pandas train/test splits to float32 torch tensors
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(part.values, dtype=torch.float32)
    for part in (X_train, y_train, X_test, y_test)
)

In [None]:
# Create logistic regression model in Pytorch (TBD)
# Exercise: define an nn.Module that applies a linear layer followed by a sigmoid.
...


In [None]:
# Define model
class LogisticRegressionModel(nn.Module):
    """Logistic regression: a single linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Maps input_dim features to one score per sample
        self.linear = nn.Linear(input_dim, 1)
        # Squashes the score into the (0, 1) interval
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(linear(x)) for the batch ``x``."""
        scores = self.linear(x)
        return self.sigmoid(scores)

# Build the model with one input per feature column of X
model = LogisticRegressionModel(X.shape[1])
print(f"Model structure: {model}")

# Show the freshly initialised parameters
for param_name, param in model.named_parameters():
    print(f"parameter name: {param_name}. Value {param.data}")

# check what model provides: forward pass on the training inputs
y_hat = model(X_train_tensor)
y_hat

In [None]:
# Loss: binary cross-entropy between predicted probabilities and 0/1 labels
criterion = nn.BCELoss()

# Optimizer: SGD over all model parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)


In [None]:
# training loop (TBD)
# Exercise: repeatedly zero the gradients, run the forward pass, compute the loss,
# backpropagate, and take an optimizer step.
...

In [None]:
# training loop (solution)
max_epochs = 1000

# The loss is computed against a column-shaped target, matching the
# single-output linear layer; reshape once instead of on every iteration.
y_train_column = y_train_tensor.reshape(-1, 1)

for it in range(max_epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    p_hat = model(X_train_tensor)  # predictions for the whole training set
    loss = criterion(p_hat, y_train_column)
    loss.backward()
    optimizer.step()
    if it % 5 == 0:
        # Metrics are based on p_hat, i.e. predictions made before this step's update
        y_hat = p_hat > 0.5
        accuracy = (y_hat == y_train_column).float().mean()
        # `.3f` = three decimals (the previous `3f` only set a minimum width)
        print(f"Iteration: {it}. Loss: {loss.item():.3f}. Accuracy: {accuracy.item():.3f}")
    

In [None]:
# assess performance (TBD)

def assess_results(model, X, y, data_type):
    """Exercise skeleton: print accuracy and confusion-matrix counts.

    The `...` placeholders are meant to be replaced. Note that `accuracy`
    is not assigned anywhere in this skeleton, although the first print
    statement reads it.

    Args:
        model: Not used yet in this skeleton; presumably the classifier to evaluate.
        X: Not used yet in this skeleton; presumably the model inputs.
        y: Not used yet in this skeleton; presumably the true 0/1 labels.
        data_type: Label inserted into the "Confusion Matrix for ... Data:" header.
    """
    # Gradient tracking is disabled for everything computed in this block
    with torch.no_grad():


        # Compute accuracy
        ...

        # Compute confusion matrix

        TP = ...  # True Positive
        TN = ...  # True Negative
        FP = ...  # False Positive
        FN = ...  # False Negative

        # Print confusion matrix results
        print(f"Accuracy: {accuracy*100:.3f} % ")
        print(f"Confusion Matrix for {data_type} Data:")
        print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")
        


In [None]:
# assess performance (solution)

import matplotlib.pyplot as plt

def assess_results(model, X, y, data_type, threshold=0.5):
    """Print accuracy and confusion-matrix counts of a binary classifier.

    Args:
        model: Callable mapping ``X`` to predicted probabilities, one per
            sample (any shape holding N values, e.g. ``(N,)`` or ``(N, 1)``).
        X: Input tensor passed to ``model``.
        y: True 0/1 labels, shape ``(N,)`` or ``(N, 1)``.
        data_type: Label inserted into the printed confusion-matrix header.
        threshold: A sample is predicted positive when its probability is
            strictly greater than this value.

    Raises:
        ValueError: If the number of predictions differs from the number of labels.
    """
    with torch.no_grad():
        # Flatten both sides so a column-shaped `y` cannot broadcast against
        # the predictions into an (N, N) comparison with wrong counts.
        p_hat = model(X).reshape(-1)  # predicted probabilities
        y_true = y.reshape(-1)
        if p_hat.numel() != y_true.numel():
            raise ValueError(
                f"got {p_hat.numel()} predictions for {y_true.numel()} labels"
            )

        y_hat = (p_hat > threshold).float()  # hard 0/1 predictions

        # Compute accuracy: fraction of samples whose prediction matches the label
        accuracy = torch.mean((y_hat == y_true).float()).item()

        # Compute confusion matrix
        TP = torch.sum((y_hat == 1) & (y_true == 1)).item()  # True Positive
        TN = torch.sum((y_hat == 0) & (y_true == 0)).item()  # True Negative
        FP = torch.sum((y_hat == 1) & (y_true == 0)).item()  # False Positive
        FN = torch.sum((y_hat == 0) & (y_true == 1)).item()  # False Negative

        # Print confusion matrix results
        print(f"Accuracy: {accuracy*100:.3f} % ")
        print(f"Confusion Matrix for {data_type} Data:")
        print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")
        


In [None]:
# Report the same metrics on the training split and on the held-out test split
for heading, inputs, targets, split_name in (
    ('Training results', X_train_tensor, y_train_tensor, 'train'),
    ('Test results', X_test_tensor, y_test_tensor, 'test'),
):
    print(heading)
    assess_results(model, inputs, targets, data_type=split_name)
