# Imports

In [39]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import bisect
import re
import torch 
from torch import nn
import torchmetrics



# Data

In [2]:
train_data = pd.read_csv('data/train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_data = pd.read_csv('data/test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Data exploration

In [4]:
woman = train_data[train_data.Sex == 'female']['Survived']
rate_woman = sum(woman) / len(woman)
print(f'Percentage of woman who survived: {rate_woman*100:.2f}')

Percentage of woman who survived: 74.20


In [5]:
man = train_data[train_data.Sex == 'male']['Survived']
rate_man = sum(man) / len(man)
print(f'Percentage of man who survived: {rate_man*100:.2f}')

Percentage of man who survived: 18.89


# Data preparation

#### Fill missing age values with median of Age

In [6]:
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)

#### Fill missing Fare values with median of Fare

In [7]:
train_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

#### Convert 'Sex' column to integers

In [8]:
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

#### Convert 'Embarked' column to integers

In [9]:
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

#### Fill missing 'Embarked' values with most common value

In [10]:
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)

#### Create a new column 'FamilySize' as a sum of 'SibSp' and 'Parch' columns

In [11]:
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']

#### Extract 'Deck' from 'Cabin' column

In [12]:
train_data['Deck'] = train_data['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'Unknown')
test_data['Deck'] = test_data['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'Unknown')

#### Extract 'NumCabins' from 'Cabin' column

In [13]:
train_data['NumCabins'] = train_data['Cabin'].apply(lambda x: len(str(x).split(' ')) if pd.notna(x) else 0)
test_data['NumCabins'] = test_data['Cabin'].apply(lambda x: len(str(x).split(' ')) if pd.notna(x) else 0)

#### Extract 'TicketPrefix' from 'Ticket' column

In [14]:
train_data['TicketPrefix'] = train_data['Ticket'].apply(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'None')
test_data['TicketPrefix'] = test_data['Ticket'].apply(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'None')


#### Extract 'TicketFrequency' from 'Ticket' column

In [15]:
train_data['TicketFrequency'] = train_data.groupby('Ticket')['Ticket'].transform('count')
test_data['TicketFrequency'] = test_data.groupby('Ticket')['Ticket'].transform('count')


#### Drop 'Cabin', 'Ticket' and fill missing values in 'Embarked' 

In [16]:
if 'Cabin' in train_data.columns:
    train_data.drop('Cabin', axis=1, inplace=True)

if 'Cabin' in test_data.columns:
    test_data.drop('Cabin', axis=1, inplace=True)

embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'].fillna(embarked_mode, inplace=True)
test_data['Embarked'].fillna(embarked_mode, inplace=True)  # using mode from train_data

if 'Ticket' in train_data.columns:
    train_data.drop('Ticket', axis=1, inplace=True)

if 'Ticket' in test_data.columns:
    test_data.drop('Ticket', axis=1, inplace=True)


#### Extracting titles from 'Name' column

In [17]:
def extract_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

In [18]:
train_data['Title'] = train_data['Name'].apply(extract_title)
test_data['Title'] = test_data['Name'].apply(extract_title)

#### Encoding caterogical features

In [19]:
label_encoders = {}

for column in ['Deck', 'TicketPrefix', 'Title']:  # Added 'Title' to the list
    
    # Convert the columns to string type
    train_data[column] = train_data[column].astype(str)
    test_data[column] = test_data[column].astype(str)
    
    # Initialize and fit the label encoder
    le = LabelEncoder()
    le.fit(train_data[column])
    train_data[column] = le.transform(train_data[column])
    
    # Handle unseen labels in the test data
    test_data[column] = test_data[column].map(lambda x: 'Unknown' if x not in le.classes_ else x)
    
    # Add 'Unknown' to the classes of the encoder and sort them
    le_classes = le.classes_.tolist()
    if 'Unknown' not in le_classes:
        bisect.insort_left(le_classes, 'Unknown')
        le.classes_ = np.array(le_classes)
    
    # Transform the test data
    test_data[column] = le.transform(test_data[column])
    
    # Store the label encoder for future reference
    label_encoders[column] = le


# Check data after preparation

In [20]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Deck,NumCabins,TicketPrefix,TicketFrequency,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0.0,1,8,0,4,1,12
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1.0,1,2,1,18,1,13
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0.0,0,8,0,37,1,9
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0.0,1,2,1,16,2,13
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0.0,0,8,0,16,1,12


# Split data

In [21]:
X_train = train_data.drop(columns = ['Name', 'Survived'], axis=1)
y_train = train_data['Survived']
X_test = test_data.drop('Name', axis=1)

# Random Forest Classifier model

In [22]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Prediction and save results

In [24]:
predictions = rf.predict(X_test)

submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})

submission.to_csv('data/RFCsubmission.csv', index=False)

# Neural Network Model

In [27]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

#### Model

In [50]:
class TitanicNN(nn.Module):
    def __init__(self, input_dim):
        super(TitanicNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, 32)
        self.layer4 = nn.Linear(32, 16)
        self.layer5 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.layer5(x)
        return x


In [29]:
model = TitanicNN(X_train.shape[1]).to(device)
model

TitanicNN(
  (layer1): Linear(in_features=14, out_features=64, bias=True)
  (layer2): Linear(in_features=64, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=32, bias=True)
  (layer4): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)

#### Loss and optimizer

In [42]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
accuracy_fn = torchmetrics.Accuracy(task='Binary').to(device)

#### Prepare data for training

In [54]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32, device=device).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32, device=device)

train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)

#### Training loop

In [51]:
epochs = 250

for epoch in range(epochs):
    epoch_loss = 0
    epoch_acc = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = loss_fn(outputs, targets)
        epoch_loss += loss.item()

        # Compute predictions
        preds = torch.sigmoid(outputs) > 0.5
        acc = accuracy_fn(preds.int(), targets.int())
        epoch_acc += acc.item()

        loss.backward()
        optimizer.step()

    average_epoch_loss = epoch_loss / len(train_loader)
    average_epoch_acc = epoch_acc / len(train_loader)
    if epoch % 10 == 0:  
        print(f'Epoch {epoch} loss {average_epoch_loss:.4f} accuracy {average_epoch_acc:.4f}')


Epoch 0 loss 0.5634 accuracy 0.7171
Epoch 10 loss 0.5757 accuracy 0.7104
Epoch 20 loss 0.5749 accuracy 0.7127
Epoch 30 loss 0.5626 accuracy 0.7138
Epoch 40 loss 0.5689 accuracy 0.7221
Epoch 50 loss 0.5609 accuracy 0.7077
Epoch 60 loss 0.5832 accuracy 0.7021
Epoch 70 loss 0.5625 accuracy 0.7094
Epoch 80 loss 0.5646 accuracy 0.7099
Epoch 90 loss 0.5673 accuracy 0.7166
Epoch 100 loss 0.5629 accuracy 0.7182
Epoch 110 loss 0.5696 accuracy 0.7176
Epoch 120 loss 0.5679 accuracy 0.7266
Epoch 130 loss 0.5657 accuracy 0.7133
Epoch 140 loss 0.5624 accuracy 0.7200
Epoch 150 loss 0.5777 accuracy 0.6921
Epoch 160 loss 0.5623 accuracy 0.7121
Epoch 170 loss 0.5688 accuracy 0.7227
Epoch 180 loss 0.5619 accuracy 0.7122
Epoch 190 loss 0.5670 accuracy 0.7122
Epoch 200 loss 0.5713 accuracy 0.7189
Epoch 210 loss 0.5650 accuracy 0.7143
Epoch 220 loss 0.5720 accuracy 0.7066
Epoch 230 loss 0.5854 accuracy 0.7138
Epoch 240 loss 0.5659 accuracy 0.7183


#### Make predictions

In [None]:
model.eval()
with torch.inference_mode():
    predictions = model(X_test_tensor)
    predictions = torch.sigmoid(predictions) > 0.5
    predictions = predictions.cpu().numpy().astype(int).squeeze()

#### Make submission

In [58]:
submission_df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions.squeeze()  
    })

submission_df.to_csv('data/NNsubmission.csv', index=False)