# Survival prediction of Titanic passengers

---
## Set up 

### Load modules

In [None]:
# libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

#local modules
from barplot import plot_barplot

### Set display options

In [None]:
# Display the value of every top-level expression statement in a cell, not
# only the last one. Later cells rely on this to show intermediate values
# (e.g. a bare variable name in the middle of a cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Enable automatic layout adjustment so axis labels are not cut off in figures
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

---
## Data loader

In [None]:
# read the metadata table that accompanies the data sets and display it
meta_data = pd.read_csv(filepath_or_buffer="data/metadata.csv")
meta_data

In [None]:
# read the training split, report its (rows, columns) shape and preview the first rows
train_data = pd.read_csv(filepath_or_buffer="data/titanic-train.csv")
train_shape = train_data.shape
print("Shape: ", train_shape)
train_data.head()

In [None]:
# read the test split, report its (rows, columns) shape and preview the first rows
test_data = pd.read_csv(filepath_or_buffer="data/titanic-test.csv")
test_shape = test_data.shape
print("Shape: ", test_shape)
test_data.head()

---
## Data exploration

### Check if the datasets contain missing values

In [None]:
# count the missing entries per column in each data set ...
train_missing = train_data.isna().sum()
test_missing = test_data.isna().sum()

# ... and put both counts side by side, one row per column name
missing_values = pd.DataFrame({'Training set': train_missing, 'Test set': test_missing})
missing_values

Conclusion: There are many missing values for the age of passengers and the cabin type. Therefore, these features will be excluded from the following analyses.

### Count the number of unique values of features of interest

In [None]:
# Number of distinct (non-missing) values for each feature of interest.
# Returned as a single Series indexed by feature name, so every count is
# labelled and the cell does not depend on the "display all expressions"
# notebook setting to show more than the last value.
features_of_interest = ["Sex", "SibSp", "Parch", "Fare"]
train_data[features_of_interest].nunique()

Conclusion: there are many different fares, which are presumably associated with the ticket class. Let's check this:

### Investigate fares

In [None]:
# split the training data by passenger class; the three subsets are reused by later cells
class1, class2, class3 = (
    train_data.loc[train_data['Pclass'] == pclass] for pclass in (1, 2, 3)
)

# report the highest and lowest fare of each class, separated by a blank line
for pclass, passengers in zip((1, 2, 3), (class1, class2, class3)):
    if pclass > 1:
        print()
    fares = passengers["Fare"]
    print(f"Max fare class {pclass}: ", fares.max())
    print(f"Min fare class {pclass}: ", fares.min())

In [None]:
# collect the fares of each passenger class as plain arrays
data_list = [passengers["Fare"].to_numpy() for passengers in (class1, class2, class3)]

# one histogram panel per class, side by side
fig, ax = plt.subplots(1, len(data_list), figsize=(15, 5))
for class_no, (panel, fares) in enumerate(zip(ax, data_list), start=1):
    panel.hist(fares, bins=20)
    panel.set_title(f"Class {class_no}")
    panel.set_xlabel("Fares")
    panel.set_ylabel("Frequency")

Conclusion: The fares of the 3 different classes overlap, especially the fares of class 2 and 3. It might therefore be more useful to predict survival rates depending on passenger class rather than fare. Let's check among the categorical features if there are categories that are (strongly) associated with survival rate.

### Investigate survival rates per categories

#### Passenger class: 

In [None]:
# class labels as sorted strings, ready to be used as tick labels when plotting
categories_class = sorted(map(str, train_data["Pclass"].unique()))
categories_class

# percentage of survivors (one decimal) within each passenger class, in class order 1, 2, 3
survivors_per_class = [
    round(passengers["Survived"].sum() / len(passengers["Survived"]) * 100, 1)
    for passengers in (class1, class2, class3)
]

# keep the individual per-class values available under their own names
class1_surv, class2_surv, class3_surv = survivors_per_class

In [None]:
# plot survivors per class
# Passes the class labels and the matching survival percentages (both ordered
# class 1, 2, 3) to plot_barplot from the local `barplot` module.
# NOTE(review): the helper's signature is not visible here — presumably
# (categories, values, title, xlabel); verify against barplot.py.
plot_barplot(categories_class, 
             survivors_per_class, 
             title="Survivors per passenger class", 
             xlabel="Passenger classes")

Conclusion: the survival rate seems to be correlated with the passenger class, so the passenger class is likely a useful feature for predicting survival.

#### Gender:

In [None]:
# save categories in list (order of first appearance in the data)
categories_gender = list(map(str, train_data["Sex"].unique()))
categories_gender

# calculate percentage of survivors per gender
men = train_data.loc[train_data.Sex == 'male', "Survived"].to_numpy()
women = train_data.loc[train_data.Sex == 'female', "Survived"].to_numpy()
men_surv = round(men.sum()/len(men)*100, 1)
women_surv = round(women.sum()/len(women)*100, 1)

# store results in list, ordered exactly like categories_gender.
# The labels come from unique(), whose order depends on the data, so the
# values are looked up by label instead of assuming "male" comes first;
# an unexpected label fails loudly (KeyError) rather than mislabelling a bar.
survival_by_gender = {'male': men_surv, 'female': women_surv}
survivors_per_gender = [survival_by_gender[gender] for gender in categories_gender]

In [None]:
# plot survivors per gender
# Bar chart of the survival percentage per gender; the i-th value in
# survivors_per_gender is drawn against the i-th label in categories_gender.
# NOTE(review): plot_barplot is defined in the local `barplot` module —
# presumably (categories, values, title, xlabel); verify there.
plot_barplot(categories_gender, 
             survivors_per_gender, 
             title="Survivors per gender", 
             xlabel="Gender")

Conclusion: the survival rate of women is much higher than the survival rate of men. Therefore, the gender likely has a strong influence on the prediction of survival.

#### Number of siblings/ spouses aboard

In [None]:
# distinct numbers of siblings/spouses aboard, in ascending numeric order
sibsp_values = sorted(train_data["SibSp"].unique())

# survival flags of the passengers in each SibSp group
survived_by_sibsp = (
    train_data.loc[train_data.SibSp == count, "Survived"].to_numpy()
    for count in sibsp_values
)

# percentage of survivors (one decimal) per group, same order as sibsp_values
survivors_per_sibsp = [
    round(sum(flags) / len(flags) * 100, 1) for flags in survived_by_sibsp
]

# string labels for plotting
categories_sibsp = [str(count) for count in sibsp_values]
categories_sibsp

In [None]:
# plot survivors per number of siblings/ spouses aboard
# Bar chart of the survival percentage per number of siblings/spouses aboard;
# labels and values are both ordered by ascending SibSp value.
# NOTE(review): plot_barplot is defined in the local `barplot` module —
# presumably (categories, values, title, xlabel); verify there.
plot_barplot(categories_sibsp, 
             survivors_per_sibsp, 
             title="Survivors per number of siblings/ spouses aboard", 
             xlabel="Number of siblings/ spouses aboard")

Conclusion: The people with 1 or 2 siblings/spouses aboard had the highest rate of survival. This could mean that these people had support from family members in getting a spot in one of the lifeboats. Therefore, the number of siblings/spouses might be associated with the chance of survival.

#### Number of parents/ children aboard

In [None]:
# distinct numbers of parents/children aboard, in ascending numeric order
parch_values = sorted(train_data["Parch"].unique())

# survival flags of the passengers in each Parch group
survived_by_parch = (
    train_data.loc[train_data.Parch == count, "Survived"].to_numpy()
    for count in parch_values
)

# percentage of survivors (one decimal) per group, same order as parch_values
survivors_per_parch = [
    round(sum(flags) / len(flags) * 100, 1) for flags in survived_by_parch
]

# string labels for plotting
categories_parch = [str(count) for count in parch_values]
categories_parch

In [None]:
# plot survivors per number of parents/ children aboard
# Bar chart of the survival percentage per number of parents/children aboard;
# labels and values are both ordered by ascending Parch value.
# NOTE(review): plot_barplot is defined in the local `barplot` module —
# presumably (categories, values, title, xlabel); verify there.
plot_barplot(categories_parch, 
             survivors_per_parch, 
             title="Survivors per number of parents/ children aboard", 
             xlabel="Number of parents/ children aboard")

Conclusion: The people who had between 1 and 3 parents/children aboard had the highest rate of survival. As above, this could mean that these people had support from family members in getting a spot in one of the lifeboats. Therefore, the number of parents/children might be associated with the chance of survival.

#### Port of embarkation

In [None]:
# save categories in list
# note: passengers with an unknown port of embarkation (NaN) are left out.
# They are dropped explicitly here instead of relying on the stringified
# 'nan' being the last unique value; the list then holds known ports only,
# so the `[:3]` slice used by the plotting cell keeps working.
categories_embarked = list(map(str, train_data["Embarked"].dropna().unique()))
categories_embarked

# calculate percentage of survivors per port of embarkation,
# in the same order as categories_embarked
survivors_per_port = []
for port_code in categories_embarked:
    port = train_data.loc[train_data.Embarked == port_code, "Survived"].to_numpy()
    survivors_per_port.append(round(port.sum()/len(port)*100, 1))

In [None]:
# plot survivors per port of embarkation
# Bar chart of the survival percentage per port of embarkation.
# Only the first three labels are passed, matching the three ports for which
# survivors_per_port was computed.
# NOTE(review): plot_barplot is defined in the local `barplot` module —
# presumably (categories, values, title, xlabel); verify there.
plot_barplot(categories_embarked[:3], 
             survivors_per_port, 
             title="Survivors per port of embarkation", 
             xlabel="Port of embarkation")

Conclusion: the survival rate of people who embarked in Cherbourg is higher than that of people who embarked in Southampton or Queenstown. This could be due to many first class passengers having embarked in Cherbourg. This hypothesis still needs to be checked.

---
## Models

### Random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# target: survival flag of the training passengers
y = train_data["Survived"]

# one-hot encode the non-numeric features among the selected columns
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# Encode the test set the same way, then force it onto the training columns
# (same set, same order). get_dummies derives its columns from the values it
# sees, so a category present in only one of the two sets would otherwise
# give the model a differently shaped or ordered matrix at prediction time.
# Dummy columns missing from the test set are filled with 0; extra ones are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# fixed random_state keeps the fitted forest reproducible between runs
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# write the predictions, keyed by passenger id, to the submission file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)

In [None]:
# show the submission frame (PassengerId, predicted Survived) that was written to my_submission.csv
output