# Notebook to predict survival of passengers

First model: use Pclass and Sex as predictors (neither has missing values).
Then add the port of embarkation (Embarked) and check how sklearn handles its missing values.

In [1]:
import numpy as np
import pandas as pd

import survival_funcs as fcs

In [2]:
from sklearn.naive_bayes import CategoricalNB 

In [3]:
#### User input ####
## path to the training CSV
train_data = "../data/titanic/train.csv"

## columns to load from the CSV
## (SibSp: number of siblings/spouses on board)
## (Parch: number of parents/children on board)
cols_read = [
    'PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
    'Embarked', 'SibSp', 'Parch', 'Fare', 'Cabin',
]

## dtypes requested when reading the training dataset;
## only the columns listed here get an explicit dtype
train_schema = dict(
    PassengerId=np.int16,
    Survived=np.int16,
    Pclass=np.int16,
    Sex=str,
    Age=np.float64,
    Embarked=str,
)

## lookup tables mapping category labels to integer codes
sex_dict = dict(female=1, male=2)
emb_dict = dict(S=1, Q=2, C=3)

In [4]:
## load only the selected columns, using the dtypes given in train_schema
df_train = pd.read_csv(
    train_data,
    usecols=cols_read,
    dtype=train_schema,
)
## summarise columns, non-null counts and dtypes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int16  
 1   Survived     891 non-null    int16  
 2   Pclass       891 non-null    int16  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Cabin        204 non-null    object 
 9   Embarked     889 non-null    object 
dtypes: float64(2), int16(3), int64(2), object(3)
memory usage: 54.1+ KB


In [5]:
## add numeric code columns ("Sex_num", "Embarked_num") for the enumerated
## classes, passing the sex_dict / emb_dict lookup tables to the helper
df_train = fcs.trans_cols(df_train, "Sex", "Sex_num", sex_dict)
df_train = fcs.trans_cols(df_train, "Embarked", "Embarked_num", emb_dict)

## replace missing "Embarked_num" values with 0 (a code not used in emb_dict).
## The result is assigned back to the column: calling fillna(..., inplace=True)
## on the column selection is chained in-place mutation, which pandas warns
## will stop modifying df_train in pandas 3.0
df_train["Embarked_num"] = df_train["Embarked_num"].fillna(0)

## inspect the training data
df_train.info()
df_train.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int16  
 1   Survived      891 non-null    int16  
 2   Pclass        891 non-null    int16  
 3   Sex           891 non-null    object 
 4   Age           714 non-null    float64
 5   SibSp         891 non-null    int64  
 6   Parch         891 non-null    int64  
 7   Fare          891 non-null    float64
 8   Cabin         204 non-null    object 
 9   Embarked      889 non-null    object 
 10  Sex_num       891 non-null    float64
 11  Embarked_num  891 non-null    float64
dtypes: float64(4), int16(3), int64(2), object(3)
memory usage: 68.0+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Embarked_num"].fillna(0, inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_num,Embarked_num
0,1,0,3,male,22.0,1,0,7.25,,S,2.0,1.0
1,2,1,1,female,38.0,1,0,71.2833,C85,C,1.0,3.0
2,3,1,3,female,26.0,0,0,7.925,,S,1.0,1.0
3,4,1,1,female,35.0,1,0,53.1,C123,S,1.0,1.0
4,5,0,3,male,35.0,0,0,8.05,,S,2.0,1.0
5,6,0,3,male,,0,0,8.4583,,Q,2.0,2.0
6,7,0,1,male,54.0,0,0,51.8625,E46,S,2.0,1.0
7,8,0,3,male,2.0,3,1,21.075,,S,2.0,1.0
8,9,1,3,female,27.0,0,2,11.1333,,S,1.0,1.0
9,10,1,2,female,14.0,1,0,30.0708,,C,1.0,3.0


In [6]:
## survival rate and group size per passenger class (Pclass)
fcs.get_group_probabilities(df_train, "Pclass")

Unnamed: 0_level_0,survival rate,people in group
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.62963,216
2,0.472826,184
3,0.242363,491


In [7]:
## survival rate and group size per sex
fcs.get_group_probabilities(df_train, "Sex")

Unnamed: 0_level_0,survival rate,people in group
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.742038,314
male,0.188908,577


In [8]:
## survival rate and group size per port of embarkation (Embarked)
fcs.get_group_probabilities(df_train, "Embarked")

Unnamed: 0_level_0,survival rate,people in group
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.553571,168
Q,0.38961,77
S,0.336957,644


In [9]:
## survival rate and group size per number of siblings/spouses on board (SibSp)
fcs.get_group_probabilities(df_train, "SibSp")

Unnamed: 0_level_0,survival rate,people in group
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.345395,608
1,0.535885,209
2,0.464286,28
3,0.25,16
4,0.166667,18
5,0.0,5
8,0.0,7


In [10]:
## survival rate and group size per number of parents/children on board (Parch)
fcs.get_group_probabilities(df_train, "Parch")

Unnamed: 0_level_0,survival rate,people in group
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.343658,678
1,0.550847,118
2,0.5,80
3,0.6,5
4,0.0,4
5,0.2,5
6,0.0,1


### Conclusions
Both sex and social class (Pclass) are important predictors of whether a passenger survives.
The port of embarkation also seems to play a role in the survival rate.

## Naive Bayes categorical

### Using categories with no missing values

In [11]:
## instantiate the categorical naive Bayes classifier for model 1 (default parameters)
clf1 = CategoricalNB()

In [12]:
## create input data: the selected feature columns (via the survival_funcs helper)
## and the "Survived" column as target vector
X1 = fcs.get_features(df_train, ["Pclass", "Sex_num", "SibSp", "Parch"])
Y1 = np.array(df_train["Survived"])

## create model 1 and predict the results
## (prediction is done on the same rows the model was fitted on,
## so any score derived from results1 is a training score)
model1 = clf1.fit(X1, Y1)
results1 = model1.predict(X1)

In [13]:
## accuracy on the training rows: 1 minus the mean absolute
## difference between the true labels and the predictions
error_rate1 = np.abs(Y1 - results1).sum() / len(results1)
print(1. - error_rate1)

0.7946127946127945


### Also including categories with missing values

In [14]:
## instantiate a separate categorical naive Bayes classifier for model 2 (default parameters)
clf2 = CategoricalNB()

In [15]:
## create input data: the model 1 columns plus "Embarked_num"
X2 = fcs.get_features(df_train, ["Pclass", "Sex_num", "Embarked_num", "SibSp", "Parch"])

## create model 2 and predict the results (on the rows it was fitted on)
## NOTE(review): the ValueError recorded under this cell (5 features vs. 4 expected)
## does not look reproducible from this code, since clf2 is fitted on X2 right here;
## possibly stale output from an earlier version of the cell. Re-run to confirm.
model2 = clf2.fit(X2, Y1)
results2 = model2.predict(X2)

ValueError: X has 5 features, but CategoricalNB is expecting 4 features as input.

In [None]:
## accuracy on the training rows: 1 minus the mean absolute
## difference between the true labels and the predictions
error_rate2 = np.abs(Y1 - results2).sum() / len(results2)
print(1. - error_rate2)