# Notebook to predict survival of passengers

First model: try Pclass & sex to predict (they have no missing values)
Then include embarkment: check how sklearn handles missing values

In [1]:
import numpy as np
import pandas as pd

import survival_funcs as fcs

In [2]:
from sklearn.naive_bayes import CategoricalNB 

In [3]:
#### User input ####
train_data = "../data/titanic/train.csv"

## the columns used for training
cols_read = ['PassengerId','Survived', 'Pclass', 'Sex', 'Age','Embarked','SibSp']

## schema for the training dataset
train_schema = {'PassengerId': np.int16,
               'Survived': np.int16,
               'Pclass': np.int16,
               'Sex': str,
               'Age': np.float64,
               'Embarked': str}

## hash maps to convert enum values to numerals
sex_dict = {"female": 1,
           "male": 2}

emb_dict = {"S": 1,
           "Q": 2,
           "C": 3}

In [4]:
## read data into Dataframe
df_train = pd.read_csv(train_data, usecols = cols_read, dtype = train_schema)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int16  
 1   Survived     891 non-null    int16  
 2   Pclass       891 non-null    int16  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Embarked     889 non-null    object 
dtypes: float64(1), int16(3), int64(1), object(2)
memory usage: 33.2+ KB


In [5]:
## create numeral values for the enumerated classes
df_train = fcs.trans_cols(df_train, "Sex", "Sex_num", sex_dict)
df_train = fcs.trans_cols(df_train, "Embarked", "Embarked_num", emb_dict)

## remove nans for "Embarked_num"
df_train["Embarked_num"].fillna(0, inplace=True)

## inspect the training data
df_train.info()
df_train.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int16  
 1   Survived      891 non-null    int16  
 2   Pclass        891 non-null    int16  
 3   Sex           891 non-null    object 
 4   Age           714 non-null    float64
 5   SibSp         891 non-null    int64  
 6   Embarked      889 non-null    object 
 7   Sex_num       891 non-null    float64
 8   Embarked_num  891 non-null    float64
dtypes: float64(3), int16(3), int64(1), object(2)
memory usage: 47.1+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Embarked_num"].fillna(0, inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Embarked,Sex_num,Embarked_num
0,1,0,3,male,22.0,1,S,2.0,1.0
1,2,1,1,female,38.0,1,C,1.0,3.0
2,3,1,3,female,26.0,0,S,1.0,1.0
3,4,1,1,female,35.0,1,S,1.0,1.0
4,5,0,3,male,35.0,0,S,2.0,1.0
5,6,0,3,male,,0,Q,2.0,2.0
6,7,0,1,male,54.0,0,S,2.0,1.0
7,8,0,3,male,2.0,3,S,2.0,1.0
8,9,1,3,female,27.0,0,S,1.0,1.0
9,10,1,2,female,14.0,1,C,1.0,3.0


In [None]:
## survival rate by number of Pclass
fcs.get_group_probabilities(df_train, "Pclass")

In [None]:
## survival rate by number of Sex
fcs.get_group_probabilities(df_train, "Sex")

In [None]:
## survival rate by number of location of embarkment
fcs.get_group_probabilities(df_train, "Embarked")

In [6]:
## survival rate by number of siblings 
fcs.get_group_probabilities(df_train, "SibSp")

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


Unnamed: 0_level_0,survival rate,people in group
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.345395,608
1,0.535885,209
2,0.464286,28
3,0.25,16
4,0.166667,18
5,0.0,5
8,0.0,7


### Conclusions
Both the sex and social class are important parameters to predict whether someone survives. 
Also embarkment seems to play a role survival rate.

In [7]:
## create simple model (Naive bayes)
X = fcs.get_features(df_train, ["Pclass", "Sex_num"])
Y = np.array(df_train["Survived"])
#print(Y)

clf = CategoricalNB()
model = clf.fit(X, Y)

In [8]:
results = model.predict(X)
#print(results)

In [9]:
error_rate = np.sum(np.abs(Y-results))/len(results)
print(1. - error_rate)

0.7867564534231201
