# Notebook to predict survival of passengers

First model: try Pclass & sex to predict (they have no missing values)
Then include embarkment: check how sklearn handles missing values

In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
from sklearn.naive_bayes import CategoricalNB #BernoulliNB

In [3]:
#### User input ####
train_data = "../data/titanic/train.csv"

## the columns used for training
cols_read = ['PassengerId','Survived', 'Pclass', 'Sex', 'Age','Embarked']

## schema for the training dataset
train_schema = {'PassengerId': np.int16,
               'Survived': np.int16,
               'Pclass': np.int16,
               'Sex': str,
               'Age': np.float64,
               'Embarked': str}

In [4]:
## read data into Dataframe
df_train = pd.read_csv(train_data, usecols = cols_read, dtype = train_schema)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int16  
 1   Survived     891 non-null    int16  
 2   Pclass       891 non-null    int16  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   Embarked     889 non-null    object 
dtypes: float64(1), int16(3), object(2)
memory usage: 26.2+ KB


In [5]:
def trans_cols(df, col_name, new_name, trans_dict):
    for key in trans_dict.keys():
        df.loc[df[col_name] == key, new_name] = trans_dict[key]
    return df

In [6]:
#### User input ####
sex_dict = {"female": 1,
           "male": 2}

emb_dict = {"S": 1,
           "Q": 2,
           "C": 3}

In [7]:
## create numeral values for the enumerated classes
df_train = trans_cols(df_train, "Sex", "Sex_num", sex_dict)
df_train = trans_cols(df_train, "Embarked", "Embarked_num", emb_dict)
df_train["Embarked_num"].fillna(0, inplace=True)
df_train.info()
df_train.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int16  
 1   Survived      891 non-null    int16  
 2   Pclass        891 non-null    int16  
 3   Sex           891 non-null    object 
 4   Age           714 non-null    float64
 5   Embarked      889 non-null    object 
 6   Sex_num       891 non-null    float64
 7   Embarked_num  889 non-null    float64
dtypes: float64(3), int16(3), object(2)
memory usage: 40.2+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Sex_num,Embarked_num
0,1,0,3,male,22.0,S,1.0,0.0
1,2,1,1,female,38.0,C,0.0,2.0
2,3,1,3,female,26.0,S,0.0,0.0
3,4,1,1,female,35.0,S,0.0,0.0
4,5,0,3,male,35.0,S,1.0,0.0
5,6,0,3,male,,Q,1.0,1.0
6,7,0,1,male,54.0,S,1.0,0.0
7,8,0,3,male,2.0,S,1.0,0.0
8,9,1,3,female,27.0,S,0.0,0.0
9,10,1,2,female,14.0,C,0.0,2.0


In [8]:
## survival rates as a function of category

## by Pclass
pcl_df_count = df_train.groupby("Pclass")["Survived"].count()
pcl_df_sum = df_train.groupby("Pclass")["Survived"].sum()
print(pcl_df_sum/pcl_df_count)

## by sex 
sex_df_count = df_train.groupby("Sex")["Survived"].count()
sex_df_sum = df_train.groupby("Sex")["Survived"].sum()
print(sex_df_sum/sex_df_count)

## by embarkment 
emb_df_count = df_train.groupby("Embarked")["Survived"].count()
emb_df_sum = df_train.groupby("Embarked")["Survived"].sum()
print(emb_df_sum/emb_df_count)

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64
Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


### Conclusions
Both the sex and social class are important parameters to predict whether someone survives. 
Also embarkment seems to play a role survival rate.

In [9]:
## create simple model (Naive bayes)
X = np.array([np.array(df_train["Pclass"]), np.array(df_train["Sex_num"]), np.array(df_train["Embarked_num"])]).T
Y = np.array(df_train["Survived"])
#print(Y)

clf = CategoricalNB()
model = clf.fit(X, Y)

ValueError: Input X contains NaN.
CategoricalNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
results = model.predict(X)
#print(results)

In [None]:
error_rate = np.sum(np.abs(Y-results))/len(results)
print(1. - error_rate)