## Titanic Kaggle Challenge

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Read both Kaggle CSVs from one shared data directory so the two paths
# are spelled the same way and only need changing in one place.
DATA_DIR = 'data'
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Dimensions of the freshly loaded training frame as (rows, columns).
train_df.shape

(891, 12)

In [4]:
# Dimensions of the test frame as (rows, columns); it has one column fewer than train.
test_df.shape

(418, 11)

#### PassengerId, Name and Ticket should not be significant

In [5]:
# Identifier-like columns that are not used as model features.
ID_COLUMNS = ['PassengerId', 'Name', 'Ticket']

# Keep the test PassengerIds before dropping them: a Kaggle submission file
# pairs each prediction with its PassengerId.
if 'PassengerId' in test_df.columns:
    test_passenger_ids = test_df['PassengerId'].copy()

# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=ID_COLUMNS, errors='ignore')
test_df = test_df.drop(columns=ID_COLUMNS, errors='ignore')
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [6]:
# Sanity check: the training frame should now have three fewer columns.
train_df.shape

(891, 9)

## Missing Values

In [7]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### Imputation Methods
- for Age use the average age

In [8]:
# Impute missing ages with the mean of the observed ages.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# train_df['Age'] selection is chained assignment, which pandas deprecates and
# which leaves train_df unchanged when copy-on-write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

In [9]:
# Distinct values found in Embarked, including NaN for the missing entries.
train_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# How often each Embarked code occurs (missing values are not counted).
train_df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

- For Embarked, fill the missing values with the mode, since I don't know the meaning of S, C and Q. If one of them meant "not embarked", I would have used that value instead.

In [11]:
# Most frequent Embarked code; mode() returns a Series, so take its first entry.
train_df['Embarked'].mode()[0]

'S'

In [12]:
# Replace the missing Embarked entries with the most frequent code.
most_common_port = train_df['Embarked'].mode().iloc[0]
train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)

#### What to do for cabin?

In [13]:
# Re-count missing values: only Cabin should still have gaps.
train_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [14]:
# Number of distinct Cabin values (NaN is excluded by default).
train_df['Cabin'].nunique()

147

In [15]:
# List the distinct Cabin values to judge whether the column is usable as a feature.
train_df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

Cabin is filled in for only 891 - 687 = 204 rows, spread over 147 distinct values, so it is not very meaningful... drop it.

In [16]:
# Drop the mostly-empty Cabin column. Reassigning (rather than inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run.
train_df = train_df.drop(columns=['Cabin'], errors='ignore')
# Confirm that no missing values remain in the training frame.
train_df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

#### now adjust the test set

In [17]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [18]:
# Mirror the training preprocessing on the test set: drop Cabin.
# Reassigning with errors='ignore' keeps the cell safe to re-run.
test_df = test_df.drop(columns=['Cabin'], errors='ignore')
# Fill missing test ages with the mean age of the training set.
# The result is assigned back to the column: fillna(inplace=True) on the
# test_df['Age'] selection is chained assignment, which pandas deprecates and
# which leaves test_df unchanged when copy-on-write is enabled.
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].mean())

In [19]:
# Number of distinct Fare values in the test set (NaN is excluded by default).
test_df['Fare'].nunique()

169

In [20]:
# Mean test-set fare; this is the value used below to fill the single missing Fare.
test_df['Fare'].mean()

35.6271884892086

In [21]:
# Fill the missing test Fare with the mean of the observed test fares.
# The result is assigned back to the column: fillna(inplace=True) on the
# test_df['Fare'] selection is chained assignment, which pandas deprecates and
# which leaves test_df unchanged when copy-on-write is enabled.
# NOTE(review): Age is imputed from a training-set statistic while Fare uses a
# test-set statistic; consider using the training mean here as well.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [22]:
# Confirm that the test frame has no missing values left.
test_df.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

## I want to use Naive Bayes
#### It is quite simple and does not require huge data preparation
#### Assumptions we are making (hidden drawbacks):
 - Attributes are equally important
 - Attributes are statistically independent of each other (given the class)

In [23]:
# Dimensions of the cleaned training frame as (rows, columns).
train_df.shape

(891, 8)

In [24]:
# Preview the cleaned training frame before building the model inputs.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


#### I use CategoricalNB for categorical variables and GaussianNB for continuous variables

In [25]:
from mixed_naive_bayes import MixedNB

In [26]:
# Separate the target column (Survived) from the feature columns.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

In [27]:
# Distinct Pclass values, to see how this feature is encoded.
X['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [28]:
# Distinct Sex values; they are strings, not numeric codes.
X['Sex'].unique()

array(['male', 'female'], dtype=object)

In [29]:
# Distinct SibSp values present in the training data.
X['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [30]:
# Distinct Parch values present in the training data.
X['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)