In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline
from __future__ import division
from collections import Counter

In [2]:
# Read the training data
raw = pd.read_table( '../input/train.csv', delimiter=',')

In [3]:
raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [4]:
# Check which columns have missing values
raw.apply(lambda x: sum(x.isnull())/len(x))

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [5]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [6]:
def print_cats(df):
    for col in ['Pclass', 'Sex', 'Embarked']:
        print col
        print Counter(df[col]).most_common()

In [7]:
print_cats(raw)

Pclass
[(3, 491), (1, 216), (2, 184)]
Sex
[('male', 577), ('female', 314)]
Embarked
[('S', 644), ('C', 168), ('Q', 77), (nan, 2)]


In [144]:
def clean(raw):
    cleaned = raw.copy()
    cleaned.loc[cleaned['Age'].isnull(), ['Age']] = np.nanmedian(raw['Age'])
    cleaned.loc[cleaned['Embarked'].isnull(),['Embarked']] = Counter(raw['Embarked']).most_common(1)[0][0]
    cleaned['IsChild'] = 1.0*(cleaned['Age'] < 20)
    cleaned['IsFemale'] = 1.0*(cleaned['Sex'] == 'male')
    cleaned['IsUpperClass'] = 1.0*(cleaned['Pclass']==1)
    return cleaned

In [145]:
cleaned = clean(raw)
cleaned.apply(lambda x: sum(x.isnull())/len(x))

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age             0.000000
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin           0.771044
Embarked        0.000000
IsChild         0.000000
IsFemale        0.000000
IsUpperClass    0.000000
dtype: float64

In [146]:
print_cats(cleaned)

Pclass
[(3, 491), (1, 216), (2, 184)]
Sex
[('male', 577), ('female', 314)]
Embarked
[('S', 646), ('C', 168), ('Q', 77)]


In [57]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [147]:
nTrain = 891
features = ['IsFemale', 'IsChild', 'IsUpperClass']

In [148]:
cleaned.groupby(features)['Survived'].mean()

IsFemale  IsChild  IsUpperClass
0         0        0               0.641509
                   1               0.975000
          1        0               0.655738
                   1               0.928571
1         0        0               0.112601
                   1               0.356522
          1        0               0.268293
                   1               0.571429
Name: Survived, dtype: float64

In [149]:
clf = MultinomialNB()
clf.fit(cleaned[features].values, cleaned['Survived'].values)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [150]:
confusion_matrix(cleaned.Survived.values,clf.predict(cleaned[features].values))

array([[522,  27],
       [207, 135]])

In [151]:
accuracy_score(cleaned.Survived.values,clf.predict(cleaned[features].values), normalize=True) 

0.73737373737373735

In [152]:
# Predict the test set
test = pd.read_table('../input/test.csv', delimiter=',')
cleaned_test = clean(test)
cleaned_test['Survived'] = clf.predict(cleaned_test[features].values )

In [153]:
cleaned_test[['PassengerId', 'Survived']].describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.188995
std,120.810458,0.391974
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,0.0
max,1309.0,1.0


In [160]:
cleaned_test.to_csv('naive_bayes_prediction.csv', columns=['PassengerId', 'Survived'], index=False)