In [17]:
%matplotlib inline

In [18]:
import numpy as np
import pandas as pd

In [19]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

In [20]:
# Load the processed training set and remove, in a single pass, the columns
# that are not kept for modelling: PassengerId, SibSp and Parch.
train_df = (
    pd.read_csv('../data/processed/train.csv')
    .drop(['PassengerId', 'SibSp', 'Parch'], axis=1)
)

In [22]:
# Load the processed test set and remove SibSp / Parch. PassengerId is kept
# here because it is written to the results file at the end of the notebook.
test_df = pd.read_csv('../data/processed/test.csv').drop(['SibSp', 'Parch'], axis=1)

In [23]:
# Preview the first two rows of the training frame after the column drops.
train_df.head(2)

Unnamed: 0,Survived,Pclass,Age,Fare,female,male
0,0,3,22.0,7.25,0.0,1.0
1,1,1,38.0,71.2833,1.0,0.0


In [24]:
# Preview the first two rows of the test frame after the column drops.
test_df.head(2)

Unnamed: 0,PassengerId,Pclass,Age,Fare,female,male
0,892,3,34.5,7.8292,0.0,1.0
1,893,3,47.0,7.0,1.0,0.0


In [25]:
# Feature columns: every column of the training frame except the target.
train_cols = [name for name in train_df.columns if name != 'Survived']
print(train_cols)

['Pclass', 'Age', 'Fare', 'female', 'male']


In [26]:
# Split the training frame into the feature matrix (X) and the target (Y).
X, Y = train_df[train_cols], train_df['Survived']

In [27]:
# Trackers for the feature-count sweep below: the best k seen so far and the
# score it achieved (0.0 so that the first evaluated k always replaces it).
best_i, best_model = 1, 0.0

In [28]:
# Sweep k = 1..len(train_cols) for a SelectKBest -> GaussianNB pipeline and
# remember the k with the highest combined score in best_i / best_model.
#
# The trackers are reset here so that re-running this cell on its own starts
# from a clean slate instead of comparing against a stale best_model left
# over from a previous run.
best_i = 1
best_model = 0.0

for i in range(1, len(train_cols) + 1):
    print(i)
    select = SelectKBest(k=i)
    model = GaussianNB()
    pipeline = make_pipeline(select, model)

    # Fit on the full training set; `predictions` and `predict_proba` are
    # therefore in-sample (training) outputs.
    pipeline.fit(X, Y)
    predictions = pipeline.predict(X)
    # Second column of predict_proba: probability of the second class in the
    # fitted classifier's class ordering.
    predict_proba = pipeline.predict_proba(X)[:, 1]

    # 10-fold cross-validated score of the whole pipeline (selection + model).
    cv_score = cross_validation.cross_val_score(pipeline, X, Y, cv=10)

    # Compute each metric once and reuse it for both reporting and scoring.
    accuracy = metrics.accuracy_score(Y.values, predictions)
    auc = metrics.roc_auc_score(Y, predict_proba)
    cv_mean = np.mean(cv_score)

    print("Accuracy : %.4g" % accuracy)
    print("AUC Score (Train): %f" % auc)
    print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (
        cv_mean, np.std(cv_score), np.min(cv_score), np.max(cv_score)))
    print('\n')

    # Combined selection score. The previous version added the training AUC
    # twice and never used the cross-validated score; the duplicated term is
    # replaced by the mean CV score so held-out performance takes part in the
    # choice of k.
    model_score = accuracy + auc + cv_mean
    if model_score > best_model:
        best_model = model_score
        best_i = i

1
Accuracy : 0.7868
AUC Score (Train): 0.766873
CV Score : Mean - 0.7866982 | Std - 0.02794224 | Min - 0.741573 | Max - 0.8426966


2
Accuracy : 0.7868
AUC Score (Train): 0.766873
CV Score : Mean - 0.7866982 | Std - 0.02794224 | Min - 0.741573 | Max - 0.8426966


3
Accuracy : 0.7868
AUC Score (Train): 0.832835
CV Score : Mean - 0.7866982 | Std - 0.02794224 | Min - 0.741573 | Max - 0.8426966


4
Accuracy : 0.7811
AUC Score (Train): 0.818378
CV Score : Mean - 0.7811179 | Std - 0.02999017 | Min - 0.741573 | Max - 0.8426966


5
Accuracy : 0.7811
AUC Score (Train): 0.831810
CV Score : Mean - 0.7811179 | Std - 0.02999017 | Min - 0.741573 | Max - 0.8426966




In [29]:
# Rebuild the pipeline with the selected number of features and fit it on the
# full training set (Pipeline.fit returns the pipeline itself).
select, model = SelectKBest(k=best_i), GaussianNB()
pipeline = make_pipeline(select, model).fit(X, Y)

# In-sample outputs of the final model: probability column 1 and hard labels.
predict_proba = pipeline.predict_proba(X)[:, 1]
predictions = pipeline.predict(X)

In [30]:
# Preview the first rows of the training feature matrix.
X.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0.0,1.0
1,1,38.0,71.2833,1.0,0.0
2,3,26.0,7.925,1.0,0.0
3,1,35.0,53.1,1.0,0.0
4,3,35.0,8.05,0.0,1.0


In [31]:
# Preview the first rows of the test frame to compare its columns with X.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,female,male
0,892,3,34.5,7.8292,0.0,1.0
1,893,3,47.0,7.0,1.0,0.0
2,894,2,62.0,9.6875,0.0,1.0
3,895,3,27.0,8.6625,0.0,1.0
4,896,3,22.0,12.2875,1.0,0.0


In [32]:
# Build the test feature matrix by selecting the same named columns, in the
# same order, that the model was fitted on (train_cols). Selecting by name
# instead of slicing off the first column does not depend on PassengerId being
# first, and stays correct if this cell is re-run after a 'Survived' column
# has been added to test_df.
test_noid = test_df[train_cols]

In [33]:
# Predict labels for the test features with the fitted pipeline and store
# them on the test frame as the 'Survived' column.
test_predictions = pipeline.predict(test_noid)
test_df['Survived'] = test_predictions

In [34]:
# Columns written to the results file: the passenger id and its prediction.
output_cols = ['PassengerId', 'Survived']

In [35]:
# Keep only the output columns and write them to CSV without the row index.
output = test_df.loc[:, output_cols]
output.to_csv('../data/results_nb.csv', index=False)