In [None]:
import numpy as np
import util
import pickle
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
import pdb

In [None]:
# Load the training data. fetch_data returns four values, unpacked here as
# header, ids, X and Y; with impute=True it presumably fills in missing
# values -- TODO confirm against util.fetch_data.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile, impute=True)

# One fixed seed: it seeds NumPy's global RNG and is also passed to
# train_test_split below as random_state.
seed=229
np.random.seed(seed)

# Hold out 20% of the rows as a test set; the remaining 80% is the training split.
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Training

In [None]:
# Fit a Gaussian Naive Bayes classifier with default settings.
# NOTE(review): this fits on the full (X, Y), not on X_train/y_train, so the
# held-out X_test rows are part of this fit. Whether that affects the
# evaluation below depends on whether util.learning_curves refits the
# estimator -- confirm.
clf = GaussianNB()
clf.fit(X, Y)

In [None]:
# Persist the fitted classifier to disk. The `with` block guarantees the file
# handle is closed even if pickle.dump raises part-way through.
with open('models/gaussian_nb.pickle', 'wb') as gnb_out:
    pickle.dump(clf, gnb_out)

# Evaluation

In [None]:
# Reload the classifier saved by the training section.
# The file must be opened in binary mode ('rb'): pickle.load needs a bytes
# stream, and a text-mode handle fails on Python 3. The `with` block also
# closes the handle, which the bare open(...) call never did.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# (or another trusted source) produced.
with open('models/gaussian_nb.pickle', 'rb') as gnb_in:
    clf = pickle.load(gnb_in)

In [None]:
# Compute learning-curve data for clf on the train/test split, scored with
# util.gini_proba_scorer. The call returns three values, unpacked as the
# training-set sizes and the train/test scores at each size.
# NOTE(review): presumably learning_curves refits the estimator on growing
# subsets of X_train -- confirm against util.learning_curves.
trainsizes, traingini, testgini = util.learning_curves(clf, X_train, X_test, y_train, y_test,
                                                       scorer=util.gini_proba_scorer)
# Print the raw train and test scores, separated by a blank line.
print(traingini)
print('')
print(testgini)

In [None]:
# Draw the train and test score curves against training-set size on a fresh
# figure, then save the result as an EPS file.
plt.figure()
for curve, curve_label in ((traingini, 'train gini'), (testgini, 'test gini')):
    plt.plot(trainsizes, curve, label=curve_label)
plt.title('Gaussian Naive Bayes')
plt.xlabel('Training set size')
plt.ylabel('Normalized Gini coefficient')
plt.legend()
plt.savefig(
    'figures/learning_curves/gaussianNB.eps',
    format='eps',
    dpi=1000,
)

In [None]:
# Display the current figure; this runs after the savefig call in the previous cell.
plt.show()

# Output

In [None]:
# Refit on the full data set before producing the submission.
clf.fit(X, Y)
# The score printed here is computed on the same (X, Y) the model was just
# fit on, so it is an in-sample figure, not a held-out estimate.
print(util.gini_proba_scorer(clf, X, Y))

In [None]:
# Generate predictions with the refitted classifier.
# NOTE(review): presumably make_prediction reads 'data/test.csv' and writes
# the results to 'predictions/gaussian_nb.csv', and util.proba_method(clf)
# supplies a probability-based predict function -- confirm against util.
util.make_prediction(clf, 'data/test.csv', 'predictions/gaussian_nb.csv',
                     predict_method=util.proba_method(clf))

# Multinomial Naive Bayes (unused)

In [None]:
# dlf = MultinomialNB()
# dlf.fit(np.maximum(X,0), Y) # some -1's in X indicate unknown values; need to handle these

# print(util.gini_scorer(dlf, np.maximum(X,0), Y))


In [None]:
# #util.make_prediction(clf, 'data/test.csv', 'predictions/multinomial_nb.csv',
#                      lambda X_test: dlf.predict_proba(X_test)[:,1])