In [17]:
import numpy as np
from scipy import stats
from sklearn import datasets
from sklearn import model_selection as ms
from sklearn.naive_bayes import GaussianNB
import time

# Problem 1
Write a binary naive Bayes classifier from scratch to classify data that has normally distributed features ($x$) and binary outputs ($y$). Take care to prevent underflow in the very tiny products that occur in the intermediate computations.

In [12]:
def binary_naive_bayes(x, y):
    '''
    this function accepts training data x and y
    then returns a new function that will classify
    any new data x. Assumes each feature is normally
    distributed.
    
    n - number of observations
    d - number of features
    x is an nxd array of normally distributed features
    y is an nx1 array of ones and zeros
    '''
    # y=1
    x1 = x[y==1]
    mu1 = []
    sigma1 = []
    for i in range(len(x1[0])):
        mu1.append(x1[:,i].mean())
        sigma1.append(x1[:,i].std())
    
    # y=0
    x0 = x[y==0]
    mu0 = []
    sigma0 = []
    for i in range(len(x1[0])):
        mu0.append(x0[:,i].mean())
        sigma0.append(x0[:,i].std())
        
    # create classifying fuction
    def classifier(Xtest):
        yhat = []
        for x in Xtest:
            # classify probability of y=0
            logprob0 = 0
            for k in range(len(x)):
                logprob0 += np.log(stats.norm.pdf(x[k], mu0[k], sigma0[k]))
            prob0 = np.exp(logprob0)*(1-y.mean())

            # classify probability of y=1
            logprob1 = 0
            for k in range(len(x)):
                logprob1 += np.log(stats.norm.pdf(x[k], mu1[k], sigma1[k]))
            prob1 = np.exp(logprob1)*y.mean()

            yhat.append(np.argmax([prob0, prob1]))
        return np.array(yhat)
    
    return classifier

# Problem 2
Apply your classifier to the `scikit-learn` cancer data set with a 70-30 train-test split.

In [19]:
cancer = datasets.load_breast_cancer()
X,y = cancer.data, cancer.target
xtrain, xtest, ytrain, ytest = ms.train_test_split(X,y, test_size = .7)

start = time.clock()
trained_model = binary_naive_bayes(xtrain, ytrain)
my_train_time = time.clock() - start

print("Accuracy:\t{}".format((trained_model(xtest)==ytest).mean()))
print("Training Time:\t{}".format(my_train_time))



Accuracy:	0.93984962406
Training Time:	0.00445578573181


# Problem 3
Compare your results (training time and test accuracy) to the `scikit-learn` naive bayes classifier.

In [20]:
gnb = GaussianNB()
start = time.clock()
gnb.fit(xtrain, ytrain)
sklearn_train_time = time.clock() - start

print("Accuracy:\t{}".format((gnb.predict(xtest)==ytest).mean()))
print("Training Time:\t{}".format(sklearn_train_time))

Accuracy:	0.937343358396
Training Time:	0.00127384354073


We can observe that Scikit Learn's training time is quicker than mine by about a factor of 4, but my accuracy is ever so slightly higher. This I am proud of.