# In [12]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
from sklearn.naive_bayes import MultinomialNB

def _fit_gaussian_nb(training, classes):
    """Estimate Gaussian naive Bayes parameters from labeled rows.

    Parameters
    ----------
    training : ndarray of shape (n_samples, n_features + 1)
        Feature columns followed by the class label in the last column.
    classes : sequence
        The label values to fit, in the order the outputs are stacked.

    Returns
    -------
    log_priors : ndarray of shape (n_classes,)
        Log of the fraction of training rows carrying each label.
    means : ndarray of shape (n_classes, n_features)
        Per-class mean of every feature.
    variances : ndarray of shape (n_classes, n_features)
        Per-class (population) variance of every feature.
    """
    labels = training[:, -1]
    features = training[:, :-1]
    log_priors = np.log(np.array([np.mean(labels == c) for c in classes]))
    means = np.vstack([features[labels == c].mean(axis=0) for c in classes])
    variances = np.vstack([features[labels == c].var(axis=0) for c in classes])
    return log_priors, means, variances


def _predict_gaussian_nb(features, log_priors, means, variances, classes):
    """Return the most probable label for each row of ``features``.

    Implements c = argmax_i [ log P(c_i) + sum_j log P(x_j | c_i) ] with
    P(x_j | c_i) a normal density with mean ``means[i, j]`` and variance
    ``variances[i, j]``.
    """
    # Shape (n_samples, n_classes, n_features): every sample against every class.
    diff = features[:, np.newaxis, :] - means[np.newaxis, :, :]
    # Evaluate the log-density directly.  Taking log(exp(...)) instead
    # underflows to log(0) = -inf for points far from a class mean.
    log_density = -0.5 * (diff ** 2 / variances
                          + np.log(2.0 * np.pi * variances))
    scores = log_priors + log_density.sum(axis=2)
    return np.asarray(classes)[np.argmax(scores, axis=1)]


def prob1():
    """Classify the seed data with a hand-written Gaussian naive Bayes.

    Loads "seeds_dataset.txt", holds out 40 random rows as a test set, fits
    the classifier on the remaining rows, and prints the test accuracy.

    Returns
    -------
    float
        Percentage of test rows that were classified correctly.
    """
    seed_data = np.loadtxt("seeds_dataset.txt")
    # Last column is the type of wheat (labels 1-3). First seven are the
    # features:
    # 0: Area, 1: Perimeter, 2: Compactness, 3: Length, 4: Width, 5: Asymmetry
    # 6: Groove length
    training, test = cross_validation.train_test_split(seed_data, test_size=40)

    classes = (1, 2, 3)
    log_priors, means, variances = _fit_gaussian_nb(training, classes)

    labels = test[:, -1]
    predictions = _predict_gaussian_nb(test[:, :-1], log_priors, means,
                                       variances, classes)

    # Derive the percentage from the real test-set size instead of assuming
    # that each error is worth exactly 2.5 points.
    n_wrong = np.count_nonzero(labels != predictions)
    accuracy = 100.0 - 100.0 * n_wrong / test.shape[0]
    print("Seed data, % accuracy of my classifier: " + " " + str(accuracy))
    return accuracy

def prob2():
    """Classify the seed data with scikit-learn's GaussianNB.

    Loads "seeds_dataset.txt", holds out 40 random rows as a test set, fits
    ``GaussianNB`` on the remaining rows (last column is the label, the
    other columns are the features), and prints the test accuracy.

    Returns
    -------
    float
        Percentage of test rows that were classified correctly.
    """
    seed_data = np.loadtxt("seeds_dataset.txt")
    training, test = cross_validation.train_test_split(seed_data, test_size=40)

    nb_classifier = GaussianNB()
    nb_classifier.fit(training[:, :-1], training[:, -1])

    labels = test[:, -1]
    predictions = nb_classifier.predict(test[:, :-1])

    # Derive the percentage from the real test-set size instead of assuming
    # that each error is worth exactly 2.5 points.
    n_wrong = np.count_nonzero(labels != predictions)
    accuracy = 100.0 - 100.0 * n_wrong / test.shape[0]
    print("Seed data, % accuracy of sklearn classifier: " + " " + str(accuracy))
    return accuracy

class naiveBayes(object):
    """
    This class performs Naive Bayes classification for word-count document
    features.

    After ``fit`` has been called the instance holds
    P : ndarray of shape (k,)
        Prior probability of each class.
    p : ndarray of shape (k, n_features)
        Add-one smoothed probability of each word given each class.
    """

    def __init__(self):
        """
        Initialize a Naive Bayes classifier. All parameters are set by ``fit``.
        """

    def fit(self, X, Y):
        """
        Fit the parameters according to the labeled training data (X,Y).
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        Each row is the word-count vector for one of the documents
        Y : ndarray of shape (n_samples,)
        Gives the class label for each instance of training data. Assume class labels
        are in {0,1,...,k-1} where k is the number of classes.
        """
        # Work in floating point: integer word counts would otherwise make
        # the normalization below an integer division.
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y)

        self.n_samples, self.n_features = X.shape
        self.n_classes = len(np.unique(Y))
        # Index priors and likelihood rows by the same ordered class ids so
        # that row i of both always refers to class i.
        class_ids = range(self.n_classes)

        # prior class probabilities P(c_i)
        self.P = np.array([np.mean(Y == i) for i in class_ids])

        # (smoothed) word-class probabilities: add one to every word count,
        # then normalize each class row to sum to 1.
        counts = np.array([X[Y == i].sum(axis=0) + 1.0 for i in class_ids])
        self.p = counts / counts.sum(axis=1, keepdims=True)

    def predict(self, X):
        """
        Predict the class labels of a set of test data.
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        The test data
        Returns
        -------
        Y : ndarray of shape (n_samples,)
        Gives the classification of each row in X
        """
        X = np.asarray(X, dtype=np.float64)
        # log P(c_i) + sum_j x_j * log P(word_j | c_i), maximized over classes
        log_posterior = np.log(self.P) + X.dot(np.log(self.p).T)
        return np.argmax(log_posterior, axis=1)
    
def prob4():
    """Compare the hand-written naiveBayes class with MultinomialNB on spam data.

    Loads the feature matrix from "SpamFeatures.txt" and the labels from
    "SpamLabels.txt", holds out 500 randomly chosen rows as a test set,
    trains both classifiers on the remaining rows, and prints the test
    accuracy (in percent) of each.
    """
    features = np.loadtxt("SpamFeatures.txt")
    labels = np.loadtxt("SpamLabels.txt")

    # Split a random permutation so the 500 test rows are distinct and
    # disjoint from the training rows.  Drawing indices with randint samples
    # with replacement, which puts duplicate rows in the test set.
    shuffled = np.random.permutation(features.shape[0])
    test_indices = shuffled[:500]
    train_indices = shuffled[500:]

    train_vectors = features[train_indices]
    train_labels = labels[train_indices]

    test_vectors = features[test_indices]
    test_labels = labels[test_indices]

    nb = naiveBayes()
    nb.fit(train_vectors, train_labels)
    nb_predicted = nb.predict(test_vectors)

    my_accuracy = 100 * np.mean(nb_predicted == test_labels)
    print("Spam data, my accuracy: " + " " + str(my_accuracy))

    # Reference result from scikit-learn on the same split.
    mnb = MultinomialNB()
    mnb.fit(train_vectors, train_labels)
    mnb_predicted = mnb.predict(test_vectors)

    mnb_accuracy = 100 * np.mean(mnb_predicted == test_labels)
    print("Spam data, MultinomialNB accuracy: " + " " + str(mnb_accuracy))

# Run the exercises in order; each one prints its own accuracy report.
for run_problem in (prob1, prob2, prob4):
    run_problem()

# Output of one run (the train/test splits are random, so values vary):
# Seed data, % accuracy of my classifier:  92.5
# Seed data, % accuracy of sklearn classifier:  85.0
# Spam data, my accuracy:  96.0
# Spam data, MultinomialNB accuracy:  96.0
