In [1]:
from collections import Counter, defaultdict
import numpy as np
from scipy import misc, linalg
from imp import reload
from labfuns import *
import random
from math import log

In [2]:
def mlParams(X, labels, W=None):
    """
    param X: N x d matrix of N data points
    param labels: N vector of class labels
    return: mu - C x d matrix of class means (mu[i] - class i mean)
    return: sigma - C x d x d matrix of class covariances (sigma[i] - class i sigma)
    """
    assert(X.shape[0]==labels.shape[0])
    Npts,Ndims = np.shape(X)
    classes = np.unique(labels)
    Nclasses = np.size(classes)

    if W is None:
        W = np.ones((Npts,1))/float(Npts)

    mu = np.zeros((Nclasses,Ndims))
    sigma = np.zeros((Nclasses,Ndims,Ndims))

    for class_idx, class_ in enumerate(classes):
        idx = np.where(labels==class_)[0] # Extract the indices for which y==class is true,
        xlc = X[idx,:]
        mu[class_idx] = xlc.sum(axis=0) / len(xlc)
        for dim_idx in range(Ndims):
            var = 0
            for class_vect in xlc:
                var += (class_vect[dim_idx] - mu[class_idx][dim_idx])**2
            var += var / len(xlc)
            sigma[class_idx, dim_idx, dim_idx] = var

    return mu, sigma

In [3]:
def computePrior(labels, W=None):
    """
    param labels: N vector of class labels.
    return: prior - C x 1 vector of class priors
    """
    Npts = labels.shape[0]
    if W is None:
        W = np.ones((Npts,1))/Npts
    else:
        assert(W.shape[0] == Npts)
    classes = np.unique(labels)
    Nclasses = np.size(classes)

    prior = np.zeros((Nclasses,1))
    
    for class_idx, class_ in enumerate(classes):
        prior[class_idx] = np.where(labels==class_)[0].shape[0]/Npts

    return prior

In [4]:
def classifyBayes(X, prior, mu, sigma):
    """
    param X: N x d matrix of M data points
    param prior: C x 1 matrix of class priors
    param mu: C x d matrix of class means (mu[i] - class i mean)
    param sigma: C x d x d matrix of class covariances (sigma[i] - class i sigma)
    returns: h N vector of class predictions for test points
    """

    Npts = X.shape[0]
    Nclasses,Ndims = np.shape(mu)
    logProb = np.zeros((Nclasses, Npts))   
    
    for data_idx, data_vector in enumerate(X):
        prediction_vector = []
        for class_idx, (class_prior, class_mu, class_sigma) in enumerate(zip(prior, mu, sigma)):
            first_term = -0.5*(log(abs(class_sigma.diagonal().prod()))) 
            third_term = log(class_prior)
            inverse_sum_term = 0
            for vector_value, diag_value, mu_value in zip(data_vector, class_sigma.diagonal(), class_mu):
                inverse_sum_term += (vector_value - mu_value)**2*(1/2*diag_value)
            discriminant_function = first_term - inverse_sum_term + third_term
            logProb[class_idx, data_idx] = discriminant_function

    h = np.argmax(logProb,axis=0)
          
    return h

In [5]:
X_iris, labels_iris, pcadim_iris = fetchDataset('iris')
X_vowel, labels_vowel, pcadim_vowel = fetchDataset('vowel')

In [6]:
mu_iris, sigma_iris = mlParams(X_iris, labels_iris)
mu_vowel, sigma_vowel = mlParams(X_vowel, labels_vowel)


In [7]:
prior_iris = computePrior(labels_iris)
prior_vowel = computePrior(labels_vowel)

In [8]:
predictions_iris = classifyBayes(X_iris,prior_iris, mu_iris, sigma_iris)
predictions_vowel = classifyBayes(X_vowel ,prior_vowel, mu_vowel, sigma_vowel)

In [10]:
def recall_accuracy(predictions, labels):
    ground_truth_and_prediction_by_class = defaultdict(list)
    correct_count_by_class = defaultdict(int)
    prediction_and_ground_truth_by_class = defaultdict(list)
    correct_count_by_prediction_class = defaultdict(int) 

    for prediction_idx, prediction in enumerate(predictions):
    #ground_truth_and_prediction_by_class[ground_truth].append(predictions[ground_truth_idx])
        prediction_and_ground_truth_by_class[prediction].append((prediction, labels[prediction_idx]))
        if prediction == labels[prediction_idx]:
            correct_count_by_prediction_class[prediction] += 1

    for ground_truth_idx, ground_truth in enumerate(labels):
    #ground_truth_and_prediction_by_class[ground_truth].append(predictions[ground_truth_idx])
        ground_truth_and_prediction_by_class[ground_truth].append((ground_truth,predictions[ground_truth_idx]))
        if ground_truth == predictions[ground_truth_idx]:
            correct_count_by_class[ground_truth] += 1

    print(correct_count_by_prediction_class)
    #accuracy
    for class_val, class_corr_count in correct_count_by_prediction_class.items():
        print(class_corr_count / len(prediction_and_ground_truth_by_class[class_val]))
    
    print(correct_count_by_class)
    #recall
    for class_val, class_corr_count in correct_count_by_class.items():
        print(class_corr_count / len(ground_truth_and_prediction_by_class[class_val]))

#test==labels_iris
#counter = Counter(test==labels_iris)    

In [11]:
# every time I predicted a 0 92% of the times was correct, 76% was correct when class 1 was predicted 
# 90% of the times class 2 was predicted was correct.
recall_accuracy(predictions_iris, labels_iris)

defaultdict(<class 'int'>, {0: 50, 1: 42, 2: 37})
0.9259259259259259
0.7636363636363637
0.9024390243902439
defaultdict(<class 'int'>, {0: 50, 1: 42, 2: 37})
1.0
0.84
0.74


In [159]:
ground_truth_and_prediction_by_class = defaultdict(list)
correct_count_by_class = defaultdict(int)
prediction_and_ground_truth_by_class = defaultdict(list)
correct_count_by_prediction_class = defaultdict(int) 

In [160]:
for prediction_idx, prediction in enumerate(predictions):
    #ground_truth_and_prediction_by_class[ground_truth].append(predictions[ground_truth_idx])
    prediction_and_ground_truth_by_class[prediction].append((prediction,labels_iris[prediction_idx]))
    if prediction == labels_iris[prediction_idx]:
        correct_count_by_prediction_class[prediction] += 1

In [151]:
for ground_truth_idx, ground_truth in enumerate(labels_iris):
    #ground_truth_and_prediction_by_class[ground_truth].append(predictions[ground_truth_idx])
    ground_truth_and_prediction_by_class[ground_truth].append((ground_truth,predictions[ground_truth_idx]))
    if ground_truth == predictions[ground_truth_idx]:
        correct_count_by_class[ground_truth] += 1

In [161]:
#test==labels_iris
#counter = Counter(test==labels_iris)
print(correct_count_by_prediction_class)
#accuracy
for class_val, class_corr_count in correct_count_by_prediction_class.items():
    print(class_corr_count / len(prediction_and_ground_truth_by_class[class_val]))
    


defaultdict(<class 'int'>, {0: 50, 1: 42, 2: 37})
0.9259259259259259
0.7636363636363637
0.9024390243902439


In [157]:
print(correct_count_by_class)
#recall
for class_val, class_corr_count in correct_count_by_class.items():
    print(class_corr_count / len(ground_truth_and_prediction_by_class[class_val]))

defaultdict(<class 'int'>, {0: 50, 1: 42, 2: 37})
1.0
0.84
0.74
