## Naive Bayes

In [41]:
import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal

### Separate Data by Class

In [136]:
def separate_by_class(X, y):
    '''
    Separate the training set ("X") by class value ("y")
    so that we can calculate statistics for each class.

    Any set of class labels is supported (non-contiguous integers,
    floats, strings, ...), not only labels numbered 0..n-1. The rows
    returned keep the dtype of "X"; labels are never mixed into the
    attribute data.

    Parameters
    ----------
    X: A 2d numpy array with one row per instance.
    y: A 1d numpy array with one class label per row of "X".

    Returns
    -------
    A dictionary of 2d numpy arrays. Each distinct label in "y" maps to
    the rows of "X" carrying that label. Keys are inserted in order of
    first occurrence in "y"; rows keep their original relative order.

    Raises
    ------
    ValueError if "X" and "y" do not contain the same number of instances.
    '''

    X = np.asarray(X)
    y = np.asarray(y).ravel()
    if X.ndim == 1:
        # A flat vector is treated as a single attribute column, so the
        # values of the result are always 2d.
        X = X[:, np.newaxis]
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            'X and y must have the same number of instances '
            '(got %d and %d)' % (X.shape[0], y.shape[0])
        )

    # np.unique sorts the labels; reorder them by first occurrence so the
    # dictionary is filled in the order the labels appear in the data.
    labels, first_seen = np.unique(y, return_index=True)
    separated = {}
    for label in labels[np.argsort(first_seen)]:
        separated[label] = X[y == label]
    return separated

In [137]:
# Check separate_by_class on a three-row toy set with labels 0 and 1.
X_t = np.array([(2, 21), (1, 20), (3, 22)])
y_t = np.array((1, 0, 1))
separated_t = separate_by_class(X=X_t, y=y_t)
assert_array_equal(separated_t[0], [[1, 20]])
assert_array_equal(separated_t[1], [[2, 21], [3, 22]])

### Calculate Mean

In [140]:
def calculate_mean(array):
    '''
    Average the input along its first axis, i.e. one mean per attribute
    (column) when the rows are instances.

    Parameters
    ----------
    array: A 1d or 2d numpy array

    Returns
    -------
    A scalar for 1d input, or a 1d numpy array holding one mean
    per column for 2d input.
    '''

    return np.mean(array, axis=0)

In [141]:
# Column means of a 3x3 array should be 2, 5 and 7.
array_t = np.array([(1, 4, 7), (2, 5, 6), (3, 6, 8)])
mean_t = calculate_mean(array=array_t)
assert_array_equal(mean_t, [2.0, 5.0, 7.0])

### Calculate Standard Deviation

In [142]:
def calculate_stdev(array):
    '''
    Sample standard deviation (divisor N-1) along the first axis,
    i.e. one value per attribute (column) when the rows are instances.

    Parameters
    ----------
    array: A 1d or 2d numpy array

    Returns
    -------
    A scalar for 1d input, or a 1d numpy array holding one standard
    deviation per column for 2d input.
    '''

    # ddof=1 selects the N-1 (sample) estimator instead of numpy's default N.
    return np.std(array, ddof=1, axis=0)

In [143]:
# Every column steps by exactly 1, so each sample standard deviation is 1.
array_t = np.array([(1, 20, 14), (2, 21, 15), (3, 22, 16)])
stdev_t = calculate_stdev(array=array_t)
assert_array_equal(stdev_t, [1.0, 1.0, 1.0])

In [144]:
# Toy data set: three instances (rows) with two attributes (columns).
X_t = np.array([(1, 20), (2, 21), (3, 22)])

### Summarize Data Set

In [176]:
def summarize(X):
    '''
    Build the per-attribute summary of a set of instances
    (typically all instances of one class).

    Parameters
    ----------
    X: A 2d numpy array, one row per instance and one column per attribute

    Returns
    -------
    A 2d numpy array with one row per attribute; each row is
    (mean, standard deviation) of that attribute.
    '''

    attribute_means = calculate_mean(X)
    attribute_stdevs = calculate_stdev(X)
    # Stacking along axis 1 pairs each attribute's mean with its stdev.
    return np.stack((attribute_means, attribute_stdevs), axis=1)

In [177]:
# Both columns have stdev 1; their means are 2 and 21.
X_t = np.array([(1, 20), (2, 21), (3, 22)])
summary_t = summarize(X=X_t)
assert_array_equal(summary_t, [[2.0, 1.0], [21.0, 1.0]])

In [178]:
# Four instances with two attributes each, alternating between classes 1 and 0.
X_t = np.array([(1, 20), (2, 21), (3, 22), (4, 22)])
y_t = np.array((1, 0, 1, 0))

In [180]:
# Group the toy instances by class label.
new = separate_by_class(X=X_t, y=y_t)

In [182]:
# Show the (mean, stdev) summary for the instances of class 0.
summarize(X=new[0])

array([[  3.        ,   1.41421356],
       [ 21.5       ,   0.70710678]])

### Summarize Attributes By Class

In [190]:
def summarize_by_class(X, y):
    '''
    Separates a training set into instances grouped by class.
    It then calculates the summaries for each attribute.

    The result is keyed by the class labels themselves, so labels do not
    have to be the integers 0..n-1.

    Parameters
    ----------
    X: A 2d numpy array. Represents training attributes.
    y: A 1d numpy array. Represents class labels.

    Returns
    -------
    A dictionary of 2d numpy arrays. Uses each class label (as returned
    by separate_by_class) as keys and the summary for that class label,
    one (mean, standard deviation) row per attribute, as values.
    '''

    separated = separate_by_class(X, y)
    # Iterate over the actual keys instead of range(len(...)), which would
    # only be correct when the labels happen to be 0, 1, ..., n-1.
    return {label: summarize(rows) for label, rows in separated.items()}

In [191]:
# Class 0 holds rows (2, 21) and (4, 22); class 1 holds rows (1, 20) and (3, 22).
X_t = np.array([(1, 20), (2, 21), (3, 22), (4, 22)])
y_t = np.array((1, 0, 1, 0))
summaries_t = summarize_by_class(X=X_t, y=y_t)
assert_array_almost_equal(summaries_t[0], [[3.0, 1.41421356], [21.5, 0.70710678]])
assert_array_almost_equal(summaries_t[1], [[2.0, 1.41421356], [21.0, 1.41421356]])

### Calculate Log of Gaussian Probability Density Function

In [272]:
def calculate_log_probability(x, mean, stdev):
    '''
    Calculates log of Gaussian function to estimate
    the log probability of a given attribute value.
    Assume x, mean, stdev have the same length (or broadcast together).

    The log-density is evaluated in closed form,

        log N(x; mean, stdev) = -log(sqrt(2*pi)) - log(stdev)
                                - (x - mean)**2 / (2 * stdev**2)

    rather than as log(exp(...)), so values far from the mean give a
    large negative number instead of underflowing to -inf.

    Parameters
    ----------
    x: A float or 1d numpy array
    mean: A float or 1d numpy array
    stdev: A float or 1d numpy array

    Returns
    -------
    A float or 1d numpy array
    '''

    x = np.asarray(x, dtype=float)
    mean = np.asarray(mean, dtype=float)
    stdev = np.asarray(stdev, dtype=float)

    squared_distance = np.square(x - mean) / (2.0 * np.square(stdev))
    log_normaliser = 0.5 * np.log(2.0 * np.pi) + np.log(stdev)
    return -log_normaliser - squared_distance

In [288]:
# Single value, then an element-wise evaluation over two attributes.
array_t = calculate_log_probability(x=np.array([71.5]), mean=np.array([73]), stdev=np.array([6.2]))
assert_array_almost_equal(array_t, [-2.7727542144336588])

array_t2 = calculate_log_probability(x=np.array([1, 2]), mean=np.array([3, 4]), stdev=np.array([5, 6]))
assert_array_almost_equal(array_t2, [-2.60837645, -2.76625356])

### Calculate Class Probabilities

In [326]:
def calculate_class_log_probabilities(summaries, input_array):
    '''
    Combines the probabilities of all of the attribute values for a data instance
    and comes up with a probability of the entire data instance belonging to the class.

    For every class, attribute k of each instance is scored against the
    (mean, stdev) in row k of that class's summary, and the per-attribute
    log probabilities are summed over the attributes of the instance.

    Parameters
    ----------
    summaries: A dictionary of 2d numpy arrays. Each value has one
        (mean, stdev) row per attribute; keys are the class labels.
    input_array: A numpy array of instances; each instance is a numpy array of attributes.
        A 1d array is treated as instances that have a single attribute.

    Returns
    -------
    A dictionary of 1d numpy arrays of summed log probabilities,
    keyed like "summaries", with one entry per instance.

    Raises
    ------
    ValueError if the number of attributes per instance does not match
    the number of rows in a class summary.
    '''

    instances = np.asarray(input_array, dtype=float)
    if instances.ndim == 1:
        instances = instances[:, np.newaxis]

    probabilities = {}
    for class_value, class_summaries in summaries.items():
        class_summaries = np.asarray(class_summaries, dtype=float)
        means = class_summaries[:, 0]
        stdevs = class_summaries[:, 1]
        if instances.shape[1] != means.shape[0]:
            raise ValueError(
                'instances have %d attribute(s) but the summary for class %r '
                'describes %d' % (instances.shape[1], class_value, means.shape[0])
            )
        # Broadcasting scores column k of every instance against attribute k's
        # (mean, stdev); the result has shape (n_instances, n_attributes).
        per_attribute = calculate_log_probability(instances, means, stdevs)
        probabilities[class_value] = np.sum(per_attribute, axis=1)
    return probabilities

In [327]:
# Two classes, each described by one attribute: (mean, stdev).
summaries_t = {0: np.array([[1, 0.5]]), 1: np.array([[20, 5.0]])}
input_t = np.array([[1.1]])
log_probabilities = calculate_class_log_probabilities(summaries=summaries_t, input_array=input_t)
assert_array_almost_equal(log_probabilities[0], [-0.24579135264472743])
assert_array_almost_equal(log_probabilities[1], [-9.6725764456387715])

# Three instances at once: one log probability per instance and class.
input_t2 = np.array([[4], [.9], [0]])
log_probabilities2 = calculate_class_log_probabilities(summaries=summaries_t, input_array=input_t2)
assert_array_almost_equal(log_probabilities2[0], [-18.225791352644727, -0.24579135264472729, -2.2257913526447273])
assert_array_almost_equal(log_probabilities2[1], [-7.6483764456387728, -9.8245764456387743, -10.528376445638774])

### Make Predictions

In [469]:
def predict(summaries, input_array):
    '''
    Calculates the probability of each data instance belonging to each class value,
    looks for the largest probability, and return the associated class.

    Only numpy is used, so the function does not depend on pandas being
    imported somewhere else in the session.

    Parameters
    ----------
    summaries: A dictionary of numpy arrays, keyed by class label
    input_array: A numpy array of instances; each instance is a numpy array of attributes

    Returns
    -------
    A 1d numpy array with the predicted class label of every instance.
    When several classes tie, the one that comes first in "summaries" wins.
    An empty array is returned when "summaries" is empty.
    '''

    log_probabilities = calculate_class_log_probabilities(summaries, input_array)
    if not log_probabilities:
        return np.array([])

    labels = np.array(list(log_probabilities))
    # One row per instance, one column per class (same order as "labels").
    scores = np.column_stack([log_probabilities[label] for label in log_probabilities])
    # A NaN score (e.g. from a zero stdev) must never be picked as the maximum.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    return labels[np.argmax(scores, axis=1)]

In [472]:
# Class 0 is centred at 1 (stdev 0.5), class 1 at 20 (stdev 5).
summaries_t = {0: np.array([[1, 0.5]]), 1: np.array([[20, 5.0]])}
input_t1 = np.array([[1.1]])
result_t1 = predict(summaries=summaries_t, input_array=input_t1)
assert_array_equal(result_t1, [0.0])

test_set_t2 = np.array([[1.1], [19.1]])
result_t2 = predict(summaries=summaries_t, input_array=test_set_t2)
assert_array_equal(result_t2, [0.0, 1.0])

test_set_t3 = np.array([[4], [.9], [0]])
result_t3 = predict(summaries=summaries_t, input_array=test_set_t3)
assert_array_equal(result_t3, [1.0, 0.0, 0.0])