Gaussian Naive Bayes

Learned:
- Broadcasting: by adding a dimension to the dataset, NumPy infers the element-wise calculation and fills in the gaps. Add the new axis where you would otherwise iterate: for X - u we want the difference between each feature vector and the means of all the classes, so we add an axis after the row axis (`X[:, np.newaxis, :]`); each row then becomes a 2D matrix of differences (classes x features).
- NumPy has some very useful vectorized operations; there is usually a way to avoid explicit iteration.
- Naive Bayes is a good introductory / first-pass approach to a problem.

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
class GNB:
    """Gaussian Naive Bayes classifier implemented with NumPy broadcasting.

    Each feature is modelled as an independent Gaussian per class. Training
    estimates per-class feature means, variances and class priors; prediction
    picks the class with the highest log-posterior.

    Parameters
    ----------
    var_smoothing : int or float, default 1e-9
        Non-negative constant added to every per-class feature variance so
        that a zero-variance feature does not cause a division by zero.

    Raises
    ------
    TypeError
        If ``var_smoothing`` is not a real number.
    ValueError
        If ``var_smoothing`` is negative.
    """

    def __init__(self, var_smoothing=1e-9):
        # bool is a subclass of int, so reject it explicitly.
        if isinstance(var_smoothing, bool) or not isinstance(
            var_smoothing, (int, float, np.integer, np.floating)
        ):
            raise TypeError("Not valid float")
        if var_smoothing < 0:
            raise ValueError("var_smoothing must be non-negative")
        self.var_smoothing = float(var_smoothing)

    def fit(self, X, Y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (DataFrame, ndarray or nested list).
        Y : array-like of shape (n_samples,)
            Class label for each row of ``X``.

        Returns
        -------
        GNB
            The fitted estimator, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            the number of labels.
        """
        # Work on plain arrays from here on: rows are matched to labels by
        # position, never by pandas index alignment.
        features = np.asarray(X, dtype=float)
        labels = np.asarray(Y).ravel()

        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and Y must contain the same number of samples")
        if features.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset")

        self.X_train = features
        self.Y_train = labels

        # Sorted unique labels plus how many samples carry each one.
        self.classes, class_counts = np.unique(labels, return_counts=True)
        self.num_classes = len(self.classes)

        n_features = features.shape[1]
        self.means = np.zeros((self.num_classes, n_features))
        self.vars = np.zeros((self.num_classes, n_features))

        for i, cls in enumerate(self.classes):
            X_cls = features[labels == cls]
            # Column-wise statistics of the samples belonging to this class.
            self.means[i, :] = np.mean(X_cls, axis=0)
            # Smoothing keeps the variance strictly positive when
            # var_smoothing > 0, which protects the division in predict().
            self.vars[i, :] = np.var(X_cls, axis=0) + self.var_smoothing

        # Prior = fraction of the training samples that fall in each class.
        self.class_priors = class_counts / features.shape[0]
        return self

    def predict(self, X):
        """Predict the most probable class for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify; must have the same number of features as
            the data passed to ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from ``self.classes``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        samples = np.asarray(X, dtype=float)
        if samples.ndim != 2 or samples.shape[1] != self.means.shape[1]:
            raise ValueError(
                "X must be 2-dimensional with the same number of features "
                "as the training data"
            )
        self.X_pred = samples

        # Work in log space for numerical stability.
        log_class_priors = np.log(self.class_priors)

        # The Gaussian normalisation term depends only on the class, so it is
        # computed once per class instead of once per (sample, class) pair.
        log_norm = -0.5 * np.sum(np.log(2 * np.pi * self.vars), axis=1)

        # Broadcasting: (n, 1, d) - (k, d) -> (n, k, d) differences between
        # every sample and every class mean.
        diffs = samples[:, np.newaxis, :] - self.means
        # Sum the per-feature terms -> (n, k) log P(X | Y).
        log_likelihoods = log_norm - 0.5 * np.sum(diffs ** 2 / self.vars, axis=2)

        # Unnormalised log-posterior: log P(Y) + log P(X | Y).
        log_posteriors = log_class_priors + log_likelihoods

        # Column index of the best class per row, mapped back to its label.
        return self.classes[np.argmax(log_posteriors, axis=1)]

Testing

In [26]:
# Smoke test: fit the classifier on two columns of heart.csv and report the
# hold-out accuracy on a fixed 80/20 split.
my_GNB = GNB()
df = pd.read_csv("heart.csv")
X = df[["trtbps", "chol"]]
Y = df["output"]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
my_GNB.fit(X_train, Y_train)
Y_pred = my_GNB.predict(X_test)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the value
# is unchanged, but the documented order keeps the call correct if the metric
# is ever swapped for an asymmetric one.
accuracy = accuracy_score(Y_test, Y_pred)
print(accuracy)

0.639344262295082
