Learned:
- Broadcasting: essentially, by adding a dimension to the dataset, NumPy infers the calculation and fills in the gaps. Add the dimension where you want the implicit iteration to happen: for X - u we want the difference between each feature vector and the means of all the classes, so we add the dimension to the rows (`X[:, np.newaxis, :]`), and each row becomes a 2D matrix of differences.
- NumPy has some super useful operations; there is usually a way to get around explicit iteration.
- Naive Bayes (NB) is a good intro/first stab at a problem.
- Without broadcasting, the code is easier to understand but computationally less efficient.

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

GNB

In [6]:
class GNB:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled as an independent Gaussian per class. After
    ``fit`` the instance exposes:

    - ``classes``: sorted unique labels seen during training
    - ``num_classes``: number of unique labels
    - ``means``: (num_classes, num_features) per-class feature means
    - ``vars``: (num_classes, num_features) per-class feature variances
      (population variance, plus ``var_smoothing``)
    - ``class_priors``: (num_classes,) fraction of training rows per class

    Missing values are not handled: NaNs in the inputs propagate into the
    fitted parameters and predictions.
    """

    def __init__(self, var_smoothing = 1e-9):
        """Store the constant added to every variance for numerical stability."""
        self.var_smoothing = var_smoothing

    def fit(self, X, Y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (DataFrame or ndarray).
        Y : array-like of shape (n_samples,)
            Training labels (Series or ndarray).
        """
        # Work on plain arrays so row selection is positional and does not
        # depend on the pandas indexes of X and Y being aligned.
        X_train = np.asarray(X, dtype=float)
        Y_train = np.asarray(Y).ravel()

        # identify classes and how many there are
        self.classes = np.unique(Y_train)
        self.num_classes = len(self.classes)

        # initialize means, vars, and priors
        num_features = X_train.shape[1]
        self.means = np.zeros((self.num_classes, num_features))
        self.vars = np.zeros((self.num_classes, num_features))
        self.class_priors = np.zeros(self.num_classes)

        for i, cls in enumerate(self.classes):
            # rows belonging to this class
            X_cls = X_train[Y_train == cls]

            # column-wise mean and (population) variance for this class
            self.means[i, :] = np.mean(X_cls, axis=0)
            self.vars[i, :] = np.var(X_cls, axis=0)
            # proportion of data in class / total data
            self.class_priors[i] = X_cls.shape[0] / X_train.shape[0]

        # add smoothing so a zero-variance feature cannot cause division by zero
        self.vars += self.var_smoothing

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Entries are taken from ``self.classes``.
        """
        X_pred = np.asarray(X, dtype=float)

        # work in log space for numerical stability
        # log P(Y)
        log_class_priors = np.log(self.class_priors)

        # Broadcasting: (n_samples, 1, n_features) - (num_classes, n_features)
        # gives the difference of every sample to every class mean.
        diffs = X_pred[:, np.newaxis, :] - self.means
        log_likelihoods = -0.5 * (np.log(2 * np.pi * self.vars) + diffs ** 2 / self.vars)

        # sum over features: each row is a sample, each column a class
        # log P(X|Y)
        log_likelihoods = np.sum(log_likelihoods, axis=2)

        # log posterior up to a constant that is shared by all classes
        log_posteriors = log_class_priors + log_likelihoods

        # column index of the best class per row, mapped back to the label
        return self.classes[np.argmax(log_posteriors, axis=1)]

GDA

In [7]:
class GDA:
    """Gaussian Discriminant Analysis with one covariance matrix shared by all classes.

    After ``fit`` the instance exposes:

    - ``classes``: sorted unique labels seen during training
    - ``num_classes`` / ``num_features``
    - ``means_matrix``: (num_classes, num_features) per-class means
    - ``cov_matrix``: (num_features, num_features) pooled within-class
      covariance, normalised by the number of samples, plus
      ``var_smoothing`` on the diagonal
    - ``class_priors``: (num_classes,) fraction of training rows per class
    """

    def __init__(self, var_smoothing=1e-9):
        """Store the constant added to the covariance diagonal for stability."""
        self.var_smoothing = var_smoothing

    def fit(self, X, Y):
        """Estimate class means, class priors and the shared covariance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (DataFrame or ndarray).
        Y : array-like of shape (n_samples,)
            Training labels (Series or ndarray).
        """
        X_train = np.asarray(X, dtype=float)
        Y_train = np.asarray(Y).ravel()

        self.classes = np.unique(Y_train)
        self.num_classes = len(self.classes)
        self.num_features = X_train.shape[1]

        # parameters to estimate
        self.means_matrix = np.zeros((self.num_classes, self.num_features))
        self.cov_matrix = np.zeros((self.num_features, self.num_features))
        self.class_priors = np.zeros(self.num_classes)

        # per-class mean and prior; accumulate within-class scatter
        for i, cls in enumerate(self.classes):
            X_cls = X_train[Y_train == cls]
            class_mean = np.mean(X_cls, axis=0)
            self.means_matrix[i, :] = class_mean
            self.class_priors[i] = X_cls.shape[0] / X_train.shape[0]

            centered = X_cls - class_mean
            self.cov_matrix += centered.T @ centered

        # normalize the pooled scatter, then smooth the diagonal so the
        # matrix stays invertible
        self.cov_matrix /= X_train.shape[0]
        self.cov_matrix += self.var_smoothing * np.eye(self.num_features)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores each class with the Gaussian quadratic form plus the log
        prior. The Gaussian normalisation term (which involves the
        covariance determinant) is identical for every class because the
        covariance is shared, so it is left out: it cannot change the argmax.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Entries are taken from ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        log_posteriors = np.zeros((X.shape[0], self.num_classes))

        # the inverse is shared by all classes, so compute it once
        inv_cov_matrix = np.linalg.inv(self.cov_matrix)

        # class by class, no broadcasting
        for i in range(self.num_classes):
            diff = X - self.means_matrix[i]

            # row-wise quadratic form diff_n^T @ inv_cov @ diff_n
            quadratic_form = np.einsum('ij,jk,ik->i', diff, inv_cov_matrix, diff)

            log_posteriors[:, i] = -0.5 * quadratic_form + np.log(self.class_priors[i])

        # Map the winning column index back to the original class label;
        # returning the raw index is only correct when labels are 0..K-1.
        return self.classes[np.argmax(log_posteriors, axis=1)]

testing

In [8]:
# Load the data and split it once so both models are trained and scored on
# exactly the same rows (the fixed random_state makes the split reproducible).
df = pd.read_csv("heart.csv")
X = df[['trtbps','chol']]
Y = df["output"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

my_GNB = GNB()
my_GDA = GDA()

# Same fit / predict / score procedure for every model.
for model_name, model in (("GNB", my_GNB), ("GDA", my_GDA)):
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    # accuracy_score takes (y_true, y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print(model_name, accuracy)

GNB 0.639344262295082
GDA 0.7049180327868853
