Gaussian Discriminant Analysis

Learned
- Use NumPy matrix/vector operations instead of explicit Python loops
- Code written without broadcasting is easier to understand


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
class GDA:
    """Gaussian Discriminant Analysis with a covariance matrix shared by all classes.

    Each class is modelled as a multivariate Gaussian with its own mean and
    a single pooled covariance matrix. Prediction picks the class that
    maximises ``log p(x | class) + log p(class)``.

    Attributes set by ``fit``:
        classes: sorted unique labels found in ``Y``.
        num_classes: number of distinct labels.
        num_features: number of columns in ``X``.
        means_matrix: ``(num_classes, num_features)`` per-class feature means.
        cov_matrix: ``(num_features, num_features)`` pooled covariance.
        class_priors: ``(num_classes,)`` empirical class frequencies.
    """

    def __init__(self):
        # None marks the model as "not fitted yet"; predict() checks this.
        self.classes = None
        self.num_classes = 0
        self.num_features = 0
        self.means_matrix = None
        self.cov_matrix = None
        self.class_priors = None

    def fit(self, X, Y):
        """Estimate class means, class priors and the shared covariance.

        Args:
            X: training features, a DataFrame or array-like of shape
                ``(n_samples, n_features)``.
            Y: training labels, a Series or array-like with ``n_samples``
                entries.

        Returns:
            None. The fitted parameters are stored on the instance.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        # np.asarray accepts both pandas objects and plain arrays/lists.
        self.X_train = np.asarray(X, dtype=float)
        self.Y_train = np.asarray(Y).ravel()

        if self.X_train.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        num_samples = self.X_train.shape[0]
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if self.Y_train.shape[0] != num_samples:
            raise ValueError("X and Y must contain the same number of samples")

        # identify classes and their count
        self.classes = np.unique(self.Y_train)
        self.num_classes = len(self.classes)
        self.num_features = self.X_train.shape[1]

        # initialize means, covariance and priors
        self.means_matrix = np.zeros((self.num_classes, self.num_features))
        # The shape must be passed as a tuple; a second positional argument
        # would be interpreted as the dtype.
        self.cov_matrix = np.zeros((self.num_features, self.num_features))
        self.class_priors = np.zeros(self.num_classes)

        for i, cls in enumerate(self.classes):
            # Select rows from the numpy arrays so the result does not depend
            # on pandas index alignment between X and Y.
            X_cls = self.X_train[self.Y_train == cls]
            class_mean = X_cls.mean(axis=0)
            self.means_matrix[i, :] = class_mean
            # The empirical class frequency is the prior estimate.
            self.class_priors[i] = X_cls.shape[0] / num_samples
            # Accumulate this class's scatter into the pooled covariance.
            centered = X_cls - class_mean
            self.cov_matrix += centered.T @ centered

        # normalize by the total number of training samples
        self.cov_matrix /= num_samples

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: features, a DataFrame or array-like of shape
                ``(n_samples, n_features)``. A single 1-D sample is also
                accepted and treated as one row.

        Returns:
            numpy array of shape ``(n_samples,)`` holding the predicted
            labels, taken from ``self.classes``.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if the number of features differs from training.
            numpy.linalg.LinAlgError: if the covariance matrix is singular.
        """
        if self.classes is None:
            raise RuntimeError("GDA.predict called before fit")

        self.X_pred = np.asarray(X, dtype=float)
        if self.X_pred.ndim == 1:
            self.X_pred = self.X_pred.reshape(1, -1)
        if self.X_pred.shape[1] != self.num_features:
            raise ValueError(
                f"expected {self.num_features} features, "
                f"got {self.X_pred.shape[1]}"
            )

        # the inverse of the covariance matrix
        inv_cov_matrix = np.linalg.inv(self.cov_matrix)
        # slogdet returns log|det| directly, avoiding the under/overflow that
        # log(det(...)) suffers from when there are many features.
        _, log_det_cov = np.linalg.slogdet(self.cov_matrix)

        # one row per sample, one column per class
        log_likelihoods = np.zeros((self.X_pred.shape[0], self.num_classes))

        for i in range(self.num_classes):
            # difference between every data point and the class mean
            diff = self.X_pred - self.means_matrix[i]
            # Row-wise quadratic form diff_k^T @ inv_cov @ diff_k for all k.
            quadratic_form = np.sum((diff @ inv_cov_matrix) * diff, axis=1)
            log_likelihoods[:, i] = (
                -0.5 * log_det_cov
                - 0.5 * quadratic_form
                + np.log(self.class_priors[i])
            )

        # Map the best-scoring column back to the actual class labels.
        return self.classes[np.argmax(log_likelihoods, axis=1)]

        


    
