In [1]:
import numpy as np

from dataclasses import dataclass


@dataclass
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier over a fixed training set.

    The training data is supplied at construction time; `fit` estimates a
    per-class, per-feature mean and variance, and `predict` assigns each sample
    the label with the largest (unnormalised) posterior.

    Attributes:
        features: Training samples, indexed as (sample, feature).
        labels: Training labels, one per row of `features`.
        unique_labels: Sorted distinct values of `labels` (set in `__post_init__`).
        params: Set by `fit`. One entry per label in `unique_labels`; each entry
            is a list of `(mean, variance)` tuples, one per feature.
    """

    features: np.ndarray
    labels: np.ndarray

    # Smoothing term added to the variance-dependent denominators so that a
    # zero-variance feature does not cause a division by zero.
    # Deliberately unannotated: an annotated attribute would become a dataclass field.
    _EPS = 1e-4

    def __post_init__(self) -> None:
        """Initializes additional variables for the Gaussian Naive Bayes model."""

        self.unique_labels = np.unique(self.labels)

    def fit(self) -> None:
        """Fits the model by estimating a mean and variance per (label, feature)."""

        self.params = []
        # Calculate the mean and variance of each feature for each label
        for label in self.unique_labels:
            label_features = self.features[self.labels == label]
            self.params.append([(col.mean(), col.var()) for col in label_features.T])

    def likelihood(self, data: float, mean: float, var: float) -> float:
        """Calculates the Gaussian likelihood of the data with the given mean and variance.

        Args:
            data: Observed feature value (scalars or broadcastable arrays).
            mean: Mean of the Gaussian.
            var: Variance of the Gaussian.

        Returns:
            The eps-smoothed Gaussian density evaluated at `data`.
        """

        # NOTE: eps is added to the scaled terms (2*pi*var and 2*var), not to var
        # itself; this is kept as-is so existing likelihood values do not change.
        eps = self._EPS

        coeff = 1 / np.sqrt(2 * np.pi * var + eps)
        exponent = np.exp(-((data - mean) ** 2 / (2 * var + eps)))

        return coeff * exponent

    def log_likelihood(self, data: float, mean: float, var: float) -> float:
        """Calculates the natural log of `likelihood` without forming the density.

        Working in log space avoids the floating-point underflow that occurs when
        many small densities are multiplied together.

        Args:
            data: Observed feature value (scalars or broadcastable arrays).
            mean: Mean of the Gaussian.
            var: Variance of the Gaussian.

        Returns:
            `log(likelihood(data, mean, var))`, computed term by term.
        """

        eps = self._EPS

        log_coeff = -0.5 * np.log(2 * np.pi * var + eps)
        log_exponent = -((data - mean) ** 2 / (2 * var + eps))

        return log_coeff + log_exponent

    def predict(self, features: np.ndarray) -> np.ndarray:
        """Performs inference using Bayes Rule P(Y | X) = P(X | Y) * P(Y) / P(X).

        Scores are accumulated as log-probabilities: the product of per-feature
        likelihoods underflows to 0.0 for wide inputs, which would make every
        posterior tie at zero and silently select the first label.

        Args:
            features: Samples to classify, indexed as (sample, feature), with the
                same number of features as the training data.

        Returns:
            Array holding the predicted label for each sample.

        Raises:
            AttributeError: If called with samples before `fit` has set `params`.
        """

        samples = np.asarray(features)
        if len(samples) == 0:
            # Nothing to classify
            return np.array([])

        # (n_labels, n_features, 2) -> separate mean / variance tables
        params = np.asarray(self.params, dtype=float)
        means, variances = params[:, :, 0], params[:, :, 1]

        # Priors depend only on the training labels, so compute them once
        log_priors = np.log([np.mean(self.labels == label) for label in self.unique_labels])

        # Naive assumption (independence):
        #   log P(x1, x2, x3 | Y) = log P(x1 | Y) + log P(x2 | Y) + log P(x3 | Y)
        # Broadcasting (n_samples, 1, n_features) against (n_labels, n_features)
        # gives one log-likelihood per (sample, label, feature).
        log_likelihoods = self.log_likelihood(samples[:, None, :], means, variances).sum(axis=-1)

        # Log-posterior = log-prior + log-likelihood (ignoring the scaling factor)
        log_posteriors = log_likelihoods + log_priors

        # Pick the label with the largest posterior for every sample
        return self.unique_labels[np.argmax(log_posteriors, axis=1)]

In [2]:
from sklearn.datasets import load_iris
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


# Split configuration: half of the samples are held out for evaluation, and the
# seed is fixed so the split is reproducible across runs.
TEST_SIZE = 0.5
SEED = 0

features, labels = load_iris(return_X_y=True)
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=TEST_SIZE, random_state=SEED
)

# Train on the training split and classify the held-out samples
gnb = GaussianNaiveBayes(train_features, train_labels)
gnb.fit()
predictions = gnb.predict(test_features)

# Macro averaging: metrics are computed per class and then averaged unweighted
precision, recall, fscore, _ = precision_recall_fscore_support(
    test_labels, predictions, average="macro"
)

# Each metric is printed under its own name (previously all three said "Precision")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F-score: {fscore:.3f}")
print()
print(f"Mislabeled points: {(predictions != test_labels).sum()}/{test_features.shape[0]}")

Precision: 0.961
Recall: 0.944
F-score: 0.949

Mislabeled points: 4/75
