In [6]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

In [8]:
# From Scratch ---------------

# Gaussian Naive Bayes classifier (NumPy only)
class NB:
    """Gaussian Naive Bayes classifier written from scratch with NumPy.

    Every feature is modelled, per class, as an independent univariate
    Gaussian. Prediction picks the class that maximises
    ``log p(x | class) + log p(class)``.

    Attributes set by :meth:`fit`:
        classes_: sorted unique labels, shape ``(n_classes,)``.
        means_: per-class feature means, shape ``(n_classes, n_features)``.
        vars_: per-class population variances, floored at ``epsilon_``,
            shape ``(n_classes, n_features)``.
        priors_: relative class frequencies, shape ``(n_classes,)``.
        epsilon_: smallest variance the model will use (``1e-9``).
    """

    def fit(self, x, y):
        """Estimate per-class means, variances and priors.

        Args:
            x: array-like of shape ``(n_samples, n_features)``; converted to
                a float array.
            y: array-like of shape ``(n_samples,)`` holding the class labels.

        Returns:
            The fitted estimator (``self``), so calls can be chained.

        Raises:
            ValueError: if ``x`` is not 2-D, is empty, or ``y`` does not
                provide exactly one label per row of ``x``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)

        if x.ndim != 2:
            raise ValueError(
                f"x must be 2-D (n_samples, n_features), got {x.ndim}-D input"
            )
        if x.shape[0] == 0:
            raise ValueError("cannot fit on an empty data set")
        if y.ndim != 1 or y.shape[0] != x.shape[0]:
            raise ValueError(
                "y must be 1-D with one label per row of x "
                f"(x has {x.shape[0]} rows, y has shape {y.shape})"
            )

        # identify classes and allocate arrays
        self.classes_, counts = np.unique(y, return_counts=True)
        n_classes = len(self.classes_)
        n_features = x.shape[1]

        self.means_ = np.zeros((n_classes, n_features))
        self.vars_ = np.zeros((n_classes, n_features))

        # compute mean and variance for each class
        for idx, cls in enumerate(self.classes_):
            X_c = x[y == cls]
            self.means_[idx, :] = X_c.mean(axis=0)
            self.vars_[idx, :] = X_c.var(axis=0)  # population variance

        # class prior = relative frequency of the class in the training data
        self.priors_ = counts / x.shape[0]

        # Numerical stability: floor the variances instead of only replacing
        # exact zeros. A constant feature can yield a tiny *positive*
        # variance (~1e-34) through rounding of the mean, which would pass a
        # "<= 0" check and make the likelihood blow up.
        self.epsilon_ = 1e-9
        self.vars_ = np.maximum(self.vars_, self.epsilon_)
        return self

    def _gaussian_log_prob(self, class_idx, x):
        """Return ``log p(x | class)`` for the class at ``class_idx``.

        ``x`` may be a single sample of shape ``(n_features,)`` (a scalar is
        returned) or a batch of shape ``(n_samples, n_features)`` (one value
        per sample is returned).
        """
        mean = self.means_[class_idx]
        var = self.vars_[class_idx]
        # constant term and exponent term of the product of independent
        # per-feature Gaussians, both in log space
        const = -0.5 * np.sum(np.log(2.0 * np.pi * var))
        # reduce over the feature axis, whichever rank x has
        exponent = -0.5 * np.sum(((x - mean) ** 2) / var, axis=-1)
        return const + exponent

    def predict(self, x):
        """Predict the most probable class for each sample.

        Args:
            x: array-like of shape ``(n_samples, n_features)``. A 1-D input
                of length ``n_features`` is treated as a single sample.

        Returns:
            Array of shape ``(n_samples,)`` with entries taken from
            ``classes_``.

        Raises:
            ValueError: if the number of features differs from the one seen
                in :meth:`fit` (this would otherwise broadcast silently when
                ``x`` has a single column).
        """
        x = np.asarray(x, dtype=float)
        if x.ndim == 1:
            x = x[np.newaxis, :]

        n_features = self.means_.shape[1]
        if x.ndim != 2 or x.shape[1] != n_features:
            raise ValueError(
                f"x must have shape (n_samples, {n_features}), got {x.shape}"
            )

        n_samples = x.shape[0]
        n_classes = len(self.classes_)
        log_probs = np.zeros((n_samples, n_classes))
        log_priors = np.log(self.priors_)

        # compute log posterior (up to a constant) for each class and sample:
        # log p(x|c) + log p(c)
        for idx in range(n_classes):
            log_probs[:, idx] = self._gaussian_log_prob(idx, x) + log_priors[idx]

        class_indices = np.argmax(log_probs, axis=1)
        return self.classes_[class_indices]



# Step 1: Read the Iris data (3 classes: Setosa, Versicolor, Virginica);
# column names are supplied here rather than taken from the file.
data = pandas.read_csv(
    'iris.txt',
    names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
)
# Every column except the last is a feature; the 'class' column is the label.
X = data.iloc[:, :-1].values
y = data['class'].values


# Step 2: Hold out 25% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Step 3: Build and train the classifier (fit returns the fitted instance)
NaiveBayes = NB().fit(X_train, y_train)

# Step 4: Predict labels for the held-out rows
y_pred = NaiveBayes.predict(X_test)

# Step 5: Accuracy = fraction of held-out rows whose prediction matches
accuracy = np.mean(y_test == y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 1.00
