## Naive Bayes algorithm implementation with multinomial density function from scratch.

## Process:

1. Load the dataset.
2. Split the dataset into ten equal parts. These will be used for the ten iterations of the 10-fold cross validation.

3. For each iteration of the cross validation, select one of the ten parts to be the test set and the remaining nine parts to be the training set.

4. For each class (hobby) in the dataset, calculate the prior probability by dividing the number of instances in the training set that belong to that class by the total number of instances in the training set.

5. For each feature (age, education, and job status), calculate the likelihood of that feature given each class by counting the number of instances in the training set that belong to that class and have that feature, and dividing by the total number of instances in the training set that belong to that class.

6. Use Bayes' theorem to calculate the posterior probability of each class given the features in the test set.

7. Classify each instance in the test set as the class with the highest posterior probability.

8. Calculate the accuracy of the classification by comparing the predicted class to the actual class.

9. Repeat steps 3-8 for each iteration of the cross validation, and average the accuracy across all iterations to get the final accuracy score.

### Hayes-Roth Dataset

In [None]:
import numpy as np
from sklearn.model_selection import KFold

def multinomial_nb_fit(X_train, y_train, classes, alpha=1.0):
    """Estimate multinomial Naive Bayes parameters from one training fold.

    Args:
        X_train: 2-D array of non-negative feature values (rows = instances).
        y_train: 1-D array of class labels aligned with ``X_train``.
        classes: 1-D array of every class label the model must score.
        alpha: additive (Laplace) smoothing constant for the likelihoods.

    Returns:
        ``(log_priors, log_likelihoods)`` where ``log_priors`` has shape
        ``(n_classes,)`` and ``log_likelihoods`` has shape
        ``(n_classes, n_features)``.
    """
    n_features = X_train.shape[1]
    log_priors = np.empty(len(classes))
    log_likelihoods = np.empty((len(classes), n_features))
    for i, c in enumerate(classes):
        X_c = X_train[y_train == c]
        # The prior is estimated on the training fold only, so no test
        # instance leaks into the model.  A class missing from the fold gets
        # prior 0, i.e. log-prior -inf, and is therefore never predicted.
        with np.errstate(divide='ignore'):
            log_priors[i] = np.log(len(X_c) / len(y_train))
        # Smoothed share of the class's total feature mass held by each feature.
        log_likelihoods[i] = np.log(
            (X_c.sum(axis=0) + alpha) / (X_c.sum() + alpha * n_features)
        )
    return log_priors, log_likelihoods


def multinomial_nb_predict(X_test, classes, log_priors, log_likelihoods):
    """Return the most probable class label for every row of ``X_test``.

    The unnormalised log-posterior of class ``j`` for row ``x`` is
    ``log_priors[j] + sum_f x[f] * log_likelihoods[j, f]``; the sum over
    features is evaluated for all rows and classes with one matrix product.
    """
    log_probs = log_priors + X_test @ log_likelihoods.T
    return classes[np.argmax(log_probs, axis=1)]


if __name__ == '__main__':
    # Load the Hayes-Roth dataset
    data = np.loadtxt('/content/hayes-roth.data', delimiter=',')

    # Split the data into features and labels
    # NOTE(review): every column except the last is treated as a count
    # feature -- confirm the file does not start with an instance-ID column.
    X = data[:, :-1]
    y = data[:, -1]

    # Define the number of folds for cross-validation
    k = 10

    # Define the number of features and classes
    classes = np.unique(y)
    n_features = X.shape[1]
    n_classes = len(classes)

    # Split the data into k folds for cross-validation
    kf = KFold(n_splits=k, shuffle=True)

    # Iterate over the folds
    scores = []
    for train_index, test_index in kf.split(X):
        # Split the data into training and testing sets
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit priors and likelihoods on the training fold only
        log_priors, log_likelihoods = multinomial_nb_fit(X_train, y_train, classes)

        # Predict the class with the highest log probability
        y_pred = multinomial_nb_predict(X_test, classes, log_priors, log_likelihoods)

        # Compute the accuracy of the predictions
        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)

    # Print the average accuracy over all the folds
    print('Average accuracy: ', (sum(scores)/len(scores))*100)


Average accuracy:  44.010989010989015


### Car Evaluation Dataset

In [None]:
import numpy as np

def categorical_nb_fit(X_train, y_train, classes, feature_values, alpha=1.0):
    """Estimate categorical Naive Bayes parameters from one training fold.

    Args:
        X_train: 2-D array of categorical feature values.
        y_train: 1-D array of class labels aligned with ``X_train``.
        classes: 1-D array of every class label the model must score.
        feature_values: one sorted 1-D array per feature listing the
            categories that feature can take.  Categories absent from the
            training fold still receive a smoothed, non-zero probability.
        alpha: additive (Laplace) smoothing constant.

    Returns:
        ``(log_priors, log_likelihoods)``: ``log_priors`` has shape
        ``(n_classes,)``; ``log_likelihoods[m]`` has shape
        ``(n_classes, len(feature_values[m]))``.
    """
    log_priors = np.empty(len(classes))
    log_likelihoods = [
        np.empty((len(classes), len(values))) for values in feature_values
    ]
    for j, c in enumerate(classes):
        X_c = X_train[y_train == c]
        # A class missing from the fold gets prior 0 (log-prior -inf) and is
        # therefore never predicted.
        with np.errstate(divide='ignore'):
            log_priors[j] = np.log(len(X_c) / len(y_train))
        for m, values in enumerate(feature_values):
            counts = np.array([np.sum(X_c[:, m] == value) for value in values])
            log_likelihoods[m][j] = np.log(
                (counts + alpha) / (len(X_c) + alpha * len(values))
            )
    return log_priors, log_likelihoods


def categorical_nb_predict(X_test, classes, feature_values, log_priors, log_likelihoods):
    """Return the most probable class label for every row of ``X_test``.

    Raises:
        ValueError: if a test row holds a category that is not listed in
            the corresponding ``feature_values`` entry.
    """
    log_probs = np.tile(log_priors, (len(X_test), 1))
    for m, values in enumerate(feature_values):
        column = X_test[:, m]
        # ``values`` is sorted, so searchsorted maps each category to the
        # column of the likelihood table that was built for it.
        idx = np.clip(np.searchsorted(values, column), 0, len(values) - 1)
        if not np.all(values[idx] == column):
            raise ValueError('unknown category in feature %d' % m)
        log_probs += log_likelihoods[m][:, idx].T
    return classes[np.argmax(log_probs, axis=1)]


if __name__ == '__main__':
    # Load the car evaluation dataset
    data = np.genfromtxt('car.data', delimiter=',', dtype=str)

    # Define the number of folds for cross-validation
    k = 10

    # Split the data into features and labels
    X = data[:, :-1]
    y = data[:, -1]

    # Define the number of features and classes
    classes = np.unique(y)
    n_features = X.shape[1]
    n_classes = len(classes)

    # Category vocabulary of each feature (np.unique returns it sorted).
    # Only the set of possible values is taken from the full data; all
    # probabilities are estimated per training fold below.
    feature_values = [np.unique(X[:, m]) for m in range(n_features)]

    # Split the data into k folds for cross-validation.  No loop in this cell
    # reuses the name ``k``, so the fold count really is the value set above.
    fold_indices = np.array_split(np.random.permutation(len(y)), k)

    # Iterate over the folds
    scores = []
    for i in range(k):
        # Split the data into training and testing sets
        test_indices = fold_indices[i]
        train_indices = np.concatenate([fold_indices[j] for j in range(k) if j != i])
        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        # Fit priors and likelihoods on the training fold only
        log_priors, log_likelihoods = categorical_nb_fit(
            X_train, y_train, classes, feature_values
        )

        # Predict the class with the highest log probability
        y_pred = categorical_nb_predict(
            X_test, classes, feature_values, log_priors, log_likelihoods
        )

        # Compute the accuracy of the predictions
        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)

    # Print the average accuracy over all the folds
    print('Average accuracy:', np.mean(scores)*100)


Average accuracy: 87.15277777777779


### Breast Cancer Dataset

In [None]:
import numpy as np

def gaussian_nb_fit(X_train, y_train, classes, min_std=1e-4):
    """Estimate Gaussian Naive Bayes parameters from one training fold.

    Args:
        X_train: 2-D array of numeric feature values.
        y_train: 1-D array of class labels aligned with ``X_train``.
        classes: 1-D array of every class label the model must score.
        min_std: lower bound applied to each per-class standard deviation so
            a constant feature does not produce a division by zero.

    Returns:
        ``(log_priors, means, stds)``: ``log_priors`` has shape
        ``(n_classes,)``; ``means`` and ``stds`` have shape
        ``(n_classes, n_features)``.
    """
    n_features = X_train.shape[1]
    log_priors = np.empty(len(classes))
    means = np.zeros((len(classes), n_features))
    stds = np.full((len(classes), n_features), float(min_std))
    for j, c in enumerate(classes):
        X_c = X_train[y_train == c]
        # A class missing from the fold gets prior 0 (log-prior -inf) and is
        # therefore never predicted; its mean/std keep their placeholders.
        with np.errstate(divide='ignore'):
            log_priors[j] = np.log(len(X_c) / len(y_train))
        if len(X_c):
            means[j] = X_c.mean(axis=0)
            stds[j] = np.maximum(X_c.std(axis=0), min_std)
    return log_priors, means, stds


def gaussian_nb_predict(X_test, classes, log_priors, means, stds):
    """Return the most probable class label for every row of ``X_test``.

    The Gaussian log-density is evaluated analytically,
    ``-0.5 * z**2 - log(std) - 0.5 * log(2 * pi)``, rather than as
    ``log(exp(...))``: the latter underflows to ``log(0) = -inf`` for points
    far from the mean, which collapses the class scores into ties.
    """
    # Broadcast to shape (n_test, n_classes, n_features).
    z = (X_test[:, None, :] - means[None, :, :]) / stds[None, :, :]
    log_density = -0.5 * z ** 2 - np.log(stds)[None, :, :] - 0.5 * np.log(2 * np.pi)
    log_probs = log_priors + log_density.sum(axis=2)
    return classes[np.argmax(log_probs, axis=1)]


if __name__ == '__main__':
    # Load the breast cancer dataset
    # NOTE(review): genfromtxt turns non-numeric fields into NaN, and the
    # accuracy recorded for this cell (~0.18%) suggests the last column may
    # not be a discrete class label -- confirm the layout of the CSV.
    data = np.genfromtxt('breast-cancer.csv', delimiter=',', skip_header=True)

    # Define the number of folds for cross-validation
    k = 10

    # Split the data into features and labels
    X = data[:, :-1]
    y = data[:, -1]

    # Define the number of features and classes
    classes = np.unique(y)
    n_features = X.shape[1]
    n_classes = len(classes)

    # Split the data into k folds for cross-validation
    fold_indices = np.array_split(np.random.permutation(len(y)), k)

    # Iterate over the folds
    scores = []
    for i in range(k):
        # Split the data into training and testing sets
        test_indices = fold_indices[i]
        train_indices = np.concatenate([fold_indices[j] for j in range(k) if j != i])
        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        # Fit priors, means and standard deviations on the training fold only
        log_priors, means, stds = gaussian_nb_fit(X_train, y_train, classes)

        # Predict the class with the highest log probability
        y_pred = gaussian_nb_predict(X_test, classes, log_priors, means, stds)

        # Compute the accuracy of the predictions
        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)

    # Print the average accuracy over all the folds
    print('Average accuracy:', np.mean(scores)*100)


  feature_prob += np.log(np.exp(-0.5 * ((x[m] - means[m, j]) ** 2)) / 0.0001)
  feature_prob += np.log(np.exp(-0.5 * ((x[m] - means[m, j]) ** 2) / (stds[m, j] ** 2)) / (np.sqrt(2 * np.pi) * stds[m, j]))


Average accuracy: 0.17543859649122806


### References:
1. https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
2. https://machinelearningmastery.com/k-fold-cross-validation/

Datasets:
- Hayes-Roth Dataset (https://archive.ics.uci.edu/ml/datasets/Hayes-Roth)
- Car Evaluation Dataset (https://archive.ics.uci.edu/ml/datasets/Car+Evaluation)
- Breast Cancer Dataset (https://archive.ics.uci.edu/ml/datasets/Breast+Cancer)