In [166]:
import numpy as np
import math

class GaussianNaiveBayes:
      
    def __init__(self):
        self.mean = None
        self.std = None
        self.probs = None
      
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        
        self.probs = np.zeros(len(values))
        self.mean = np.zeros((len(values), X.shape[1]))
        self.std = np.zeros((len(values), X.shape[1]))
        
        for i, value in enumerate(values):
            self.probs[value] = counts[i]/len(y)

        for label in range(len(self.probs)):
            indices = np.where(y == label)[0]
            relevant_samples = X[indices]
            for feat in range(relevant_samples.shape[1]):
                feat_mean = np.mean(relevant_samples[:, feat])
                feat_std = np.std(relevant_samples[:, feat])
              
                self.mean[label][feat] = feat_mean
                self.std[label][feat] = feat_std       
        return self
      
    def predict(self, X):
        labels = []
        for x in X:
            selected_k = -1
            max_prob_for_k = 0
            for k in range(len(self.probs)):
                sum = 0
                for i in range(self.mean.shape[1]):
                    prob_xi_ck = 1/math.sqrt(2*math.pi*self.std[k][i])*math.exp(-((x[i]-self.mean[k][i])**2)/(2*self.std[k][i]))
                    sum += prob_xi_ck
                prob_k = math.log(self.probs[k]) + sum
                if(max_prob_for_k < prob_k):
                    max_prob_for_k = prob_k
                    selected_k = k
            labels.append(selected_k)

        return labels

In [181]:
from sklearn.datasets import load_iris
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

my_gnb = GaussianNaiveBayes()
my_gnb.fit(X_train, y_train)
y_pred = my_gnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

gnb = GaussianNB()

gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')


avg-values -> precision: 0.9666666666666667, recall: 0.9666666666666667, f1 score: 0.9666666666666667
avg-values -> precision: 0.9506159420289855, recall: 0.95, f1 score: 0.9499457994579946


In [24]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Get the data and target
X = newsgroups.data  # List of documents (text data)
y = newsgroups.target  # Corresponding labels (newsgroup categories)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [26]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the Breast Cancer Wisconsin dataset
cancer = load_breast_cancer()

# Get the features and target
X = cancer.data  # Feature matrix
y = cancer.target  # Target labels (0 = malignant, 1 = benign)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)