In [5]:
import numpy as np
import math

class GaussianNaiveBayes:
      
    def __init__(self):
        self.mean = None
        self.std = None
        self.probs = None
      
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        
        self.probs = np.zeros(len(values))
        self.mean = np.zeros((len(values), X.shape[1]))
        self.std = np.zeros((len(values), X.shape[1]))
        
        for i, value in enumerate(values):
            self.probs[value] = counts[i]/len(y)

        for label in range(len(self.probs)):
            indices = np.where(y == label)[0]
            relevant_samples = X[indices]
            for feat in range(relevant_samples.shape[1]):
                feat_mean = np.mean(relevant_samples[:, feat])
                feat_std = np.std(relevant_samples[:, feat])
              
                self.mean[label][feat] = feat_mean
                self.std[label][feat] = feat_std       
        return self
      
    def predict(self, X):
        labels = []
        for x in X:
            selected_k = -1
            max_prob_for_k = 0
            for k in range(len(self.probs)):
                sum = 0
                for i in range(self.mean.shape[1]):
                    prob_xi_ck = 1/math.sqrt(2*math.pi*self.std[k][i])*math.exp(-((x[i]-self.mean[k][i])**2)/(2*self.std[k][i]))
                    sum += prob_xi_ck
                prob_k = math.log(self.probs[k]) + sum
                if(max_prob_for_k < prob_k):
                    max_prob_for_k = prob_k
                    selected_k = k
            labels.append(selected_k)

        return labels

In [6]:
from sklearn.datasets import load_iris
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

my_gnb = GaussianNaiveBayes()
my_gnb.fit(X_train, y_train)
y_pred = my_gnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

gnb = GaussianNB()

gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')


avg-values -> precision: 0.946031746031746, recall: 0.9333333333333333, f1 score: 0.9333333333333333
avg-values -> precision: 0.9575, recall: 0.95, f1 score: 0.9501039501039501


In [7]:
# class BernoulliNaiveBayes:
      
#     def __init__(self):
#         self.probs = None
#         self.feats = None
      
#     def fit(self, X, y):
#         values, counts = np.unique(y, return_counts=True)
#         self.probs = counts/len(y)
        
#         self.feats = np.zeros((len(self.probs), X.shape[1]))
#         for i, k in enumerate(values):
#             indices = np.where(y == k)
#             relevant_samples = X[indices]
#             self.feats[i, :] = (np.sum(relevant_samples, axis=0) + 1) / (relevant_samples.shape[0] + 2)

        
#         return self
      
#     def predict(self, X):
#         labels = []

#         for x in X:
#             x = x.toarray().flatten()
#             selected_k = -1
#             max_prob_for_k = 0
#             for k in range(len(self.probs)):
#                 sum = 0
#                 for i in range(x.shape[0]):
#                     prob_xi_ck = self.feats[k][i]**x[i]*(1-self.feats[k][i])**(1-x[i])
#                     sum += prob_xi_ck
#                 prob_k = math.log(self.probs[k]) + sum
#                 if(max_prob_for_k < prob_k):
#                     max_prob_for_k = prob_k
#                     selected_k = k
#             labels.append(selected_k)

#         return labels

In [80]:
import numpy as np

class BernoulliNaiveBayes:
    
    def __init__(self):
        self.probs = None
        self.feats = None
    
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        self.probs = counts / len(y)
        self.feats = np.zeros((len(self.probs), X.shape[1]))

        for i, k in enumerate(values):
            indices = np.where(y == k)
            relevant_samples = X[indices]
            
            sum = (np.sum(relevant_samples, axis=0) + 1) # +1 for Laplace smoothing
            self.feats[i, :] = sum / (relevant_samples.shape[0] + 2) # +2 for Laplace smoothing
        
        return self
    
    def predict(self, X, calc_logs=True):
        labels = []

        X = X.toarray().flatten()
        for x in X:
            # x = x_sparse.toarray().flatten()
            selected_k = -1
            max_prob_for_k = -1

            for k in range(len(self.probs)):

                prob_k = self.calculate_prob_logs(k, x) if calc_logs else self.calculate_prob_directly(k, x)

                if prob_k > max_prob_for_k:
                    max_prob_for_k = prob_k
                    selected_k = k

            labels.append(selected_k)

        return labels
    
    def calculate_prob_directly(self, k, x):
        prob_xi_ck = self.feats[k] ** x * (1 - self.feats[k]) ** (1 - x)
        prob_k = self.probs[k] * np.prod(prob_xi_ck)
        return prob_k
    
    def calculate_prob_logs(self, k, x):
        log_likelihood = x * np.log(self.feats[k]) + (1 - x) * np.log(1 - self.feats[k])  
        log_prob_k = np.log(self.probs[k]) + np.sum(log_likelihood)
        return log_prob_k



In [50]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

X_data = newsgroups.data
y = newsgroups.target

cnt_vect = CountVectorizer()
cnt_vect.fit(X_data)
X = cnt_vect.transform(X_data)
X.data[X.data>1] = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [58]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

avg-values -> precision: 0.6604541322202436, recall: 0.4917771883289125, f1 score: 0.46683201136660324


In [61]:
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

avg-values -> precision: 0.6604541322202436, recall: 0.4917771883289125, f1 score: 0.46683201136660324


In [73]:
bnb = BernoulliNaiveBayes()

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test, calc_logs=False)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

avg-values -> precision: 0.5756197639939095, recall: 0.39283819628647215, f1 score: 0.3815098191772453


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
bnb = BernoulliNaiveBayes()

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

In [None]:
import numpy as np

class BernoulliNaiveBayesGoodLogs:
      
    def __init__(self):
        self.class_log_prior_ = None
        self.feature_log_prob_ = None
      
    def fit(self, X, y):
        # Get the unique class values and their counts
        values, counts = np.unique(y, return_counts=True)
        
        # Compute the log of class priors
        self.class_log_prior_ = np.log(counts / len(y))  # Log of prior probabilities
        
        # Initialize feature log probabilities
        self.feature_log_prob_ = np.zeros((len(self.class_log_prior_), X.shape[1]))

        for i, k in enumerate(values):
            indices = np.where(y == k)
            relevant_samples = X[indices]
            
            # Laplace smoothing: Add 1 to feature count, add 2 to denominator for smoothing
            smoothed_counts = (np.sum(relevant_samples, axis=0) + 1)
            self.feature_log_prob_[i, :] = np.log(smoothed_counts / (relevant_samples.shape[0] + 2))
        
        return self
      
    def predict(self, X):
        labels = []

        for x in X:
            x_dense = x.toarray().flatten()  # Convert to dense array
            selected_k = -1
            max_log_prob = -np.inf  # Use negative infinity for log probabilities

            for k in range(len(self.class_log_prior_)):
                # Calculate log likelihood for each class k
                log_likelihood = x_dense * self.feature_log_prob_[k] + (1 - x_dense) * np.log(1 - np.exp(self.feature_log_prob_[k]))
                
                # Sum log likelihood and class prior (log of prior probability)
                log_prob_k = self.class_log_prior_[k] + np.sum(log_likelihood)

                # Select the class with the highest log probability
                if log_prob_k > max_log_prob:
                    max_log_prob = log_prob_k
                    selected_k = k

            labels.append(selected_k)

        return labels

    

In [None]:
bnbgl = BernoulliNaiveBayesGoodLogs()

bnbgl.fit(X_train, y_train)
y_pred = bnbgl.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

In [None]:

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test, calc_logs=False)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the Breast Cancer Wisconsin dataset
cancer = load_breast_cancer()

# Get the features and target
X = cancer.data  # Feature matrix
y = cancer.target  # Target labels (0 = malignant, 1 = benign)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

my_gnb = GaussianNaiveBayes()
my_gnb.fit(X_train, y_train)
y_pred = my_gnb.predict(X_test)
precision_avg, recall_avg, f1_score_avg, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f'avg-values -> precision: {precision_avg}, recall: {recall_avg}, f1 score: {f1_score_avg}')