# Import Modules

In [1]:
# Fetch the ml_utils helper repository; a later cell imports its `Writer` download helper from it.
# NOTE(review): git will not clone into an existing non-empty directory, so re-running this
# cell after a successful clone reports an error instead of refreshing the checkout.
!git clone https://github.com/m-zayan/ml_utils.git

Cloning into 'ml_utils'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 15 (delta 1), reused 11 (delta 0), pack-reused 0[K
Unpacking objects: 100% (15/15), done.


In [2]:
import sys

# Put the cloned repository directory first on the import path so the `ml_utils` import below
# can be resolved; this must run before that import.
sys.path.insert(0, './ml_utils')

from ml_utils.requests_utils.data_request import Writer

# Remote location of the Spambase CSV and the local file name it is saved under.
url = 'https://datahub.io/machine-learning/spambase/r/spambase.csv'
fname = 'spambase.csv'

# Download the CSV into the working directory; a later cell reads './spambase.csv'.
# NOTE(review): chunk_size is presumably the streaming chunk size in bytes — confirm against ml_utils.
Writer.download_from_url(url, to_path=f'./{fname}', chunk_size=1024)

In [3]:
import numpy as np
import pandas as pd

# Reference tools from scikit-learn, used to check how accurate this notebook's own implementation is.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as sklearn_gaussian_nb 
from sklearn.metrics import f1_score as sk_f1_score

# F1 Score Metric

In [4]:
def f1_score(y_true, y_pred, average='none'):
  """Compute the F1 score of predictions against ground-truth labels.

  Parameters
  ----------
  y_true : array-like of shape (n_samples,)
      Ground-truth labels. In binary mode they must be 0/1 (or boolean),
      with 1 being the positive class.
  y_pred : array-like of shape (n_samples,)
      Predicted labels, encoded the same way as ``y_true``.
  average : str, default 'none'
      ``'macro'`` selects the multi-class mode: precision and recall are
      computed per class (classes are taken from ``y_true``), averaged with
      equal weight, and the harmonic mean of the two averages is returned.
      The historical misspelling ``'marco'`` is accepted as an alias.
      Any other value selects the binary mode.

  Returns
  -------
  float
      The F1 score in [0, 1]. Denominators are floored at 1e-7, so a score
      with no positives evaluates to 0.0 instead of dividing by zero.

  Notes
  -----
  The macro mode is the harmonic mean of macro-averaged precision and
  recall. scikit-learn's ``average='macro'`` instead averages the per-class
  F1 scores, so the two can differ on imbalanced problems.
  """
  eps = 1e-7  # floor for denominators, avoids 0/0 when a count is empty

  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  if average in ('macro', 'marco'):

    classes = np.unique(y_true)
    n_classes = len(classes)

    tp = np.zeros(n_classes)
    fp = np.zeros(n_classes)
    fn = np.zeros(n_classes)

    # One-vs-rest confusion counts for every class.
    for i, y_i in enumerate(classes):

      true = (y_true == y_i)
      pred = (y_pred == y_i)

      tp[i] = (true & pred).sum()
      fp[i] = (~true & pred).sum()
      fn[i] = (true & ~pred).sum()

    p = (tp / np.maximum(tp + fp, eps)).mean()
    r = (tp / np.maximum(tp + fn, eps)).mean()

  else:

    # Binary mode: labels are used directly as 0/1 indicators.
    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()

    p = tp / np.maximum(tp + fp, eps)
    r = tp / np.maximum(tp + fn, eps)

  f1 = 2.0 * p * r / np.maximum(p + r, eps)

  return f1

# Model Class

In [5]:
class GaussianNB:
  """Gaussian Naive Bayes classifier implemented with NumPy.

  Every feature is modelled, per class, as an independent normal
  distribution whose mean and variance are estimated from the training
  data. Prediction picks the class with the highest joint log-likelihood
  (log prior plus the sum of per-feature log densities).

  Parameters
  ----------
  var_floor : float, default 1e-12
      Lower bound applied to every estimated per-class variance. A feature
      that is constant within a class has zero variance, which would turn
      the log-density into NaN/inf and make predictions meaningless; the
      floor keeps the density finite. Variances above the floor are left
      untouched, so results on non-degenerate data are unchanged.

  Attributes
  ----------
  x, y : the training arrays passed to the last ``fit`` call (None before).
  params : dict or None
      Fitted state with keys 'mean' and 'sigma' (arrays of shape
      (n_classes, n_features); 'sigma' holds variances), 'prior', 'classes',
      'count', 'n_classes' and 'n_samples'. None until ``fit`` is called.
  """

  def __init__(self, var_floor=1e-12):

    self.x = None
    self.y = None
    self.params = None
    self.var_floor = var_floor

  def __initialization__(self, x , y):
    """Allocate the parameter dict and compute class priors from ``y``."""

    classes, count = np.unique(y, return_counts=True)

    n_samples, n_features = x.shape
    n_classes = len(classes)

    mean = np.zeros((n_classes,n_features))
    sigma = np.zeros((n_classes, n_features))

    # Empirical class frequencies.
    prior = count / n_samples

    self.params = {'mean':mean,
                  'sigma':sigma,
                  'prior':prior, 
                  'classes':classes , 
                  'count':count,
                  'n_classes':n_classes,
                  'n_samples':n_samples}

  def __require_fitted__(self):
    """Raise a clear error when the model is used before ``fit``."""

    if self.params is None:
      raise RuntimeError('GaussianNB is not fitted yet; call fit(x, y) first.')

  def fit(self, x, y):
    """Estimate per-class feature means and variances.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)

    Prints a summary tuple with the priors, classes and sample count.
    """

    x = np.asarray(x)
    y = np.asarray(y)

    self.__initialization__(x, y)

    mean  = self.params['mean']
    sigma = self.params['sigma']

    prior = self.params['prior']

    classes   = self.params['classes']

    n_samples = self.params['n_samples']
    n_classes = self.params['n_classes']

    for i in range(n_classes):

      y_i = classes[i]
      x_i = x[y == y_i]

      mean[i] = x_i.mean(axis=0)
      # Floor the variance so constant features do not produce log(0) / division by zero.
      sigma[i] = np.maximum(x_i.var(axis=0), self.var_floor)

    self.params['mean'] = mean
    self.params['sigma'] = sigma

    self.x = x
    self.y = y

    print(('prior :',prior, 'classes' ,classes, 'n_samples:',n_samples))

  def __joint_log_likelihood__(self, X):
    """Return an (n_samples, n_classes) array of log P(class) + log P(X | class)."""

    self.__require_fitted__()

    X = np.asarray(X)

    joint_log_likelihood = []

    classes = self.params['classes']

    prior = self.params['prior']
    mean = self.params['mean']
    sigma = self.params['sigma']

    for i in range(len(classes)):

      joint_i = np.log(prior[i])
      # Normalisation term of the Gaussian log-density, summed over features.
      n_ij = -0.5 * np.sum(np.log(2. * np.pi * sigma[i, :]))
      # Squared distance to the class mean, scaled by the variance, per sample.
      n_ij -= 0.5 * np.sum(((X - mean[i, :]) **2 ) / sigma[i, :], axis=1)

      joint_log_likelihood.append(joint_i + n_ij)

    # Stack per-class rows and transpose so rows index samples.
    joint_log_likelihood = np.array(joint_log_likelihood).T

    return joint_log_likelihood


  def predict(self, x_test):
    """Return the most likely class label for every row of ``x_test``.

    Raises
    ------
    RuntimeError
        If called before ``fit``.
    """

    self.__require_fitted__()

    classes = self.params['classes']
    jll = self.__joint_log_likelihood__(x_test)

    return classes[np.argmax(jll, axis=1)]

# Split the Dataset into a Train Set and a Test Set

In [6]:
# Load the downloaded Spambase CSV: the 'class' column is the label,
# every remaining column is a feature.
spam_data = pd.read_csv('./spambase.csv')

X = spam_data.drop(columns=['class']).to_numpy()
Y = spam_data['class'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(x_train.shape, x_test.shape, sep='\n')

(3680, 57)
(921, 57)


# Test Model Implementation

In [7]:
# Train the notebook's own GaussianNB on the training split; fit prints the
# class priors, the class labels and the number of training samples.
gnb = GaussianNB()

gnb.fit(x_train, y_train)

('prior :', array([0.61331522, 0.38668478]), 'classes', array([0, 1]), 'n_samples:', 3680)


In [8]:
# Predict the held-out split with the notebook's own model and score it with both
# the hand-written f1_score and scikit-learn's, so the two metrics can be compared.
prediction = gnb.predict(x_test)

print('F1 Score :', f1_score(y_test, prediction))
print('Sklearn F1 Score :' , sk_f1_score(y_test, prediction))

F1 Score : 0.8169642857142857
Sklearn F1 Score : 0.8169642857142857


# Using Sklearn

In [9]:
# Train scikit-learn's GaussianNB on the same training split as a reference model.
s_gnb = sklearn_gaussian_nb()

s_gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [10]:
# Score the scikit-learn reference model the same way as the notebook's own model.
# Note: `prediction` is rebound here, replacing the predictions of the notebook's own model.
prediction = s_gnb.predict(x_test)

print('F1 Score :', f1_score(y_test, prediction))
print('Sklearn F1 Score :' , sk_f1_score(y_test, prediction))

F1 Score : 0.8172757475083057
Sklearn F1 Score : 0.8172757475083057
