In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset/result directory configured below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing necessary libraries


In [None]:
import numpy as np
import pandas as pd
import tracemalloc
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

## Algorithm Directory path

In [None]:
# Base directory (inside the mounted Google Drive) used by this notebook:
# all datasets are loaded from it and all results are stored in it.
# The trailing slash matters: file names are appended by plain string concatenation.
directoryPath = '/content/drive/MyDrive/Colab Notebooks/Data Mining Assignments/Classification/'

## Naive Bayes Classifier

In [None]:
class NaiveBayesClassifier:
  """Naive Bayes classifier for data mixing discrete and continuous attributes.

  An attribute (column of X) is treated as 'discrete' when it has at most
  `discreteThreshold` distinct values in the training data, otherwise as
  'continuous'. Discrete attributes use class-conditional relative
  frequencies; continuous attributes use a per-class Gaussian fitted with
  the class mean and (population) standard deviation.

  The classifier is lazy: the training data is kept on the instance and the
  discrete conditional frequencies are counted at prediction time.
  """

  def __init__(self, X, Y, discreteThreshold = 15, eps = 1e-6):
    """Store the training data and pre-compute per-attribute statistics.

    Args:
      X: 2-D array-like of shape (instances, attributes).
      Y: 1-D array-like of class labels, one per row of X.
      discreteThreshold: maximum number of distinct values for an attribute
        to be treated as discrete.
      eps: stand-in for a zero conditional probability (discrete attributes)
        and for a zero standard deviation (continuous attributes).
    """
    self.X = np.asarray(X)
    self.Y = np.asarray(Y)
    self.discreteThreshold = discreteThreshold
    self.eps = eps
    # Class-level statistics depend only on the training labels, so they are
    # computed once here instead of on every call to predict_one.
    self._classes, self._classCounts = np.unique(self.Y, return_counts=True)
    self._logPriors = np.log(self._classCounts / np.sum(self._classCounts))
    self._classMasks = [self.Y == cls for cls in self._classes]
    self.attributeInfo = self.get_attribute_info()

  def normal_PDF(self, val, mu, sigma):
    """Return the Gaussian density N(val; mu, sigma).

    A zero `sigma` is replaced by `self.eps` to avoid division by zero.
    Note that the result underflows to 0.0 far in the tails; use
    `log_normal_PDF` when the logarithm of the density is needed.
    """
    sigma = sigma if sigma != 0 else self.eps
    exponentTerm = (-1) * ( ( (val-mu) ** 2 ) / ( 2 * (sigma ** 2) ) )
    return (1/(np.sqrt(2*np.pi) * sigma)) * np.exp(exponentTerm)

  def log_normal_PDF(self, val, mu, sigma):
    """Return log N(val; mu, sigma), evaluated directly in log space.

    Working in log space keeps the value finite and strictly decreasing with
    the distance |val - mu|, even where `normal_PDF` would underflow to 0.
    A zero `sigma` is replaced by `self.eps`, as in `normal_PDF`.
    """
    sigma = sigma if sigma != 0 else self.eps
    return -0.5 * np.log(2 * np.pi) - np.log(sigma) - ((val - mu) ** 2) / (2 * (sigma ** 2))

  def get_attribute_info(self):
    """Describe every attribute of the training data.

    Returns:
      A NumPy object array with one dict per column of X containing:
        'idx'            - column index,
        'type'           - 'discrete' or 'continuous',
        'distinctValues' - sorted distinct values (discrete only, else None),
        'classWiseMean'  - {class: mean} (continuous only, else empty),
        'classWiseStd'   - {class: std}  (continuous only, else empty).
    """
    attributeInfo = []
    for i in range(self.X.shape[1]):
      column = self.X[:, i]
      distinctValues = np.unique(column)
      attributeType = 'discrete' if len(distinctValues) <= self.discreteThreshold else 'continuous'
      classWiseMean = {}
      classWiseStd = {}
      if attributeType == 'continuous':
        for cls, classMask in zip(self._classes, self._classMasks):
          classValues = column[classMask]
          classWiseMean[cls] = np.mean(classValues)
          classWiseStd[cls] = np.std(classValues)

      attributeInfo.append({
          'idx' : i,
          'type' : attributeType,
          'distinctValues' : distinctValues if attributeType == 'discrete' else None,
          'classWiseMean' : classWiseMean,
          'classWiseStd' : classWiseStd
      })

    return np.array(attributeInfo)

  def predict_one(self, x):
    """Return the most probable class for a single instance.

    Args:
      x: 1-D sequence with one value per attribute, in training column order.

    Returns:
      The class label with the largest log-posterior. Ties go to the class
      that comes last in sorted label order. Returns None when there are no
      training classes or every posterior is NaN.
    """
    maxPosterior = -np.inf
    winClass = None
    for i, cls in enumerate(self._classes):
      classMask = self._classMasks[i]
      logLikelihood = 0.0
      for j, info in enumerate(self.attributeInfo):
        if info['type'] == 'discrete':
          # Relative frequency of x[j] among training rows of this class.
          matchCount = (self.X[:, j][classMask] == x[j]).sum()
          conditionalProb = matchCount / self._classCounts[i]
          # An unseen (value, class) pair would give log(0); use eps instead.
          conditionalProb = conditionalProb if conditionalProb != 0 else self.eps
          logLikelihood += np.log(conditionalProb)
        else:
          # Evaluate the Gaussian in log space. Taking log(normal_PDF(...))
          # and substituting eps when the density underflows to 0 would rank
          # an extreme outlier (log(eps) ~ -13.8) above a moderately
          # unlikely value, inverting the comparison between classes.
          logLikelihood += self.log_normal_PDF(x[j], info['classWiseMean'][cls], info['classWiseStd'][cls])

      posterior = logLikelihood + self._logPriors[i]
      if posterior >= maxPosterior:
        maxPosterior = posterior
        winClass = cls

    return winClass

  def predict(self, XTest):
    """Return an array with the predicted class of every row of XTest."""
    return np.array([self.predict_one(x) for x in XTest])

## Running on sample datasets


In [None]:
# (Disabled) Sanity check on sampleDataset-1.csv: the last column is the label,
# the classifier is built on the whole file and then predicts the same rows,
# so the printed count is the number of correctly re-classified training rows.
# filePath = directoryPath + 'sampleDataset-1.csv'
# df = pd.read_csv(filePath, sep=",", header=None)
# dfX = df.iloc[:,:-1]
# dfY = df.iloc[:,-1]
# # print(dfX.head())
# # print(dfY.head())

# X = dfX.to_numpy()
# Y = np.squeeze(dfY.to_numpy())
# # print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# naiveBayesClassifier = NaiveBayesClassifier(X, Y, discreteThreshold = 15)
# YPred = naiveBayesClassifier.predict(X)
# print(Y)
# print(YPred)
# print((Y == YPred).sum())

In [None]:
# (Disabled) Sanity check on sampleDataset-2.csv: the last column is the label,
# the classifier is built on the whole file and then predicts the same rows,
# so the printed count is the number of correctly re-classified training rows.
# filePath = directoryPath + 'sampleDataset-2.csv'
# df = pd.read_csv(filePath, sep=",", header=None)
# dfX = df.iloc[:,:-1]
# dfY = df.iloc[:,-1]
# # print(dfX.head())
# # print(dfY.head())

# X = dfX.to_numpy()
# Y = np.squeeze(dfY.to_numpy())
# # print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# naiveBayesClassifier = NaiveBayesClassifier(X, Y, discreteThreshold = 15)
# YPred = naiveBayesClassifier.predict(X)
# print(Y)
# print(YPred)
# print((Y == YPred).sum())

In [None]:
# (Disabled) Sanity check on scikit-learn's iris data: build the classifier on
# the full data set, predict the same rows, then print the number of matches
# and the indices of the rows that were misclassified.
# from sklearn.datasets import load_iris
# X, Y = load_iris(return_X_y= True)
# Y = np.squeeze(Y)
# # print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# naiveBayesClassifier = NaiveBayesClassifier(X, Y, discreteThreshold = 15)
# # print(naiveBayesClassifier.get_attribute_info())
# YPred = naiveBayesClassifier.predict(X)
# print(Y)
# print(YPred)
# print((Y == YPred).sum())
# print(np.arange(len(Y))[Y!=YPred])

In [None]:
# (Disabled) Sanity check on scikit-learn's digits data: build the classifier
# on the full data set, predict the same rows and print the fraction matched,
# i.e. accuracy on the training data, not on held-out data.
# NOTE(review): the saved output under this cell was presumably produced by
# this code before it was commented out - confirm by re-running.
# from sklearn.datasets import load_digits
# X, Y = load_digits(return_X_y= True)
# Y = np.squeeze(Y)
# # print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# naiveBayesClassifier = NaiveBayesClassifier(X, Y, discreteThreshold = 15)
# # print(naiveBayesClassifier.get_attribute_info())
# YPred = naiveBayesClassifier.predict(X)
# # print(Y)
# # print(YPred)
# print((Y == YPred).sum()/len(Y))
# # print(np.arange(len(Y))[Y!=YPred])

0.8580968280467446
