# Naive Bayes from scratch

Problem:  Implement a spam filter based on the Naive Bayes classifier.

This implementation features: 
 - data_loader for the data format (also can split data into train and test).
 - naive_bayes algorithm which classifies the test data and calculates metrics
 - not optimized code. I rather focuses on step by step calculation to maximize understandability
 
Note: Throughout this implementation, Spam is labeled with 1 and Ham is labeled with 0 

In [119]:
import numpy as np 
import pandas as pn
import os
import math as m
import pandas as pd

In [8]:
class data_tansformer:                  # data loader which load all the data into one matrix 
    def __init__(self,path):
        self.path = path
        self.data = self.open_files()
        
    def open_files(self):               #opens all the files and combines them
        documents = []
        titles = []
        for filename in os.listdir(self.path):
            for file in os.listdir(self.path + "/" + filename):
                titles.append(str(file))
                data = open(self.path + "/" + filename + "/" + str(file),"r+")
                data = data.read()
                data = data.replace("\n"," ")
                data = data.replace("Subject:","")
                data = data.split(" ")
                num = []
                for x in range (len(data)):
                    if data[x] == "":
                        pass
                    else: 
                        num.append(int(data[x]))
                documents.append(num)
            numL = []
            for y in titles: 
                if "spmsg"  in y:
                    numL.append(1)
                elif "legit" in y: 
                    numL.append(0)
                    
        documents = np.expand_dims(np.array(documents),axis= 1)
        numL = np.expand_dims(np.array(numL),axis= 1)
        return np.concatenate((documents,numL),axis = 1)
    
    def split(self, percent = 0.3):              #splits the data into train and test based on a percentage
        data = np.random.shuffle(self.data)
        test = self.data[:int(0.3*len(self.data))]
        train = self.data[int(0.3*len(self.data)):]
        return train,test       

In [164]:
class naive_bayes:               #main naive bayes classifier class
    def __init__(self):
        pass
        
    def split_classes(self,data):
        spam = []
        ham = []
        for x in data:
            if x[1] == 1:
                spam.append(x)
            elif x[1] == 0:
                ham.append(x)
        return np.array(spam), np.array(ham)
    
    def create_word_dict(self):                #counts the word occurences for both classes
        spamDic = {}
        hamDic = {}
        spamCount = 0 
        hamCount = 0
        spamV = 0
        hamV = 0 
        for x in range (len(self.spam)): 
            for y in range(len(self.spam[x][0])):
                spamCount +=1 
                if self.spam[x][0][y] in spamDic: 
                    spamDic[self.spam[x][0][y]] +=1
                else: 
                    spamV +=1
                    spamDic[self.spam[x][0][y]] =1
        for x in range (len(self.ham)): 
            for y in range(len(self.ham[x][0])):
                hamCount +=1 
                if self.ham[x][0][y] in hamDic: 
                    hamDic[self.ham[x][0][y]] +=1
                else: 
                    hamV +=1
                    hamDic[self.ham[x][0][y]] =1
        return spamDic,hamDic,np.array([[hamV,hamCount],[spamV,spamCount]])
    
    def fit(self,data):             #return some basic values for probability calculation
        self.spam, self.ham = self.split_classes(data)
        self.n = np.array([len(self.ham),len(self.spam),len(data)])
        self.spamDic, self.hamDic, self.counts = self.create_word_dict()
               
    def predict(self,test) :   # predicts test data based on the fitted data
        self.testData = test
        predictions = []
        
        hamP = []
        pY = m.log(self.n[0]/self.n[2])
        for x in test: 
            pyx = pY
            for y in x[0]: 
                if y in self.hamDic:
                    pyx += m.log((self.hamDic[y]+1)/(self.counts[0][1]+self.counts[0][0]+1))
                else: 
                    pyx += m.log((0+1)/(self.counts[0][1]+self.counts[0][0]+1))
            hamP.append(pyx)
        
        spamP = []
        pY = m.log(self.n[1]/self.n[2])
        for x in test: 
            pyx = pY
            for y in x[0]: 
                if y in self.spamDic:
                    pyx += m.log((self.spamDic[y]+1)/(self.counts[1][1]+self.counts[1][0]+1))
                else: 
                    pyx += m.log((0+1)/(self.counts[1][1]+self.counts[1][0]+1))
            spamP.append(pyx)
            
        for z in range(len(spamP)):
            if spamP[z] >= hamP[z]:
                predictions.append(1)
            else:
                predictions.append(0)     
        self.predictions = predictions
        print("The test data was predicted based on the model.")
    
    def accuracy(self):                                            # 3 basic metrics to ecaluate the model
        accuracy = 0
        for x in range(len(self.predictions)): 
            if self.predictions[x] == self.testData[x][1]:
                accuracy +=1
        return accuracy/len(self.predictions)*100
    
    def confusion_matrix(self):
        conf = np.zeros((2,2), dtype=int)
        for x in range(len(self.predictions)): 
            conf[self.predictions[x]][self.testData[x][1]] += 1
        return conf
    
    def F1(self,conf):
        precH = conf[0][0] / (conf[0][0] + conf[0][1])
        recH = conf[0][0] / (conf[0][0] + conf[1][0])
        precS = conf[1][1] / (conf[1][1] + conf[1][0])
        recS = conf[1][1] / (conf[1][1] + conf[0][1])
        F1H = (2*precH*recH) / (precH+recS)
        F1S = (2*precS*recS) / (precH+recS)
        return [F1H,F1S]
        
    def evaluate(self):                 # Calculates some basic metrics based on the results of the test data
        print("The following values results were achieved for the test data:"+ "\n")
        accuracy = self.accuracy()
        print("Accuracy: " + str(accuracy) + "%" +"\n")
        confusion = self.confusion_matrix()
        print("Confusion Maxtrix:")
        print(pd.DataFrame(data=confusion,index=["Spam","Ham"],columns=["Spam","Ham"]),"\n")
        F1 = self.F1(confusion)
        print("Fscores for both classes:",pd.DataFrame(data=F1,index=["Spam","Ham"],columns=[""]))   

In [165]:
prep = data_tansformer("data/Bayes")
train, test = prep.split()
bayes = naive_bayes()
bayes.fit(train)
bayes.predict(test)
bayes.evaluate()

    
        
        

The test data was predicted based on the model.
The following values results were achieved for the test data:

Accuracy: 94.4954128440367%

Confusion Maxtrix:
      Spam  Ham
Spam   168    3
Ham     15  141 

Fscores for both classes:               
Spam  0.919572
Ham   0.902330
