<a href="https://colab.research.google.com/github/mugekuskon/BuildingSpamClassifier/blob/main/Building_a_Spam_Classifier_with_Naive_Bayes_Muge_Kuskon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Müge Kuşkon 25425*

# ***Naïve Bayes Implementation***

In [None]:
import pandas as pd

#Load the training data: the feature file and its matching label file.
#Both are space-separated and have no header row.
train_features, train_labels = (
    pd.read_csv(path, sep=" ", header=None)
    for path in ('train-features.txt', 'train-labels.txt')
)

#Number of vocabulary words (feature columns) the classifier iterates over.
vocab = 2500


In [None]:
#Class priors estimated from the training labels (column 0; 1 = spam, 0 = nonspam).
#Taking the mean of the boolean mask gives the spam fraction directly and, unlike
#value_counts()[1], does not raise a KeyError when no spam label is present.
#The recorded output (0.5 / 0.5) shows that the training data is balanced.
probOfspam = (train_labels[0] == 1).mean()
probOfnonspam =  1 - probOfspam

print("Py=0 is ", probOfnonspam, "\nPy=1 is ", probOfspam)


Py=0 is  0.5 
Py=1 is  0.5


 **Training a Naive Bayes Classifier**

In [None]:
#Train it on the training data and test the classifier on the test data.
#Report test accuracy and how many predictions were made.
#The MLE estimator should be used. On a tie, predict nonspam.

import numpy as np

#print(train_labels[0]) #To check whether the mail belongs to a spam or not. 


In [None]:
#Total word count of each training email: the row-wise sum over the vocabulary columns.
#The number of emails now comes from the data itself instead of a hard-coded 700,
#and the sum is vectorised instead of visiting every (email, word) cell in a Python loop.
sumForEachMail = (
    train_features[list(range(vocab))]  #columns labelled 0..vocab-1, same as range(vocab)
    .astype(int)                        #same per-cell integer conversion as int(...)
    .sum(axis=1)
    .tolist()                           #plain list of Python ints, one entry per email
)

In [None]:
#Functions to be used:
import math

#TRAINING PART:

def calculateNumofWords(sumForEachMail, train_labels):
  """Total up the word counts of the spam and the nonspam emails.

  sumForEachMail -- one word total per email.
  train_labels   -- container whose element 0 holds the per-email labels
                    (1 = spam, 0 = nonspam), indexed like sumForEachMail.

  Emails carrying any other label contribute to neither total.
  Returns the pair (numofSpam, numofNonSpam).
  """
  spamTotal = 0
  nonSpamTotal = 0
  for idx, wordCount in enumerate(sumForEachMail):
    label = train_labels[0][idx]
    if label == 1: #spam
      spamTotal += wordCount
    elif label == 0: #nonspam
      nonSpamTotal += wordCount
  return spamTotal, nonSpamTotal

def occurences(vocab, train_features, train_labels, nonspam, spam):
  """Accumulate per-word occurrence counts for each class, in place.

  vocab          -- number of word columns to visit (0..vocab-1).
  train_features -- table indexed as train_features[word][email]; its len()
                    is used as the number of emails.
  train_labels   -- container whose element 0 holds the per-email labels
                    (1 = spam, 0 = nonspam).
  nonspam, spam  -- per-word counters that are updated in place.

  Emails with any other label are skipped.
  Returns the pair (spam, nonspam) -- note the order.
  """
  for row in range(len(train_features)):
    label = train_labels[0][row]
    if label == 0: #belongs to the nonspam category
      counters = nonspam
    elif label == 1: #belongs to the spam category
      counters = spam
    else:
      continue
    for word in range(vocab):
      counters[word] += train_features[word][row] #add this email's count for the word.
  return spam, nonspam

def spamEstimator(spam, numofSpam, map, total):
  """Turn per-word spam counts into per-word estimates.

  Each count c becomes (c + map) / (numofSpam + total); map and total are
  the additive-smoothing terms (pass 0 and 0 for no smoothing).
  Returns a new list with one estimate per entry of spam.
  """
  return [(count+map)/(numofSpam+total) for count in spam]

def nonspamEstimator(nonspam, numofNonSpam, map, total):
  """Turn per-word nonspam counts into per-word estimates.

  Each count c becomes (c + map) / (numofNonSpam + total); map and total are
  the additive-smoothing terms (pass 0 and 0 for no smoothing).
  Returns a new list with one estimate per entry of nonspam.
  """
  return [(count+map)/(numofNonSpam+total) for count in nonspam]

#TESTING PART:

def logCalculation(occ, estimation):
  """Return occ * log(estimation), with explicit handling of a zero estimate.

  When estimation is 0 the logarithm is undefined, so:
    * occ == 0 -> 0     (the word is absent and contributes nothing)
    * occ != 0 -> -inf  (the word is present but has zero estimate)
  """
  if estimation != 0:
    return occ*math.log(estimation)
  if occ == 0:
    return 0
  return float(-math.inf)

def mailPredict(test_features, vocab, mailPrediction, nonspamEstimation, spamEstimation):
  """Classify every test email as spam (1) or nonspam (0).

  test_features     -- table indexed as test_features[word][email]; its len()
                       is used as the number of emails.
  vocab             -- number of word columns to visit (0..vocab-1).
  mailPrediction    -- list that receives one prediction per email (appended in place).
  nonspamEstimation -- per-word estimates for the nonspam class.
  spamEstimation    -- per-word estimates for the spam class.

  Each class score is the sum over words of logCalculation(count, estimate)
  plus the log of the class prior, read from the module-level probOfnonspam
  and probOfspam. An email is labelled spam only when the spam score is
  strictly larger; a tie is labelled nonspam.
  Returns mailPrediction.
  """
  for mail in range(len(test_features)):
    scoreNonspam = 0
    scoreSpam = 0
    for word in range(vocab):
      count = test_features[word][mail]
      scoreNonspam += logCalculation(count, nonspamEstimation[word]) #count times log-estimate for nonspam.
      scoreSpam += logCalculation(count, spamEstimation[word])
    #add the log prior of each class.
    scoreNonspam = logCalculation(1,probOfnonspam) + scoreNonspam
    scoreSpam = logCalculation(1,probOfspam) + scoreSpam
    #spam only when strictly more likely; otherwise (including ties) nonspam.
    mailPrediction.append(1 if scoreNonspam < scoreSpam else 0)

  return mailPrediction



def accuracyCalculation(predict,test_labels):
  """Return the fraction of predictions that match the true labels.

  predict     -- sequence of predicted labels.
  test_labels -- container whose element 0 holds the true labels; its len()
                 is the denominator.
  """
  hits = sum(1 for idx, guess in enumerate(predict) if guess == test_labels[0][idx])
  return hits/len(test_labels)

In [None]:
#Calculating number of words in spam and nonspam emails.
numofSpam, numofNonSpam = calculateNumofWords(sumForEachMail, train_labels)

#Number of words in each class
print("Number of words in Spam:", numofSpam, "\nNonSpam: ", numofNonSpam)

#Per-word occurrence counters for the two classes, one slot per vocabulary word.
#They are sized from `vocab` rather than a hard-coded 2500, so they always match
#the range that occurences() iterates over.
nonspam = [0]*vocab
spam = [0]*vocab

#occurences() fills both counters in place and returns them as (spam, nonspam).
spam, nonspam = occurences(vocab, train_features, train_labels, nonspam, spam)
#print(spam)

Number of words in Spam: 91566 
NonSpam:  61752


In [None]:
#Plain MLE: both smoothing terms are zero, so no additive (MAP) smoothing is applied.
spamEstimation = spamEstimator(spam, numofSpam, map=0, total=0) #per-word estimate given the spam class (y=1).
nonspamEstimation = nonspamEstimator(nonspam, numofNonSpam, map=0, total=0) #per-word estimate given the nonspam class (y=0).


**Testing Part:**

In [None]:

#Load the test data: the feature file and its matching label file.
#Both are space-separated and have no header row.
test_features, test_labels = (
    pd.read_csv(path, sep=" ", header=None)
    for path in ('test-features.txt', 'test-labels.txt')
)


In [None]:
#Predict a label (0 or 1) for every test email with the unsmoothed (MLE) estimates.
#mailPredict appends to the list it is given and returns that same list.
mailPrediction = mailPredict(test_features, vocab, [], nonspamEstimation, spamEstimation)

In [None]:
#Fraction of test emails whose MLE prediction matches the true label.
#NOTE(review): the recorded accuracy is 0.5 -- presumably zero estimates push both
#class scores to -inf so ties fall to nonspam; confirm.
accuracy = accuracyCalculation(predict=mailPrediction, test_labels=test_labels)
print("Accuracy of the model is: ", accuracy )

Accuracy of the model is:  0.5


In [None]:
#Extending the classifier by using additive smoothing.
spamEstimatorforMAP = spamEstimator(spam, numofSpam, 1, vocab) #the last two parameters are for the additive smoothing.
nonspamEstimationforMAP = nonspamEstimator(nonspam, numofNonSpam, 1, vocab)

In [None]:
#Predict a label (0 or 1) for every test email with the smoothed (MAP) estimates.
#mailPredict appends to the list it is given and returns that same list.
mailPredictionforMAP = mailPredict(test_features, vocab, [], nonspamEstimationforMAP, spamEstimatorforMAP)

In [None]:
#Fraction of test emails whose smoothed (MAP) prediction matches the true label.
accuracy = accuracyCalculation(predict=mailPredictionforMAP, test_labels=test_labels)
print("Accuracy of the model after additive smoothing is: ", accuracy )

Accuracy of the model after additive smoothing is:  0.9730769230769231
