# SPAM CLASSIFIER 


### INTRODUCTION
1) This is a spam classifier which takes an SMS and predicts whether it is spam or ham.
2) The libraries used in the making of the project are mentioned below 
3) Short explanation of the code below:
    * Created a function "processData" to process the data, that is, to remove punctuation and digits, convert to lowercase and tokenize the data.
    * Separated the data into SMStrainingSet and SMStestSet using the function SMSseparateTheData and stored both as lists
           for example : [ ['spam', [words in it] ] ....]
    * We created a makeSMSvocab which stores all the words in a dictionary as keys and in values we have how many times the word appeared in a ham or spam which we maintain using Counter
    * We used the Binary method and created functions for prior( P( C ) ) , likelihood( P( wd | C ) ) , score
    * We also define a function called predictor which takes processed data as input and tells whether it is ham or spam.
    * We then have a function which makes the confusion matrix as well as tells us the accuracy of our model
    * At last we have a function which tells the top 10 words for each class( ham or spam )


In [None]:
import math
import string
from collections import Counter

import numpy as np
import pandas as pd

### IMPORTING DATA

In [None]:
# Load the tab-separated SMS corpus with pandas. The file carries no header
# row, so the two column names are supplied here.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
)
df

### FUNCTIONS TO PROCESS THE DATA
    1) removePunc(text):
         takes a string as input and removes all numbers and punctuations from it returns the string
    2) lowerCase(text):
        takes a string as input and make all alphabets to lower case and returns the string
    3) tokenize(text):
        takes a string as input and separates all the words and store them in a array and returns the array   
    4) processData(text):
        takes string as input and uses all three above function to process the data (easier to use than using three functions)

In [None]:
def removePunc(text):
    """Strip punctuation and digits from ``text`` and turn newlines into spaces.

    Every character of ``string.punctuation``, the pound sign and the ASCII
    digits 0-9 is dropped. Each newline becomes a single space so that words
    on adjacent lines stay separated. All other characters are kept as they
    are.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # A single str.translate pass replaces growing the result one character
    # at a time in a Python loop.
    unwanted = string.punctuation + "£0123456789"
    return text.translate(str.maketrans("\n", " ", unwanted))


def lowerCase(text):
    """Return ``text`` with its letters converted to lower case."""
    lowered = text.lower()
    return lowered

def tokenize(text):
    """Split ``text`` into words on the space character.

    Only " " acts as a separator. Runs of consecutive spaces, and leading or
    trailing spaces, never produce empty tokens. Any other whitespace (a tab,
    for instance) stays inside the token it appears in.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty space-separated pieces, in order.
    """
    return [piece for piece in text.split(" ") if piece]

def processData(text):
    """Run the whole cleaning pipeline on one raw string.

    Applies removePunc, then lowerCase, then tokenize, so callers only need a
    single call.

    Args:
        text: The raw string.

    Returns:
        The list of cleaned, lower-cased tokens.
    """
    return tokenize(lowerCase(removePunc(text)))

### PROCESSING THE DATA
    We process the data here and store it in procData in the form : [['ham',['i','am',..]],['spam',['free','yes'..]]....]

In [None]:
# Build procData as [[label, tokens], ...] with one entry per row of df:
# the label is taken from the "label" column and the tokens are the "text"
# column passed through processData.
# The two columns are walked in lockstep instead of looking each row up with
# df.loc[i, ...], so this does not depend on df having the default 0..n-1 index.
# (The original comment block here ended in `'''a`, which is a syntax error.)
procData = [
    [label, processData(text)] for label, text in zip(df["label"], df["text"])
]
print(len(procData))
procData

### SEPARATING THE DATA INTO TESTSET AND TRAINING SET

In [None]:
# Module-level state filled in by SMSseparateTheData.
SMStrainingSet = []
SMScount = {"ham": 0, "spam": 0}  # number of ham / spam documents in the training set
SMStestSet = []


def SMSseparateTheData(
    DataInTraining,
):  # please give percentage of data that should be in trainingSet
    """Split procData, in its existing order, into training and test sets.

    The first ``round(DataInTraining * len(procData) / 100)`` entries are
    appended to SMStrainingSet and tallied per label in SMScount. Every
    remaining entry is appended to SMStestSet.

    Args:
        DataInTraining: Percentage of procData that goes to the training set.
    """
    split = round((DataInTraining * len(procData)) / 100)
    for position, sample in enumerate(procData, start=1):
        if position > split:
            # Past the cut-off: everything from here on is test data.
            SMStestSet.append(sample)
            continue
        SMScount[sample[0]] += 1
        SMStrainingSet.append(sample)


SMSseparateTheData(89.75)
len(SMStrainingSet)

### DEFINING VOCABULARY


In [None]:
# Training vocabulary: word -> Counter holding, per class, the number of
# training messages that contain the word.
SMSV = {}


def makeSMSvocab():
    """Fill SMSV from SMStrainingSet with per-class document frequencies.

    Each message contributes at most once per word, because its tokens are
    reduced to a set first. A message whose label is not "spam" is counted
    under "ham".
    """
    for sample in SMStrainingSet:
        cls = "spam" if sample[0] == "spam" else "ham"
        for word in set(sample[1]):
            if word in SMSV:
                SMSV[word][cls] += 1
            else:
                # First time this word is seen in any training message.
                SMSV[word] = Counter({cls: 1})


makeSMSvocab()
SMSV

## FUNCTIONS TO COMPUTE PRIOR , LIKELIHOOD AND SCORE 


### FUNCTION FOR PRIOR

In [None]:
def SMScomputePrior(cls):
    """Return the prior probability P(cls) estimated from the training set.

    This is the number of training documents labelled ``cls`` (from SMScount)
    divided by the total number of documents in SMStrainingSet.

    Args:
        cls: Class label, a key of SMScount ("ham" or "spam").
    """
    docs_in_class = SMScount[cls]
    total_docs = len(SMStrainingSet)
    return docs_in_class / total_docs


print(SMScomputePrior("spam"))

### FUNCTION FOR LIKELIHOOD

In [None]:
SMSalpha = 0.00001  # smoothing parameter


def SMSsingleWordLikelihood(w, cls):
    """Return the smoothed presence likelihood P(w | cls).

    Only presence or absence of a word in a document matters here, not how
    often it occurs:

        P(w | C) = (#docs of C containing w + alpha) / (#docs of C + 2 * alpha)

    A word missing from the vocabulary SMSV is treated as contained in zero
    documents.

    Args:
        w: The word.
        cls: Class label, a key of SMScount ("ham" or "spam").
    """
    docs_with_word = SMSV[w][cls] if w in SMSV else 0
    return (docs_with_word + SMSalpha) / (SMScount[cls] + 2 * SMSalpha)





def SMSlikelihood(wd, cls):
    """Return the Bernoulli likelihood P(wd | cls) over the whole vocabulary.

    For every word of SMSV the factor is P(w | cls) when the word occurs in
    the message and 1 - P(w | cls) when it does not. The factors are
    multiplied in vocabulary order.

    Args:
        wd: The processed message as a list of tokens.
        cls: Class label, a key of SMScount ("ham" or "spam").

    Returns:
        The product of the per-word factors (1 when SMSV is empty).
    """
    # Build the membership set once. Testing `word in wd` against the token
    # list for every vocabulary word would rescan the message each time.
    present = set(wd)

    totalProb = 1
    for word in SMSV:
        wordlikelihood = SMSsingleWordLikelihood(word, cls)
        totalProb = totalProb * (
            wordlikelihood if word in present else 1 - wordlikelihood
        )
    return totalProb

### FUNCTION FOR SCORE

In [None]:
def SMSposScores(wd, cls):
    """Return the unnormalised log-posterior score of ``cls`` for a message.

    The score is log P(cls) plus log P(w | cls) for each distinct word of the
    message. Training counts a word once per document, so a word repeated
    inside the message is likewise scored once rather than once per
    occurrence.

    Args:
        wd: The processed message as a list of tokens.
        cls: Class label, a key of SMScount ("ham" or "spam").
    """
    score = math.log(SMScomputePrior(cls))
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # floating-point sum is accumulated in a reproducible order.
    for word in dict.fromkeys(wd):
        score += math.log(SMSsingleWordLikelihood(word, cls))
    return score

## PREDICTOR FUNCTION

In [None]:
def SMSpredict(wd):
    """Classify a processed message as "ham" or "spam".

    "ham" is returned only when its score is strictly greater than the spam
    score; a tie therefore yields "spam".

    Args:
        wd: The processed message as a list of tokens.
    """
    ham_score = SMSposScores(wd, "ham")
    spam_score = SMSposScores(wd, "spam")
    return "ham" if ham_score > spam_score else "spam"

## CONFUSION MATRIX

In [None]:
def SMSconfusionMatrix(testCases):  # as a list like [['ham',SMS],['spam',SMS]....]
    """Print the spam/ham confusion matrix and the accuracy for ``testCases``.

    Rows are the true labels and columns are the labels predicted by
    SMSpredict. When ``testCases`` is empty the (all-zero) matrix is still
    printed and the accuracy is reported as undefined instead of dividing by
    zero.

    Args:
        testCases: List of ``[label, tokens]`` pairs where label is "spam" or
            "ham" and tokens is the processed message.
    """
    labels = ["spam", "ham"]
    # Keys have the form "<actual>-<predicted>".
    d = {actual + "-" + predicted: 0 for actual in labels for predicted in labels}
    for case in testCases:
        prediction = SMSpredict(case[1])
        d[case[0] + "-" + prediction] += 1

    print("      ", "spam", "    ", "ham")
    print("---------------------------")
    print("spam", "  ", d["spam-spam"], "     ", d["spam-ham"])
    print("---------------------------")
    print("ham", "   ", d["ham-spam"], "     ", d["ham-ham"])

    print("---------------------------")
    if len(testCases) == 0:
        print("Accuracy is : undefined (no test cases)")
        return
    print(
        "Accuracy is :", ((d["spam-spam"] + d["ham-ham"]) / len(testCases)) * 100, "%"
    )


SMSconfusionMatrix(SMStestSet)

## A FEW TEST CASES 

In [None]:
# A few hand-written examples as [expected label, raw text]; each text is run
# through processData so the entries match the [label, tokens] shape that
# SMSconfusionMatrix expects.
pdfTestCases = [
    [label, processData(text)]
    for label, text in (
        ("spam", "Win free tickets now!!!"),
        ("ham", "Are you coming to the meeting?"),
        ("spam", "URGENT! You won $1000"),
        ("ham", "See you tomorrow at lunch"),
    )
]

SMSconfusionMatrix(pdfTestCases)

## TOP 10 MOST INDICATIVE WORDS FOR EACH CLASS

In [None]:
def topSMSwords():
    """Print the ten most indicative words for spam and for ham.

    Each vocabulary word is scored with
    ``SMSposScores([word], "ham") - SMSposScores([word], "spam")``: the lowest
    scores point to spam and the highest to ham. Both lists are printed with
    rank 1 as the most indicative word. When the vocabulary holds fewer than
    ten words only the available ones are printed.
    """
    # Sorting [score, word] pairs orders by score and breaks ties by word.
    ranked = sorted(
        [SMSposScores([word], "ham") - SMSposScores([word], "spam"), word]
        for word in SMSV
    )
    # Clamp so a small vocabulary neither raises IndexError nor wraps around
    # through negative indices.
    shown = min(10, len(ranked))

    print("Top 10 most Indicative words for Spam : ")
    for rank in range(shown):
        print(rank + 1, ")", ranked[rank][1])

    print()
    print("Top 10 most Indicative words for Ham : ")
    for rank in range(shown):
        # Walk from the end of the list so the highest score is printed first.
        print(rank + 1, ")", ranked[len(ranked) - 1 - rank][1])


topSMSwords()

# BBC CLASSIFIER

## INTRODUCTION 

In [None]:
import math
import os
import string
from collections import Counter

## SEPARATING INTO TRAINING AND TEST SET

In [None]:
basePath = "News Articles"

# class name -> number of documents of that class placed in the training set
cntClasses = {}

# class name -> Counter of word occurrences over that class's training documents
BBCtrainingSet = {}
# list of [class name, tokens] for every held-out document
BBCtestSet = []


def BBCseparate(
    DataInTraining,
):  # please give percentage and this will be applied to all the classes meaning from every class this much percentage of data will be taken
    """Read every class folder under basePath and split it into train and test.

    Each sub-directory of basePath is treated as one class. Within a class the
    files are taken in sorted name order; the first
    ``round(DataInTraining * n_files / 100)`` of them are merged into
    BBCtrainingSet[class] as word counts and the rest are appended to
    BBCtestSet as ``[class, tokens]``. cntClasses[class] records how many
    documents went into training.

    Directory listings are sorted because os.listdir returns entries in
    arbitrary order, which would otherwise make the split differ between
    machines and runs.

    Args:
        DataInTraining: Percentage of each class's files used for training.
    """
    for className in sorted(os.listdir(basePath)):
        classPath = os.path.join(basePath, className)
        if not os.path.isdir(classPath):
            # A stray file next to the class folders is not a class.
            continue

        fileNames = sorted(os.listdir(classPath))
        split = round((DataInTraining * len(fileNames)) / 100)

        # Record the number of documents that really end up in training, even
        # for a percentage outside 0-100.
        cntClasses[className] = max(0, min(split, len(fileNames)))

        for cnt, fileName in enumerate(fileNames, start=1):
            filePath = os.path.join(classPath, fileName)
            # NOTE(review): files are decoded with the platform default
            # encoding; errors="replace" keeps one undecodable byte from
            # aborting the whole run. Confirm the corpus encoding.
            with open(filePath, "r", errors="replace") as f:
                content = processData(f.read())

            if cnt <= split:
                BBCtrainingSet.setdefault(className, Counter()).update(content)
            else:
                BBCtestSet.append([className, content])


BBCseparate(80)
BBCtrainingSet

## DEFINING VOCABULARY

In [None]:
# Vocabulary of the BBC training set: word -> total occurrences over all classes.
BBCV = Counter()


def makeBBCvocab():
    """Add the word counts of every class in BBCtrainingSet into BBCV."""
    global BBCV
    for class_counts in BBCtrainingSet.values():
        BBCV += class_counts


makeBBCvocab()
len(BBCV)

## CALCULATING TOTAL WORDS IN EACH CLASS AND TOTAL DOCUMENTS IN TRAINING SET

In [None]:
# class name -> total number of word occurrences in that class's training documents
totalwordsinCls = {}
# total number of training documents over all classes
Totaldoc = 0


def totalWordsInClsAndTotalDoc():
    """Compute totalwordsinCls and Totaldoc from the training data.

    totalwordsinCls[cls] becomes the sum of all word counts stored in
    BBCtrainingSet[cls]; Totaldoc becomes the sum of the per-class document
    counts in cntClasses.
    """
    global Totaldoc
    for cls, word_counts in BBCtrainingSet.items():
        totalwordsinCls[cls] = sum(word_counts.values())

    # Assign rather than accumulate, so calling this function again does not
    # inflate the document total (and with it every prior).
    Totaldoc = sum(cntClasses.values())


totalWordsInClsAndTotalDoc()
totalwordsinCls

## FUNCTIONS TO COMPUTE PRIOR , LIKELIHOOD AND SCORE 


### FUNCTION FOR PRIOR

In [None]:
BBCalpha = 0.000001  # smoothing parameter used by BBCsingleWordLikelihood


def BBCpriorProbability(cls):
    """Return the prior P(cls): training documents of ``cls`` over Totaldoc.

    Args:
        cls: Class name, a key of cntClasses.
    """
    docs_in_class = cntClasses[cls]
    return docs_in_class / Totaldoc

### FUNCTION FOR LIKELIHOOD

In [None]:
def BBCsingleWordLikelihood(w, cls):
    """Return the smoothed multinomial likelihood P(w | cls).

        P(w | C) = (count of w in C + alpha) / (total words in C + |V| * alpha)

    where the counts come from BBCtrainingSet and totalwordsinCls, |V| is
    ``len(BBCV)`` and alpha is BBCalpha.

    Args:
        w: The word.
        cls: Class name, a key of BBCtrainingSet.
    """
    numerator = BBCtrainingSet[cls][w] + BBCalpha
    denominator = totalwordsinCls[cls] + len(BBCV) * BBCalpha
    return numerator / denominator


def BBClikelihood(wd, cls):
    """Return the multinomial likelihood P(wd | cls) of a document.

    The result is the product over the distinct words of the document of
    ``P(w | cls) ** count(w)``.

    Args:
        wd: The processed document as a list of tokens.
        cls: Class name, a key of BBCtrainingSet.

    Returns:
        The product of the per-word factors (1 for an empty document).
    """
    totalProb = 1
    # Iterate over distinct words only. Looping over every token while also
    # raising to the word's count would apply a word seen c times c * c times.
    for word, count in Counter(wd).items():
        totalProb = totalProb * (BBCsingleWordLikelihood(word, cls) ** count)
    return totalProb

### FUNCTION FOR SCORE

In [None]:
def BBCposScore(wd, cls):
    """Return the unnormalised log-posterior score of ``cls`` for a document.

    The score is log P(cls) plus log P(w | cls) for every token of the
    document, so a repeated word contributes once per occurrence.

    Args:
        wd: The processed document as a list of tokens.
        cls: Class name, a key of BBCtrainingSet.
    """
    total = math.log(BBCpriorProbability(cls))
    for token in wd:
        total = total + math.log(BBCsingleWordLikelihood(token, cls))
    return total

## PREDICTOR FUNCTION

In [None]:
def BBCpredictor(wd):
    """Return the class of BBCtrainingSet with the highest score for ``wd``.

    Classes are tried in the iteration order of BBCtrainingSet and a later
    class replaces the current best only with a strictly greater score. An
    empty string is returned when no class scores above negative infinity.

    Args:
        wd: The processed document as a list of tokens.
    """
    best_class = ""
    best_score = float("-inf")
    for candidate in BBCtrainingSet:
        candidate_score = BBCposScore(wd, candidate)
        if candidate_score > best_score:
            best_class, best_score = candidate, candidate_score
    return best_class

## CONFUSION MATRIX

In [None]:
def BBCconfusionMatrix(testCases):
    """Print the five-class confusion matrix and the accuracy for ``testCases``.

    Rows are the true classes and columns are the classes predicted by
    BBCpredictor. Cells are keyed by the first letter of each class name
    ("b-e" is a business document predicted as entertainment), and the printed
    table is laid out for business, entertainment, politics, sport and tech.
    When ``testCases`` is empty the accuracy is reported as undefined instead
    of dividing by zero.

    Args:
        testCases: List of ``[class name, tokens]`` pairs.
    """
    dic = {}
    for actual in BBCtrainingSet:
        for predicted in BBCtrainingSet:
            dic[actual[0] + "-" + predicted[0]] = 0
    for case in testCases:
        ans = BBCpredictor(case[1])
        dic[case[0][0] + "-" + ans[0]] += 1

    rule = "-------------------------------------------------------------------------------------"

    print("                  ","business","    ","entertainment","    ","politics","    ","sport","      ","tech")
    print(rule)

    print("business","           ",dic["b-b"],"             ",dic["b-e"],"               ",dic["b-p"],"         ",dic["b-s"],"         ",dic["b-t"])
    print(rule)

    print("entertainment","      ",dic["e-b"],"              ",dic["e-e"],"              ",dic["e-p"],"         ",dic["e-s"],"         ",dic["e-t"])
    print(rule)

    print("politics","           ",dic["p-b"],"              ",dic["p-e"],"               ",dic["p-p"],"        ",dic["p-s"],"         ",dic["p-t"])
    print(rule)

    print("sport","              ",dic["s-b"],"              ",dic["s-e"],"               ",dic["s-p"],"         ",dic["s-s"],"        ",dic["s-t"])
    print(rule)

    print("tech","               ",dic["t-b"],"              ",dic["t-e"],"               ",dic["t-p"],"         ",dic["t-s"],"         ",dic["t-t"])
    print(rule)

    if len(testCases) == 0:
        print("Accuracy is : undefined (no test cases)")
        return

    # Diagonal cells are the keys whose two letters match ("b-b", "e-e", ...).
    acc = sum(count for key, count in dic.items() if key[0] == key[2])
    print("Accuracy is :" , ( acc/len(testCases))*100,"%")
    return


BBCconfusionMatrix(BBCtestSet)

## A FEW TEST CASES

In [None]:
# One hand-written headline per class as [expected class, raw text]; each
# text is run through processData so the entries match the [class, tokens]
# shape that BBCconfusionMatrix expects.
pdfTestCases = [
    [label, processData(text)]
    for label, text in (
        ("business", "Stock market crashes as oil prices rise"),
        ("sport", "Premier League team wins the championship"),
        ("politics", "Government passes new healthcare reform"),
        ("tech", "Apple releases latest iPhone with new features"),
        ("entertainment", "Celebrity announces new film project"),
    )
]

BBCconfusionMatrix(pdfTestCases)

## TOP 10 MOST INDICATIVE WORDS FOR EACH CLASS

In [None]:
# class name -> list of its ten most indicative words, filled by topBBCwords
indicative_words = {}


def topBBCwords():
    """Compute and print the ten most indicative words of every class.

    A word's score for a class is its log-likelihood under that class minus
    the largest log-likelihood it has under any other class. For each class
    the ten highest-scoring words are stored in indicative_words, and then
    every entry of indicative_words is printed as a numbered list.
    """

    def log_likelihood(word, cls):
        return math.log(BBCsingleWordLikelihood(word, cls))

    for cls in BBCtrainingSet:
        scored = []
        for word in BBCV:
            own = log_likelihood(word, cls)
            strongest_rival = max(
                log_likelihood(word, rival)
                for rival in BBCtrainingSet
                if rival != cls
            )
            scored.append((own - strongest_rival, word))
        # Descending by score; equal scores fall back to descending word order.
        ranked = sorted(scored, reverse=True)
        indicative_words[cls] = [word for _, word in ranked[:10]]

    for cls, words in indicative_words.items():
        print(f"\nTop words for {cls}:")
        for rank, word in enumerate(words, start=1):
            print(rank, ")", word)
    return


topBBCwords()