Work through the feature extraction tutorial before this one.

Feature extraction generates a CSV file that contains, for each term:
1. Term
2. Number of legitimate emails that word occurred in
3. Number of spam emails that word occurred in
4. Mutual information (MI) score

In [10]:
#imports
import pandas as pd
import os

After computing the mutual information (MI) score of each term, we can extract the n terms with the highest MI to use as the features for the classifier. The paper used 50 to 700 features, in steps of 50.

In [2]:
# Feature-set sizes to export: 50 to 700 in steps of 50.
numMI = list(range(50, 701, 50))
# Corpus variants; each one has its own Features/<corpus>/<corpus>termMI.csv file.
corpus = ['bare', 'lemm', 'lemm_stop', 'stop']

for corp in corpus:
    # The term table only depends on the corpus, so it is read once per corpus
    # rather than once per feature count.
    termMIList = pd.read_csv("Features/"+corp+"/"+corp+"termMI.csv", index_col = 0)
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs("MI/"+corp, exist_ok=True)
    for num in numMI:
        # NOTE(review): head() only yields the highest-MI terms if termMI.csv is
        # already sorted by MI in descending order - confirm against the
        # feature extraction notebook.
        termMIList.head(n=num).to_csv("MI/"+corp+"/"+str(num)+"terms.csv")
print ("Done, check MI folder")

Done, check MI folder


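To classify a message, we compute the naive Bayes posterior probability of each class $c \in \{\text{spam}, \text{legit}\}$ given the feature terms $x_1, \dots, x_n$ found in the message:

$$P(C=c \mid x_1, \dots, x_n) = \frac{P(C=c)\,\prod_{i=1}^{n} P(X_i=x_i \mid C=c)}{\sum_{k \in \{\text{spam},\,\text{legit}\}} P(C=k)\,\prod_{i=1}^{n} P(X_i=x_i \mid C=k)}$$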

In [66]:
import math
from decimal import *

# Use 256 significant digits for Decimal arithmetic in the current context, so
# long products of small probabilities keep their precision.
getcontext().prec = 256
# Round away from zero whenever a Decimal result has to be rounded.
getcontext().rounding = ROUND_UP

# A = P(X=x | C=c), the probability of term x given class c.
# By the definition of conditional probability, P(A|B) = P(AB) / P(B), where
# P(A) = (total number of times x occurred) / (total number of term occurrences in the corpus)
# P(B) = (total number of emails of class c in the corpus) / (total number of documents in the corpus)
# termProb = total number of times x occurred in the document


def computeTermGivenClass(probClass, terms, totalTerms):
    """Return the product of the per-term conditional estimates for one class.

    Each count in `terms` contributes the factor
    (count / totalTerms) / probClass to the running product.

    Parameters:
        probClass: prior probability of the class, used as the divisor.
        terms: iterable of per-term counts for that class.
        totalTerms: total term count that each count is divided by.

    Returns:
        A Decimal holding the product; Decimal(1) when `terms` is empty.
    """
    product = Decimal(1.0)
    for count in terms:
        # Share of this term among all term occurrences ...
        ratio = Decimal(count) / Decimal(totalTerms)
        # ... conditioned on the class by dividing by the class prior.
        product *= ratio / Decimal(probClass)
    return product

#B = P(X=x) = (Total number of times x occurred in the corpus)/(Total number of word occurrence in the corpus)
#C = P(C=c) = (Total number of documents that are c in the corpus)/(Total number of documents in the corpus)
          
def computeProbability(isComputingSpam, totalDoc, spamTerms, legitTerms, totalTerms, totalLegitCount, totalSpamCount):
    """Return the normalised posterior of the requested class.

    The unnormalised score of each class is its prior multiplied by
    computeTermGivenClass(...) for that class; the result is the requested
    class' score divided by the sum of both scores.

    Parameters:
        isComputingSpam: truthy to return the spam posterior, falsy for legit.
        totalDoc: total number of documents in the corpus.
        spamTerms: per-term spam counts fed to computeTermGivenClass.
        legitTerms: per-term legit counts fed to computeTermGivenClass.
        totalTerms: total term count fed to computeTermGivenClass.
        totalLegitCount: number of legitimate documents in the corpus.
        totalSpamCount: number of spam documents in the corpus.

    Returns:
        A Decimal posterior, or Decimal(0) when the normalisation cannot be
        evaluated (for example both class scores are zero).
    """
    # Class priors: share of spam / legitimate documents in the corpus.
    probSpam = (totalSpamCount/totalDoc)
    probLegit = (totalLegitCount/totalDoc)

    givenSpam = Decimal(probSpam) * computeTermGivenClass(probSpam, spamTerms, totalTerms)
    givenLegit = Decimal(probLegit) * computeTermGivenClass(probLegit, legitTerms, totalTerms)

    numerator = givenSpam if isComputingSpam else givenLegit
    denominator = givenSpam + givenLegit
    try:
        return numerator / denominator
    except ArithmeticError:
        # Decimal signals (DivisionByZero, InvalidOperation, ...) derive from
        # ArithmeticError. Only those are mapped to the 0 fallback; anything
        # else (e.g. KeyboardInterrupt) now propagates instead of being hidden
        # by a bare except.
        return Decimal(0)

In [4]:
import re
def extractWords(filepath):
    """Return the distinct lowercase words of at least four letters in a file.

    Parameters:
        filepath: path of the text file to read.

    Returns:
        A sorted list of unique words made only of the letters a-z, each at
        least four characters long.
    """
    # `with` closes the handle even if reading raises.
    with open(filepath, 'r') as file:
        # .lower() returns a version with all upper case characters replaced with lower case characters.
        text = file.read().lower()
    # Every run of characters other than a-z (digits, punctuation, apostrophes,
    # whitespace) is replaced with a single space.
    text = re.sub('[^a-z]+', " ", text)
    # Drop duplicates and words shorter than 4 characters; sorting makes the
    # result independent of set iteration order, so re-runs are reproducible.
    return sorted({word for word in text.split() if len(word) >= 4})

def findTermCounts(terms, docTerms):
    """Return the rows of `terms` whose 'Term' value appears in `docTerms`.

    Parameters:
        terms: DataFrame with a 'Term' column.
        docTerms: collection of words to look up.

    Returns:
        The matching rows of `terms`, with their original index labels.
    """
    isInDocument = terms['Term'].isin(docTerms)
    return terms.loc[isInDocument]

In [69]:
def classifyDocs(corp, terms, spamCount, legitCount, totalTerms,
             fileList, threshold, numFeatures):
    """Predict a class label for every file in `fileList`.

    For each file, the feature terms that occur in it are looked up in
    `terms`, and the spam posterior is computed from the counts of only those
    terms. Previously the matched rows were computed and then ignored, so
    every document was scored from the complete feature list and received the
    same prediction.

    The spam posterior is compared directly with `threshold`. The thresholds
    0.5 / 0.9 / 0.999 correspond to spam-to-legit ratios of 1 / 9 / 999, so
    comparing the ratio itself against the probability threshold was
    inconsistent. Comparing the posterior also needs no division, so a zero
    legit posterior no longer collapses into a ratio of 0.

    Parameters:
        corp: object exposing totalEmailCtr, legitEmailCtr and spamEmailCtr.
        terms: DataFrame of feature terms with 'Term', 'SpamCount' and
            'LegitCount' columns.
        spamCount: unused; kept so existing callers keep working.
        legitCount: unused; kept so existing callers keep working.
        totalTerms: total term count passed on to computeProbability.
        fileList: paths of the documents to classify.
        threshold: spam posterior above which a document gets label 0.
        numFeatures: unused; kept so existing callers keep working.

    Returns:
        A list with one label per file: 0 when the spam posterior exceeds
        `threshold`, otherwise 1.
    """
    # NOTE(review): label 0 is emitted for the spam side here, while
    # combineFiles assigns 1 to files matching `pattern`; confirm that both
    # use the same encoding before comparing Actual and Predicted.
    totalDoc = corp.totalEmailCtr
    totalLegitCount = corp.legitEmailCtr
    totalSpamCount = corp.spamEmailCtr

    predicted = []
    for filepath in fileList:
        docTerms = extractWords(filepath)
        rowTerms = findTermCounts(terms, docTerms)
        # Counts of only the feature terms that appear in this document.
        docSpamCount = rowTerms['SpamCount'].tolist()
        docLegitCount = rowTerms['LegitCount'].tolist()

        PCSpam = Decimal(computeProbability(True, totalDoc, docSpamCount, docLegitCount,
                                            totalTerms, totalLegitCount, totalSpamCount))

        if PCSpam > threshold:
            predicted.append(0)
        else:
            predicted.append(1)

    return predicted

In [8]:
# Create a class for the corpus data: it stores the total number of emails in the corpus,
# along with the total number of spam and legit emails.
class CorpusData:
    """Name and document counts of one corpus variant."""

    # Class-level defaults; __init__ overrides them on every instance.
    corpusName = ""
    totalEmailCtr = spamEmailCtr = legitEmailCtr = 0

    def __init__(self, corpusName, totalEmailCtr, spamEmailCtr, legitEmailCtr):
        """Store the corpus name and its total, spam and legit email counts."""
        self.corpusName, self.totalEmailCtr = corpusName, totalEmailCtr
        self.spamEmailCtr, self.legitEmailCtr = spamEmailCtr, legitEmailCtr

In [19]:
def combineFiles(corp):
    """Collect every file path under Emails/<corp> together with its label.

    Parameters:
        corp: name of the corpus folder below "Emails/".

    Returns:
        A (fileList, actualClass) tuple of parallel lists; a file's label is 1
        when `pattern` matches its file name and 0 otherwise.
    """
    # NOTE(review): `pattern` is a global that is not defined in this part of
    # the notebook; it must already exist in the kernel before this runs.
    rootdir = "Emails/"+corp
    paths, labels = [], []
    # Walk every sub-folder of the corpus (folders part 1 - 10).
    for folder, _subfolders, names in os.walk(rootdir):
        for name in names:
            paths.append(folder + "/" + name)
            labels.append(1 if pattern.match(name) else 0)
    return paths, labels

In [50]:
def writeResults(actual, predicted, corp, featureNum, thresh):
    """Save the actual and predicted labels of one run to a CSV file.

    The file is written to
    Classified/<corpusName>/<featureNum>/<thresh>_results.csv; the folder is
    created when it does not exist yet.

    Parameters:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, same length as `actual`.
        corp: object exposing `corpusName`.
        featureNum: number of features used, part of the output path.
        thresh: threshold used, part of the output file name.
    """
    results = pd.DataFrame(
        {'Actual': actual,
         'Predicted' : predicted,
        })

    outDir = "Classified/"+corp.corpusName +"/"+str(featureNum)
    # to_csv does not create missing folders; without this the very first run
    # for a corpus / feature count fails.
    os.makedirs(outDir, exist_ok=True)

    # Save the classification results to CSV (so we can access them later).
    results.to_csv(outDir+"/"+str(thresh)+"_results.csv")
    print("File Saved: ", corp.corpusName , featureNum, thresh)

In [29]:
# Per-corpus document counts: total emails, spam emails, legit emails.
bare = CorpusData(corpusName="bare", totalEmailCtr=2515, spamEmailCtr=304, legitEmailCtr=2211)
lemm = CorpusData(corpusName="lemm", totalEmailCtr=2776, spamEmailCtr=452, legitEmailCtr=2324)
# NOTE(review): 281 + 2409 = 2690, which does not equal the stated total of
# 2609; the other three corpora add up. Confirm which figure is wrong.
lemm_stop = CorpusData(corpusName="lemm_stop", totalEmailCtr=2609, spamEmailCtr=281, legitEmailCtr=2409)
stop = CorpusData(corpusName="stop", totalEmailCtr=2341, spamEmailCtr=481, legitEmailCtr=1860)


# Feature-set sizes to evaluate: 50 to 700 in steps of 50.
numFeatures = list(range(50, 701, 50))
# Thresholds and the ratios they correspond to: 0.5 - 1, 0.9 - 9, 0.999 - 999
threshold = [0.5, 0.9, 0.999]


In [49]:
def classify(corp):
    """Classify every email of one corpus for each feature count and threshold.

    For each entry of the global `numFeatures` the matching MI/<corpus>/<n>terms.csv
    file is loaded, and for each entry of the global `threshold` the emails are
    classified and the result is saved with writeResults.

    Parameters:
        corp: object exposing corpusName plus the counters read by classifyDocs.
    """
    print ("Classifying", corp.corpusName)
    #load the vocabulary/word/term list for the entire corpus from file
    corpusTerms = pd.read_csv("Features/"+corp.corpusName+"/"+corp.corpusName+"termMI.csv", index_col = 0)
    totalLegitTerms = corpusTerms['LegitCount'].sum(axis=0)
    totalSpamTerms = corpusTerms['SpamCount'].sum(axis=0)
    totalTerms = totalLegitTerms + totalSpamTerms

    # Read the emails of the corpus being classified; this used to be
    # hard-coded to "bare", so every corpus was evaluated on the bare emails.
    filepathList, actualClass = combineFiles(corp.corpusName)
    for num in numFeatures:
        terms = pd.read_csv("MI/"+corp.corpusName+"/"+str(num)+"terms.csv", index_col = 0)
        print ("Features: ", num)

        spamCount = terms['SpamCount'].tolist()
        legitCount = terms['LegitCount'].tolist()

        for t in threshold:
            print ("Threshold: ", t)
            predClass = classifyDocs(corp, terms, spamCount, legitCount, totalTerms,
                                 filepathList, t, num)
            writeResults(actualClass,predClass, corp, num, t)

In [51]:
# Run the full feature-count x threshold sweep for the "bare" corpus.
classify(bare)

Classifying bare
Features:  50
Threshold:  0.5
File Saved:  bare 50 0.5
Threshold:  0.9
File Saved:  bare 50 0.9
Threshold:  0.999
File Saved:  bare 50 0.999
Features:  100
Threshold:  0.5
File Saved:  bare 100 0.5
Threshold:  0.9
File Saved:  bare 100 0.9
Threshold:  0.999
File Saved:  bare 100 0.999
Features:  150
Threshold:  0.5
File Saved:  bare 150 0.5
Threshold:  0.9
File Saved:  bare 150 0.9
Threshold:  0.999
File Saved:  bare 150 0.999
Features:  200
Threshold:  0.5
File Saved:  bare 200 0.5
Threshold:  0.9
File Saved:  bare 200 0.9
Threshold:  0.999
File Saved:  bare 200 0.999
Features:  250
Threshold:  0.5
File Saved:  bare 250 0.5
Threshold:  0.9
File Saved:  bare 250 0.9
Threshold:  0.999
File Saved:  bare 250 0.999
Features:  300
Threshold:  0.5
File Saved:  bare 300 0.5
Threshold:  0.9
File Saved:  bare 300 0.9
Threshold:  0.999
File Saved:  bare 300 0.999
Features:  350
Threshold:  0.5
File Saved:  bare 350 0.5
Threshold:  0.9
File Saved:  bare 350 0.9
Threshold:  0.999


In [70]:
# One-off run for the "bare" corpus with a single setting (400 features,
# threshold 0.5); it repeats the steps of classify() at top level.
print ("Classifying", bare.corpusName)
#load the vocabulary/word/term list for the entire corpus from file
corpusTerms = pd.read_csv("Features/"+bare.corpusName+"/"+bare.corpusName+"termMI.csv", index_col = 0)
# Total term count = sum of the legit and spam count columns.
totalLegitTerms = corpusTerms['LegitCount'].sum(axis=0)
totalSpamTerms = corpusTerms['SpamCount'].sum(axis=0)
totalTerms = totalLegitTerms + totalSpamTerms

# File paths and actual labels of the bare emails.
filepathList, actualClass = combineFiles("bare")
num = 400 
# Feature list for this feature count, as written by the MI export cell.
terms = pd.read_csv("MI/"+bare.corpusName+"/"+str(num)+"terms.csv", index_col = 0)
print ("Features: ", num) 

spamCount = terms['SpamCount'].tolist()
legitCount = terms['LegitCount'].tolist()

t = 0.5
print ("Threshold: ", t)
predClass = classifyDocs(bare, terms, spamCount, legitCount, totalTerms,
                     filepathList, t, num)
# Save the labels under Classified/bare/400/0.5_results.csv.
writeResults(actualClass,predClass, bare, num, t)

Classifying bare
Features:  400
Threshold:  0.5
File Saved:  bare 400 0.5


In [71]:
# Run the full feature-count x threshold sweep for the "lemm" corpus.
classify(lemm)

Classifying lemm
Features:  50
Threshold:  0.5
File Saved:  lemm 50 0.5
Threshold:  0.9
File Saved:  lemm 50 0.9
Threshold:  0.999
File Saved:  lemm 50 0.999
Features:  100
Threshold:  0.5
File Saved:  lemm 100 0.5
Threshold:  0.9
File Saved:  lemm 100 0.9
Threshold:  0.999
File Saved:  lemm 100 0.999
Features:  150
Threshold:  0.5
File Saved:  lemm 150 0.5
Threshold:  0.9
File Saved:  lemm 150 0.9
Threshold:  0.999
File Saved:  lemm 150 0.999
Features:  200
Threshold:  0.5
File Saved:  lemm 200 0.5
Threshold:  0.9
File Saved:  lemm 200 0.9
Threshold:  0.999
File Saved:  lemm 200 0.999
Features:  250
Threshold:  0.5
File Saved:  lemm 250 0.5
Threshold:  0.9
File Saved:  lemm 250 0.9
Threshold:  0.999
File Saved:  lemm 250 0.999
Features:  300
Threshold:  0.5
File Saved:  lemm 300 0.5
Threshold:  0.9
File Saved:  lemm 300 0.9
Threshold:  0.999
File Saved:  lemm 300 0.999
Features:  350
Threshold:  0.5
File Saved:  lemm 350 0.5
Threshold:  0.9
File Saved:  lemm 350 0.9
Threshold:  0.999


In [72]:
# Run the full feature-count x threshold sweep for the "stop" corpus.
classify(stop)

Classifying stop
Features:  50
Threshold:  0.5
File Saved:  stop 50 0.5
Threshold:  0.9
File Saved:  stop 50 0.9
Threshold:  0.999
File Saved:  stop 50 0.999
Features:  100
Threshold:  0.5
File Saved:  stop 100 0.5
Threshold:  0.9
File Saved:  stop 100 0.9
Threshold:  0.999
File Saved:  stop 100 0.999
Features:  150
Threshold:  0.5
File Saved:  stop 150 0.5
Threshold:  0.9
File Saved:  stop 150 0.9
Threshold:  0.999
File Saved:  stop 150 0.999
Features:  200
Threshold:  0.5
File Saved:  stop 200 0.5
Threshold:  0.9
File Saved:  stop 200 0.9
Threshold:  0.999
File Saved:  stop 200 0.999
Features:  250
Threshold:  0.5
File Saved:  stop 250 0.5
Threshold:  0.9
File Saved:  stop 250 0.9
Threshold:  0.999
File Saved:  stop 250 0.999
Features:  300
Threshold:  0.5
File Saved:  stop 300 0.5
Threshold:  0.9
File Saved:  stop 300 0.9
Threshold:  0.999
File Saved:  stop 300 0.999
Features:  350
Threshold:  0.5
File Saved:  stop 350 0.5
Threshold:  0.9
File Saved:  stop 350 0.9
Threshold:  0.999


In [1]:
# Run the full feature-count x threshold sweep for the "lemm_stop" corpus.
# NOTE(review): the saved output of this cell reports "stop" in its
# "File Saved" lines, which does not match this call; it looks stale, so
# re-run the cell to confirm.
classify(lemm_stop)

Classifying lemm_stop
Features:  50
Threshold:  0.5
File Saved:  stop 50 0.5
Threshold:  0.9
File Saved:  stop 50 0.9
Threshold:  0.999
File Saved:  stop 50 0.999
Features:  100
Threshold:  0.5
File Saved:  stop 100 0.5
Threshold:  0.9
File Saved:  stop 100 0.9
Threshold:  0.999
File Saved:  stop 100 0.999
Features:  150
Threshold:  0.5
File Saved:  stop 150 0.5
Threshold:  0.9
File Saved:  stop 150 0.9
Threshold:  0.999
File Saved:  stop 150 0.999
Features:  200
Threshold:  0.5
File Saved:  stop 200 0.5
Threshold:  0.9
File Saved:  stop 200 0.9
Threshold:  0.999
File Saved:  stop 200 0.999
Features:  250
Threshold:  0.5
File Saved:  stop 250 0.5
Threshold:  0.9
File Saved:  stop 250 0.9
Threshold:  0.999
File Saved:  stop 250 0.999
Features:  300
Threshold:  0.5
File Saved:  stop 300 0.5
Threshold:  0.9
File Saved:  stop 300 0.9
Threshold:  0.999
File Saved:  stop 300 0.999
Features:  350
Threshold:  0.5
File Saved:  stop 350 0.5
Threshold:  0.9
File Saved:  stop 350 0.9
Threshold:  0