Check feature extraction tutorial first before this. 

After feature extraction, it will generate a csv file that has:
1. Term
2. Number of legitimate emails that word occurred in
3. Number of spam emails that word occurred in
4. Mutual Information Count

In [2]:
#imports
import pandas as pd

After computing the mutual information score of each term, we can now extract the top n highest terms based on the MI to be used as the features for the classifier. The paper used 50-700 features, step 50. 

In [8]:
numMI = [50,100,150,200,250,300,350,400,450,500,550,600,650,700]
corpus = ['bare','lemm', 'lemm_stop', 'stop']

for corp in corpus:
    for num in numMI:
        termMIList = pd.read_csv("Features/"+corp+"/"+corp+"termMI.csv", index_col = 0)
        terms = pd.DataFrame(termMIList.head(n=num)).to_csv("MI/"+corp+"/"+str(num)+"terms.csv")
print ("Done, check MI folder")

Done, check MI folder


in order to classify, insert formula hereeee

In [26]:
import math
from decimal import *

getcontext().prec = 256


# A = P(X=x|C=c)
# P(A|B), is equal to P(AB)/P(B).
# P(A) = (Total number of times x occurred/total number of term occurrence in the corpus)
# P(B) = (Total number of email c in the corpus /total number of documents in the corpus)
#termProb = total number of times x occurred  in document


def computeTermGivenClass(probClass, terms, totalTerms):
    prob = Decimal(1.0)
    for x in terms:
        a = (Decimal(x)/Decimal(totalTerms))
        prob = (Decimal(prob) * (Decimal(a)/Decimal(probClass)))
    
    return prob

#B = P(X=x) = (Total number of times x occurred in the corpus)/(Total number of word occurrence in the corpus)
#C = P(C=c) = (Total number of documents that are c in the corpus)/(Total number of documents in the corpus)
          
def computeProbability(isComputingSpam, totalDoc, spamTerms, legitTerms, totalTerms, totalLegitCount, totalSpamCount):
      
    probSpam = (totalSpamCount/totalDoc)
    probLegit = (totalLegitCount/totalDoc)
    
    givenSpam =  (Decimal(probSpam) * computeTermGivenClass(probSpam, spamTerms, totalTerms))
    givenLegit = (Decimal(probLegit) * computeTermGivenClass(probLegit, legitTerms, totalTerms))
    
    if isComputingSpam == True:
        numerator =  givenSpam
    else:
        numerator =  givenLegit
            
    denominator = Decimal(givenSpam) + Decimal(givenLegit)
    
    probClass = Decimal(numerator/denominator)
    return probClass

In [33]:
import re
def extractWords(filepath):
    file = open(filepath, 'r')
    # .lower() returns a version with all upper case characters replaced with lower case characters.
    text = file.read().lower()
    file.close()
    # replaces anything that is not a lowercase letter, a space, or an apostrophe with a space:
    text = re.sub('[^a-z]+', " ", text)
    words = list(text.split())
    
     # remove duplicate words in the list
    words = list(set(words))
    # removes words that are less than 4 letters/characters
    words =  [i for i in words if len(i) >= 4] 
    return words;

def findTermCounts(terms, docTerms):
    return terms.loc[terms['Term'].isin(docTerms)]

In [53]:
import re

numFeatures = [50,100,150,200,250,300,350,400,450,500,550,600,650,700]
# 0.5 - 1, 0.9 - 9, 0.999 - 999
threshold = [0.5, 0.9, 0.999]

pattern = re.compile("\d+-\d+msg\d+.txt")

def classify(corp):
    #load the vocabulary/word/term list for the entire corpus from file
    corpusTerms = pd.read_csv("Features/"+corp.corpusName+"/baretermMI.csv", index_col = 0)
    totalLegitTerms = corpusTerms['LegitCount'].sum(axis=0)
    totalSpamTerms = corpusTerms['SpamCount'].sum(axis=0)

    print ("Classifying Documents in Corpus: ", corp.corpusName)
    
    totalDoc = corp.totalEmailCtr
    totalLegitCount = corp.legitEmailCtr
    totalSpamCount = corp.spamEmailCtr
    
    for num in numFeatures:
        terms = pd.read_csv("MI/"+corp.corpusName+"/"+str(num)+"terms.csv", index_col = 0)
        print ("Using", len(terms), " of Features") 
        spamCount = terms['SpamCount'].tolist()
        legitCount = terms['LegitCount'].tolist()
        totalTerms = totalLegitCount + totalSpamCount
        
        for t in threshold:
            actual = []
            predicted = []
            #for each subdirectory in a corpus (folders - part 1 - 10)
            for subdir, dirs, files in os.walk(rootdir):
            #for each file in a folder
                for file in files:  
                    filepath = subdir +"/"+file       
                    
                    docTerms = extractWords(filepath)
                    rowTerms = findTermCounts(terms, docTerms)
                    PCSpam = Decimal(computeProbability(True, totalDoc, spamCount, legitCount, 
                                                        totalTerms, totalLegitCount, totalSpamCount))
                    PCLegit = Decimal(computeProbability(False, totalDoc, spamCount, legitCount, 
                                                        totalTerms, totalLegitCount, totalSpamCount))
                    
                    if PCSpam/PCLegit > t:
                        p = 0
                    else:
                        p = 1
                        
                    if pattern.match(file): 
                        a = 1
                    else:
                        a = 0
                    
                    actual.append(a)
                    predicted.append(p)
                    
        results = pandas.DataFrame(
            {'Actual': actual,
             'Predicted' : predicted,
            })

        #save the Term MI to CSV (so we can access it later)
        results.to_csv("Classified/"+corp+"/"+str(num)+"/"+str(t)+"_results.csv")
        print("File Saved: ", corp, num, t)

TypeError: Can't convert 'CorpusData' object to str implicitly

In [52]:
# create a class for the the Corpus data, it will store the total number of emails in the corpus, 
# along with the total number of spam and legit emails
class CorpusData: 
    corpusName = ""
    totalEmailCtr = 0
    spamEmailCtr = 0
    legitEmailCtr = 0

    def __init__(self, corpusName, totalEmailCtr, spamEmailCtr, legitEmailCtr):
        self.corpusName = corpusName
        self.totalEmailCtr = totalEmailCtr
        self.spamEmailCtr = spamEmailCtr
        self.legitEmailCtr = legitEmailCtr
        
# total emails, total spam emails, total legit emails
bare = CorpusData("bare", 2515, 304, 2211)
lemm = CorpusData("lemm", 2776, 452, 2324)
lemm_stop = CorpusData("lemm_stop", 2609, 281, 2409)
stop = CorpusData("stop", 2341, 481, 1860)

corpusDataList = []

corpusDataList.append(bare)
corpusDataList.append(lemm)
corpusDataList.append(lemm_stop)
corpusDataList.append(stop)

for corp in corpusDataList:
    classify(corp)
    

Classifying Documents in Corpus:  bare


TypeError: Can't convert 'CorpusData' object to str implicitly

In [41]:
import os
rootdir = "Emails/"

#for each subdirectory in a corpus (folders - part 1 - 10)
for subdir, dirs, files in os.walk(rootdir):
#for each file in a folder
    for file in files:  
        filepath = subdir +"/"+file    
        print (filepath)

Emails/stop/part4/6-812msg1.txt
Emails/stop/part4/6-327msg3.txt
Emails/stop/part4/8-1023msg1.txt
Emails/stop/part4/6-468msg3.txt
Emails/stop/part4/6-256msg1.txt
Emails/stop/part4/8-1043msg2.txt
Emails/stop/part4/6-861msg1.txt
Emails/stop/part4/8-1063msg2.txt
Emails/stop/part4/6-42msg2.txt
Emails/stop/part4/6-400msg2.txt
Emails/stop/part4/6-318msg1.txt
Emails/stop/part4/6-245msg1.txt
Emails/stop/part4/6-348msg1.txt
Emails/stop/part4/6-377msg1.txt
Emails/stop/part4/6-273msg1.txt
Emails/stop/part4/6-309msg1.txt
Emails/stop/part4/6-408msg1.txt
Emails/stop/part4/6-437msg2.txt
Emails/stop/part4/6-887msg1.txt
Emails/stop/part4/6-863msg1.txt
Emails/stop/part4/6-843msg1.txt
Emails/stop/part4/6-853msg1.txt
Emails/stop/part4/6-359msg3.txt
Emails/stop/part4/6-437msg3.txt
Emails/stop/part4/6-327msg4.txt
Emails/stop/part4/6-243msg1.txt
Emails/stop/part4/6-396msg1.txt
Emails/stop/part4/6-417msg1.txt
Emails/stop/part4/6-266msg1.txt
Emails/stop/part4/8-1066msg2.txt
Emails/stop/part4/6-456msg1.txt
Email