In [12]:
import os

from email.parser import Parser

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anthony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anthony\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def getListOfEmails():
    dirName = 'maildir2'

    listOfFiles = []

    for (dirpath, dirnames, filenames) in os.walk(dirName):
        listOfFiles += [os.path.join(dirpath, file) for file in filenames]

    return listOfFiles

In [14]:
def parseEmail(filename):
    content = processEmail(filename)
    data = tokenizeEmail(content)
    data = casefoldEmail(data)
    data = stopwordRemoval(data)
    data = stemEmail(data)

    return data

def processEmail(filename):
   
    sender = ''
    reciever = ''
    subject = ''
    content = ''

    with open(filename, 'r') as f:
        data = f.read()
        email = Parser().parsestr(data)
        content =  email.get_payload()

    return content 

def tokenizeEmail(content):

    tokens = word_tokenize(content)

    return tokens

def casefoldEmail(tokens):

    caseFold = []

    for token in tokens:
        caseFold.append(token.casefold())

    return caseFold

def stopwordRemoval(caseFold):

    stop_words = set(stopwords.words('english'))

    stopWordsRemoved = []

    for word in caseFold: 
        if word not in stop_words:
            stopWordsRemoved.append(word)

    return stopWordsRemoved
    
def stemEmail(stopWordsRemoved):

    porter = PorterStemmer()

    stemmed = []

    for term in stopWordsRemoved:
        stemmed.append(porter.stem(term))   
    
    return stemmed



In [15]:
def getUniqueTokens(listOfFiles):
    
    uniqueTokens= []

    for file in listOfFiles:
        tokens = parseEmail(file)

        for token in tokens:
            if token not in uniqueTokens:
                uniqueTokens.append(token)
        
    return uniqueTokens
        

In [16]:
emails = []

# Unique Terms
dictionary = []

listOfFiles = getListOfEmails()

for file in listOfFiles:
    tcss = parseEmail(file)
    emails.append(tcss)
    
    for term in tcss:
        if term not in dictionary:
            dictionary.append(term)

glossary = {}

termNum = 0

for word in dictionary:
    glossary[word] = termNum 
    termNum += 1
       
#wordBank = getUniqueTokens(listOfFiles)

#print(glossary)

In [17]:
def getDF():
    docFreq = {}

    for term in dictionary:
        docFreq[term] = 0

        for email in emails:
            if term in email:
                docFreq[term] += 1
    
    return docFreq

DF = getDF()

def getTF(doc, term):

    numOfWords = len(doc)

    occurance = len([token for token in doc if token == term])

    TF = occurance/numOfWords

    return TF

def getIDF(term):

    try: 
        occurance = DF[term] + 1
    except:
        occurance = 1
    
    IDF = np.log2(len(emails)/float(occurance))

    #print("Term: ", term, " Doc: ", DF[term], " IDF: ", IDF)
    
    return IDF

In [26]:
def computeTFIDF(doc):

    TFIDF = np.zeros((len(dictionary),))

    #print(TFIDF)

    for term in doc:
        vecTF = getTF(doc, term)
        vecIDF = getIDF(term)
        TFIDF[glossary[term]] = vecTF * vecIDF

    return TFIDF

# Storing the DOC TFIDF 

emailTFIDF = []

for email in emails:
    vector = computeTFIDF(email)
    emailTFIDF.append(vector)
    
print(emailTFIDF)

[array([0.65211209, 1.3819013 , 1.3819013 , ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 