In [3]:
pip install pandas

Collecting pandas
  Using cached pandas-1.1.2-cp37-cp37m-win32.whl (8.1 MB)
Collecting pytz>=2017.2
  Using cached pytz-2020.1-py2.py3-none-any.whl (510 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.2 pytz-2020.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install nltk

Collecting nltk
  Using cached nltk-3.5.zip (1.4 MB)
Collecting click
  Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting regex
  Using cached regex-2020.7.14-cp37-cp37m-win32.whl (252 kB)
Collecting tqdm
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
Using legacy 'setup.py install' for nltk, since package 'wheel' is not installed.
Installing collected packages: click, regex, tqdm, nltk
    Running setup.py install for nltk: started
    Running setup.py install for nltk: finished with status 'done'
Successfully installed click-7.1.2 nltk-3.5 regex-2020.7.14 tqdm-4.49.0
Note: you may need to restart the kernel to use updated packages.


In [43]:
import math
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import argparse
from collections import Counter
from nltk.stem import PorterStemmer

In [53]:
def get_terms(email):
    """1) Lowercase the email
    2) Tokenize the email into words
    3) Remove stop words and punctuation
    """
    ps=PorterStemmer()
    tokens = word_tokenize(email.lower())
    words_to_remove = stopwords.words("english")
    words_to_remove.extend(list(punctuation))
    return [ps.stem(t) for t in set(tokens) if not t in words_to_remove]


In [45]:
def get_term_doc(emails):
    """Turn tokenized emails into a term-document dictionary"""
    term_doc = {}
    for i, email in enumerate(emails):
        for term in email:
            if term in term_doc:
                term_doc[term].add(i)
            else:
                term_doc[term] = {i}
    return term_doc

In [46]:
def compute_idfs(term_doc, corpus_size):
    """
    Calculate idfs for all the terms
    idf = log2 (corpus size / doc freq)
    """
    idfs = {}
    for term in term_doc:
        doc_freq = len(term_doc[term])
        idfs[term] = math.log2(corpus_size / doc_freq)
    return idfs

In [47]:
def get_tfidfs(input_email):
    """Get the list of terms and corresponding
    tf-idf scores"""
    with open("idf_results.p", "rb") as idf_pickle:
        idfs = pickle.load(idf_pickle)
    tokens = word_tokenize(input_email.lower())
    tfs = Counter(tokens)
    tfidfs = {}
    for term in tfs:
        if term in idfs:
            tfidfs[term] = tfs[term] * idfs[term]
    counter = Counter(tfidfs).most_common()
    return dict(counter)

In [48]:
def print_result(filename):
    with open(filename) as file:
        input_email = file.read()
    tfidfs = get_tfidfs(input_email)
    print(tfidfs)

In [55]:
if __name__ == "__main__":
    # reading in the E-mails from the Hillary Clinton email corpus
    emails = (pd.read_csv("Emails.csv")
              ["ExtractedBodyText"]
              .dropna()
              
              .tolist())
    tokenized_emails = [get_terms(email) for email in emails]
#     print(tokenized_emails[:20])
    term_doc = get_term_doc(tokenized_emails)
    idfs = compute_idfs(term_doc, len(emails))

    # we will use this pickle in the get_tfidfs.py script
    with open("idf_results.p", "wb") as idf_file:
        pickle.dump(idfs, idf_file)
    print_result("aws_mail.txt")

{'amazon': 42.87584364793684, 'we': 35.15688273595263, 'web': 33.3065739568218, 'survey': 33.038117173387654, 'on': 25.43792182396842, 'will': 21.58619686778159, 'be': 20.794065634193696, 'of': 12.71896091198421, 'an': 12.71896091198421, 'not': 12.71896091198421, 'share': 11.721959833713276, 'in': 11.133998411263054, "'re": 10.422332543571027, 'Â»': 10.397032817096848, 'do': 8.71896091198421, 'take': 8.56866536869497, 'feedback': 7.860979916856638, 'enter': 7.509507546355261, 'dear': 7.133998411263054, 'time': 6.918435296586858, 'other': 5.480556172659131, 'better': 5.424340163092583, 'team': 5.203261073700168, "'ll": 4.702152624297656, 'much': 4.669112362533649, 'thank': 4.203261073700168, 'need': 3.8241431486762667}
