In [None]:
"""
  Part 1: Create a dataset from the TREC 2007 spam corpus (trec07p): http://plg.uwaterloo.ca/~gvcormac/treccorpus07/.
  Clean the text of each email and save the result for later use.
  
  author: MP
  date: 4-22-2021

"""

In [1]:
import email
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re

In [2]:
# Download NLTK's 'punkt' package up front; presumably this is the tokenizer data that
# word_tokenize (used in clean_document) relies on -- confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mplat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
""" Functions to read and clean the corpus. """


def read_index(file_path):
    """
      Load the corpus index file into memory.

      Returns a list with one raw line (trailing newline included) per entry,
      and prints how many entries were read.
    """

    # iterating the handle yields exactly the lines readlines() would return
    with open(file_path, 'r', encoding='ISO-8859-1') as index_file:
        entries = [entry for entry in index_file]

    print("Extracted {} documents from the corpus".format(len(entries)))
    return entries


def _decode_text_part(part):
    """ Return the text of a single (non-multipart) part, undoing base64/QP/uuencode. """

    raw = part.get_payload()
    if not isinstance(raw, str):
        return ''

    # without a transfer encoding there is nothing to undo: keep the payload as parsed
    cte = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if cte not in ('base64', 'quoted-printable', 'x-uuencode', 'uuencode', 'uue', 'x-uue'):
        return raw

    data = part.get_payload(decode=True)
    if data is None:
        return raw

    # prefer the declared charset; unknown or wrong charsets fall back to
    # ISO-8859-1, which can decode any byte sequence
    charset = part.get_content_charset() or 'ISO-8859-1'
    try:
        return data.decode(charset)
    except (LookupError, ValueError):
        return data.decode('ISO-8859-1')


def gather_email_content(c):
    """
      Collect the readable text of an email payload.

      c may be:
        - a str: returned unchanged
        - a list of message parts: the text of every part, concatenated in order
        - a message part: multipart parts are walked recursively; text/plain
          contributes its payload, text/html contributes the BeautifulSoup
          get_text() of its payload, and every other content type contributes
          nothing

      Parts that declare a Content-Transfer-Encoding of base64, quoted-printable
      or uuencode are decoded first, so the encoded form never leaks into the
      returned text.

      Returns a single string ('' when nothing usable was found).
    """

    # a bare string payload carries no type information, so keep it as it is
    if isinstance(c, str):
        return c

    # a list holds the sub-parts of a multipart message: gather each one in order
    if isinstance(c, list):
        return ''.join(gather_email_content(part) for part in c)

    # a multipart container: recurse into its parts
    if c.is_multipart():
        return gather_email_content(c.get_payload())

    # a malformed header without ';' can leave parameters glued on after a space,
    # so keep only the leading type/subtype token
    content_type = c.get_content_type().split(' ')[0]

    if content_type == 'text/plain':
        return _decode_text_part(c)

    if content_type == 'text/html':
        soup = BeautifulSoup(_decode_text_part(c), 'html.parser')
        return soup.get_text()

    # attachments, images, etc. contribute no text
    return ''


def extract_document(m):
    """
      Extract the subject and body text from a parsed email message.

      m is an email message object (e.g. from email.message_from_string).

      Returns a tuple (subject, body). subject is the 'Subject' header value,
      or None when the message has no such header; body is the string produced
      by gather_email_content.
    """

    content = m.get_payload()

    # A single-part message yields a bare string payload, which carries no
    # content type. For text/plain and text/html, hand over the message itself
    # so gather_email_content can apply its content-type handling (HTML markup
    # is stripped instead of being kept verbatim). Any other single-part type
    # keeps the previous behaviour of passing the raw payload through.
    content_type = m.get_content_type().split(' ')[0]
    if isinstance(content, str) and content_type in ('text/plain', 'text/html'):
        body = gather_email_content(m)
    else:
        body = gather_email_content(content)

    return m['Subject'], body


def clean_document(s):
    """
      Tokenize the text and keep only purely alphabetic tokens.

      Punctuation, numbers and mixed tokens (anything for which str.isalpha()
      is False) are dropped; the surviving tokens are joined with single spaces.
    """
    return " ".join(tok for tok in word_tokenize(s) if tok.isalpha())


def create_dataset(file_path, corpus_root='C:/6200-IR/homework-7-mplatt27//trec07p'):
    """
      Build a dataframe with one row per email listed in the corpus index.

      file_path:   path of the index file; each usable line holds a label
                   followed by an email path, separated by whitespace.
      corpus_root: directory prefix that replaces the first two characters of
                   each email path from the index. The default reproduces the
                   location this notebook was originally run from; pass your
                   own corpus directory to run elsewhere.

      Returns a pandas DataFrame indexed by document id (the text following
      'inmail.' in the email path) with columns doc_path, raw_doc, subject,
      clean_doc and label.
    """

    # read in the index content as a list of raw lines
    index = read_index(file_path)

    # initialize dictionary to hold information, later will put in pd dataframe
    data = {}

    # loop for each email document in the index
    for line in tqdm(index):

        # extract label and corresponding email path
        line_list = line.split()

        # blank or truncated lines carry no (label, path) pair: skip them
        if len(line_list) < 2:
            continue

        label = line_list[0]
        doc_path = corpus_root + line_list[1][2:]

        # the document id is whatever follows 'inmail.' in the path
        doc_id_s = doc_path.find('inmail.') + len('inmail.')
        doc_id = doc_path[doc_id_s:]

        # get raw content of email; the with-block closes the file on exit
        with open(doc_path, 'r', encoding='ISO-8859-1') as f:
            raw_doc = f.read()

        # get the subject and email content
        msg = email.message_from_string(raw_doc)
        subject, doc = extract_document(msg)

        # clean the email content
        clean_doc = clean_document(doc)

        # add to data dictionary
        data[doc_id] = {'doc_path': doc_path, 'raw_doc': raw_doc, 'subject': subject,
                        'clean_doc': clean_doc, 'label': label}

    # place into dataframe and return
    df = pd.DataFrame.from_dict(data, orient='index')
    return df
                  

In [5]:
""" Main code """
# Location of the corpus index file that create_dataset reads line by line.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PTH = "C:/6200-IR/homework-7-mplatt27/trec07p/full/corpus_index.txt"
# Read every email listed in the index and collect raw and cleaned text in one DataFrame.
corpus_df = create_dataset(PTH)

  0%|                                                                               | 17/75419 [00:00<15:51, 79.24it/s]

Extracted 75419 documents from the corpus


100%|████████████████████████████████████████████████████████████████████████████| 75419/75419 [12:55<00:00, 97.30it/s]


In [6]:
# Per-column check: True for any column that contains at least one missing value.
corpus_df.isna().any()

doc_path     False
raw_doc      False
subject       True
clean_doc    False
label        False
dtype: bool

In [7]:
# Preview the first rows to sanity-check the extracted subject, cleaned text and label.
corpus_df.head()

Unnamed: 0,doc_path,raw_doc,subject,clean_doc,label
1,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From RickyAmes@aol.com Sun Apr 8 13:07:32 20...,"Generic Cialis, branded quality@",Do you feel the pressure to perform and not ri...,spam
2,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From bounce-debian-mirrors=ktwarwic=speedy.uwa...,Typo in /debian/README,Hi i just updated from the gulus and I check o...,ham
3,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From 7stocknews@tractionmarketing.com Sun Apr...,authentic viagra,Mega authenticV I A G R A DISCOUNT priceC I A ...,spam
4,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From vqucsmdfgvsg@ruraltek.com Sun Apr 8 13:...,Nice talking with ya,Hey Billy it was really fun going out the othe...,spam
5,C:/6200-IR/homework-7-mplatt27//trec07p/data/i...,From dcube@totalink.net Sun Apr 8 13:19:30 2...,or trembling; stomach cramps; trouble in sleep...,system of the home It will have the capabiliti...,spam


In [8]:
# Save the dataset, with a header row, to CSV for later use.
# NOTE(review): absolute, machine-specific output path -- adjust before running elsewhere.
corpus_df.to_csv("C:/6200-IR/homework-7-mplatt27/corpus_df_clean.csv", header=True)