In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np

Here we are first iterating over all sent mails present in the dataset (all sent mails are present in
the _sent_mail ,sent and sent_items folders). For each of the email we are extracting the sender name ,message id 
and body part of the mail. Also ,using the text cleaning function 
we are cleaning the body part of the mail using regex matching to remove any forwarded text that is present.

In [2]:
def text_cleaning(raw_text):
    
    #First extract body of the email
    if re.search(r"X-FileName:.+\n+((.+\n\n?)+)", raw_text):
        text = re.search(r"X-FileName:.+\n+((?:.+\n\n?)+)", raw_text).group(1)
        
        #If the body starts with ---Forwarded return np.NaN since it does not have any useful information
        if re.match(r"-{3,} Forwarded by.+\d+\/\d+\/\d+", text):
            return np.NaN
        
        #If rest of the body of the email has forwarding in it, remove that
        if re.search(r"-{3,} Forwarded by.+\d+\/\d+\/\d+[\w\W]+", text):
            text = re.sub(r"-{3,} Forwarded by.+\d+\/\d+\/\d+[\w\W]+", "", text)
        
        #If rest of the body of the email has original message in it, remove that
        if re.search(r"-{5,}Original Message-{5,}[\w\W].*", text):
            text = re.sub(r"-{5,}Original Message-{5,}[\w\W]*[\n]*.*", "", text)
            
        if text:
            return text
        else:
            return np.NaN
        
    elif re.search(r"X-FileName:.+\n+(.+)", raw_text):
        text = re.search(r"X-FileName:.+\n+(.+)", raw_text).group(1)
        
        #If the body starts with ---Forwarded return np.NaN since it does not have any useful information
        if re.match(r"-{3,} Forwarded by.+\d+\/\d+\/\d+", text):
            return np.NaN
    
        #If rest of the body of the email has forwarding in it, remove that
        if re.search(r"-{3,} Forwarded by.+\d+\/\d+\/\d+[\w\W]+", text):
            text = re.sub(r"-{3,} Forwarded by.+\d+\/\d+\/\d+[\w\W]+", "", text)
        
        #If rest of the body of the email has original message in it, remove that
        if re.search(r"-{5,}Original Message-{5,}[\w\W].*", text):
            text = re.sub(r"-{5,}Original Message-{5,}[\w\W]*[\n]*.*", "", text)
            
        if text:
            return text
        else:
            return np.NaN
    
    else:
        return np.NaN
    
def read_email(file):
    with open(file, mode="r") as f:
        try:
            raw_text = f.read()
        except UnicodeDecodeError as e:
            return None, None, None, None
    if re.search(r"^Message-ID: <(\d+\.\d+\.)JavaMail\.evans@thyme>", raw_text):
        email_id = re.search(r"^Message-ID: <(\d+\.\d+\.)JavaMail\.evans@thyme>", raw_text).group(1)
    else:
        raise Exception("For file {} Message ID could not be found".format(file))
    if re.search(r"X-From: (.+) <?", raw_text):
        sender = re.search(r"X-From: (.+) <?", raw_text).group(1)
    else:
        print("For file {} Sender could not be found".format(file))
        sender = np.NaN
    
    text = text_cleaning(raw_text)
    
    return email_id, sender, text, raw_text

def email_extraction(author):
    root_folder = "./maildir/"
    folders = ["/_sent_mail/", "/sent/", "/sent_items/"]
    extract_data = []
    for folder in folders:
        try:
            for message in os.listdir(root_folder + author + folder):
                if os.path.isfile(root_folder + author + folder + message):
                    email_id, sender, text, raw_text = read_email(root_folder + author + folder + message)
                    if email_id != None:
                        extract_data.append([sender, author, message, email_id, text, raw_text])
        except FileNotFoundError as e:
            continue
        except NotADirectoryError as e:
            continue
    return extract_data


Here we are calling the above functions and storing the output in a dataframe. Then we are removing duplicate emails from the dataset and any empty emails that are present

In [3]:
#Insert path to the mail directory here
authors = os.listdir("./maildir")
number_author_folders = len(authors)
df = pd.DataFrame(columns=["Author", "Folder", "File", "Message ID", "Text", "Raw Text", "Email Folder"])

for author in authors:
    emails = email_extraction(author)
    if emails:
        df = df.append(pd.DataFrame(emails, columns=["Author", "Folder", "File", "Message ID", "Text", "Raw Text"]))

df = df.drop_duplicates(["Message ID"])
df = df[df["Text"].notna()]
# Taking only the top 20 authors
print(df.value_counts(["Folder"])[:20])
df

For file ./maildir/forney-j/sent_items/158. Sender could not be found
For file ./maildir/germany-c/_sent_mail/1309. Sender could not be found
For file ./maildir/germany-c/sent_items/1026. Sender could not be found
For file ./maildir/tholt-j/sent_items/1. Sender could not be found
For file ./maildir/weldon-c/sent_items/27. Sender could not be found
For file ./maildir/ybarbo-p/sent_items/34. Sender could not be found
For file ./maildir/presto-k/sent_items/1103. Sender could not be found
For file ./maildir/scott-s/sent_items/3. Sender could not be found
For file ./maildir/mckay-j/sent_items/179. Sender could not be found
For file ./maildir/brawner-s/sent_items/7. Sender could not be found
For file ./maildir/hodge-j/sent_items/14. Sender could not be found
For file ./maildir/may-l/sent_items/19. Sender could not be found
For file ./maildir/quigley-d/sent_items/37. Sender could not be found
For file ./maildir/lenhart-m/sent_items/235. Sender could not be found
For file ./maildir/maggi-m/sen

Unnamed: 0,Author,Folder,File,Message ID,Text,Raw Text,Email Folder
0,John,arnold-j,36.,33491127.1075857594966.,Hey:\nHaven't had the best of months. Like yo...,Message-ID: <33491127.1075857594966.JavaMail.e...,
1,John,arnold-j,667.,6384662.1075857656041.,Thank you.\n\n,Message-ID: <6384662.1075857656041.JavaMail.ev...,
2,John,arnold-j,759.,21884118.1075857658063.,a couple of observations from here:\ncash/futu...,Message-ID: <21884118.1075857658063.JavaMail.e...,
3,John,arnold-j,313.,11352651.1075857600972.,"Frank:\nThe $5,000,000 extra VAR disappears in...",Message-ID: <11352651.1075857600972.JavaMail.e...,
4,John,arnold-j,710.,25732708.1075857656969.,don't care about the front. i think its vulne...,Message-ID: <25732708.1075857656969.JavaMail.e...,
...,...,...,...,...,...,...,...
119,"Scholtes, Diana",scholtes-d,51.,14805735.1075840025496.,I received your message and those times are go...,Message-ID: <14805735.1075840025496.JavaMail.e...,
120,"Scholtes, Diana",scholtes-d,18.,20865574.1075840024696.,Our Purchases:\n\n11/1\tTo PSE-EPMI MC - you h...,Message-ID: <20865574.1075840024696.JavaMail.e...,
122,"Scholtes, Diana",scholtes-d,8.,16491778.1075840024450.,1\n\n,Message-ID: <16491778.1075840024450.JavaMail.e...,
123,"Scholtes, Diana",scholtes-d,124.,17311779.1075840027415.,"Stewart, \n \nI left you an envelope with info...",Message-ID: <17311779.1075840027415.JavaMail.e...,


Storing this data to a csv file which will be used in the later notebooks.

In [4]:
df.to_csv("./enron.csv")

In [5]:
df

Unnamed: 0,Author,Folder,File,Message ID,Text,Raw Text,Email Folder
0,John,arnold-j,36.,33491127.1075857594966.,Hey:\nHaven't had the best of months. Like yo...,Message-ID: <33491127.1075857594966.JavaMail.e...,
1,John,arnold-j,667.,6384662.1075857656041.,Thank you.\n\n,Message-ID: <6384662.1075857656041.JavaMail.ev...,
2,John,arnold-j,759.,21884118.1075857658063.,a couple of observations from here:\ncash/futu...,Message-ID: <21884118.1075857658063.JavaMail.e...,
3,John,arnold-j,313.,11352651.1075857600972.,"Frank:\nThe $5,000,000 extra VAR disappears in...",Message-ID: <11352651.1075857600972.JavaMail.e...,
4,John,arnold-j,710.,25732708.1075857656969.,don't care about the front. i think its vulne...,Message-ID: <25732708.1075857656969.JavaMail.e...,
...,...,...,...,...,...,...,...
119,"Scholtes, Diana",scholtes-d,51.,14805735.1075840025496.,I received your message and those times are go...,Message-ID: <14805735.1075840025496.JavaMail.e...,
120,"Scholtes, Diana",scholtes-d,18.,20865574.1075840024696.,Our Purchases:\n\n11/1\tTo PSE-EPMI MC - you h...,Message-ID: <20865574.1075840024696.JavaMail.e...,
122,"Scholtes, Diana",scholtes-d,8.,16491778.1075840024450.,1\n\n,Message-ID: <16491778.1075840024450.JavaMail.e...,
123,"Scholtes, Diana",scholtes-d,124.,17311779.1075840027415.,"Stewart, \n \nI left you an envelope with info...",Message-ID: <17311779.1075840027415.JavaMail.e...,
