In [1]:
import pandas as pd
import numpy as np
import re
import os, email
import pprint

from utils import (get_text_from_email, parse_message_attachment,
                   extract_single_messages, form_message_pairs)

%load_ext autoreload
%autoreload 2

In [2]:
# Path of the CSV file holding the raw email dump; it is read with pd.read_csv
# in the loading cell. The path is relative to the notebook's working directory.
INPUT_FILE = 'emails.csv'

## Load the email collection

In [3]:
# Load the complete raw email table from INPUT_FILE into memory as a DataFrame.
raw_messages = pd.read_csv(INPUT_FILE)

In [4]:
# Preview the first rows to check which columns were loaded.
raw_messages.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


## The content of a message

In [5]:
# Print the full raw text of one sample message (the row labelled 1) to show
# the layout of a message: a header section followed by the body text.
print(raw_messages['message'][1])

Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the

## Extract the email contents

In [6]:
# Turn each raw message string into a parsed email message object.
messages = [email.message_from_string(raw_text) for raw_text in raw_messages['message']]

# The header names of the first message decide which header columns get created;
# a message that lacks one of those headers gets None in that column.
keys = messages[0].keys()

# Work on a copy so that raw_messages itself is left unchanged.
df_emails = raw_messages.copy()
for header in keys:
    df_emails[header] = [msg[header] for msg in messages]

# Body text of each message, as returned by the project helper get_text_from_email.
df_emails['content'] = [get_text_from_email(msg) for msg in messages]

In [7]:
# Each 'file' value is a '/'-separated path such as 'allen-p/_sent_mail/1.';
# its first two components become the 'user' and 'folder' columns.
# The path is split once and both components are taken from that result.
# A path with no second component gives a missing value for 'folder'.
file_parts = df_emails['file'].str.split('/')
df_emails['user'] = file_parts.str[0]
df_emails['folder'] = file_parts.str[1]

# Keep only the columns of interest, in display order. Selecting them by name
# already discards every other column, so no separate drop() call is needed.
df_emails = df_emails[['user', 'folder', 'Message-ID', 'From', 'To', 'Subject', 'content']]

In [8]:
# Preview the parsed table: the selected header fields plus the body text of each message.
df_emails.head()

Unnamed: 0,user,folder,Message-ID,From,To,Subject,content
0,allen-p,_sent_mail,<18782981.1075855378110.JavaMail.evans@thyme>,phillip.allen@enron.com,tim.belden@enron.com,,Here is our forecast\n\n
1,allen-p,_sent_mail,<15464986.1075855378456.JavaMail.evans@thyme>,phillip.allen@enron.com,john.lavorato@enron.com,Re:,Traveling to have a business meeting takes the...
2,allen-p,_sent_mail,<24216240.1075855687451.JavaMail.evans@thyme>,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,test successful. way to go!!!
3,allen-p,_sent_mail,<13505866.1075863688222.JavaMail.evans@thyme>,phillip.allen@enron.com,randall.gay@enron.com,,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p,_sent_mail,<30922949.1075863688243.JavaMail.evans@thyme>,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,Let's shoot for Tuesday at 11:45.


## Number of messages

In [9]:
# Report how many message rows the parsed table holds.
print("# of messages:", len(df_emails))

# of messages: 517401


## Drop duplicate messages (the same messages that are stored in multiple users' mailboxes)

In [10]:
# Keep the first row for every distinct message body, then renumber the rows
# from 0 (drop=True discards the old index instead of keeping it as a column).
df_emails = df_emails.drop_duplicates(subset='content').reset_index(drop=True)

In [11]:
# Report how many message rows remain after de-duplication.
print("# of messages:", len(df_emails))

# of messages: 249025


In [22]:
# Split every email body into its individual messages and write each
# (request, response) pair as one line of a ':##:'-separated text file.
#
# - The with-block closes the file even if one of the helpers raises.
# - The explicit UTF-8 encoding matches what pd.read_csv assumes by default
#   when the file is read back, independent of the platform's locale.
# - The loop uses its own names (thread_messages, message_pairs) so it does not
#   overwrite the notebook-level `messages` list built when the emails were parsed.
#
# NOTE(review): each pair is written on a single line, so this assumes the texts
# returned by the helpers contain neither newlines nor the ':##:' separator;
# confirm against extract_single_messages / form_message_pairs.
with open("raw_message_pairs.csv", "w", encoding="utf-8") as f_pair:
    f_pair.write("request:##:response\n")
    for content in df_emails['content']:
        thread_messages = extract_single_messages(content)
        # A body holding a single message has nothing to pair up.
        if len(thread_messages) > 1:
            message_pairs = form_message_pairs(thread_messages)
            for pair in message_pairs:
                f_pair.write("{}:##:{}\n".format(pair[0].strip(), pair[1].strip()))

In [23]:
# Read the pairs back. The separator has more than one character, so pandas
# treats it as a regular expression, which only the Python parsing engine supports.
# keep_default_na=False keeps every field as text: without it, message texts such
# as "NA", "N/A", "null" or an empty field are parsed as missing values (NaN) and
# would later be written out as the literal text "nan".
raw_pairs = pd.read_csv("raw_message_pairs.csv", sep=":##:", engine='python',
                        keep_default_na=False)

In [24]:
# Preview the first request/response pairs read back from the file.
raw_pairs.head()

Unnamed: 0,request,response
0,John Lavorato-M Mike Grigsby-D Keith Holst-D ...,Program Importance: High Hi Phillip. We app...
1,Program Importance: High Hi Phillip. We app...,Program We have not received your completed ...
2,John Lavorato-M Mike Grigsby-D Keith Holst-D ...,Hi Phillip. We appreciate your prompt attenti...
3,"> > > > George, > > Can you please call m...",I'll get back to them on this. I know we have...
4,"> > > > George, > > Can you please call m...",I'll get back to them on this. I know we have...


In [25]:
# Number of extracted pairs before exact duplicates are removed.
len(raw_pairs)

116080

In [34]:
# Remove exact duplicate (request, response) rows and renumber the remaining
# rows from 0 (drop=True discards the old index instead of keeping it as a column).
pairs = raw_pairs.drop_duplicates().reset_index(drop=True)

# Write the de-duplicated pairs in the same ':##:'-separated layout as the raw file.
# The with-block closes the file even if a write fails, and the explicit UTF-8
# encoding matches what pd.read_csv assumes by default when reading such a file.
# NOTE(review): a missing value (NaN) in either column would be written as the
# text "nan"; check raw_pairs for missing values if that matters downstream.
with open("message_pairs.csv", "w", encoding="utf-8") as f_pair:
    f_pair.write("request:##:response\n")
    for request, response in zip(pairs["request"], pairs["response"]):
        f_pair.write("{}:##:{}\n".format(request, response))