In [1]:
import pandas as pd
import numpy as np
import re
import os, email
import pprint

from utils import get_text_from_email, parse_message_attachment

%load_ext autoreload
%autoreload 2

In [2]:
# Path to the CSV file holding the raw emails; it is loaded with pd.read_csv
# into `raw_messages`. Relative paths resolve against the notebook's working directory.
INPUT_FILE = 'emails.csv'

# Read the emails from emails.csv

In [3]:
# Load the CSV in one go. The preview of this frame shows two columns:
# 'file' (mailbox path of the message) and 'message' (raw message text).
raw_messages = pd.read_csv(INPUT_FILE)

## Top 5 messages

In [4]:
# Display the first rows as a quick sanity check that the load worked.
raw_messages.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


## The content of a message

In [5]:
# Print the complete raw text (headers followed by body) of the message at row label 1.
print(raw_messages.loc[1, 'message'])

Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the

## Extract the email contents

In [6]:
# Parse each raw message string into an email.message.Message object.
parsed_messages = [email.message_from_string(raw) for raw in raw_messages['message']]

# Work on a copy so that `raw_messages` itself is left untouched.
df_emails = raw_messages.copy()

# The header names of the first message define the columns that are added.
# Looking up a header that a message does not carry yields None for that row.
for header in parsed_messages[0].keys():
    df_emails[header] = [msg[header] for msg in parsed_messages]

# Body text of every message, as produced by the project helper.
df_emails['content'] = [get_text_from_email(msg) for msg in parsed_messages]

In [7]:
# 'file' holds mailbox paths such as 'allen-p/_sent_mail/1.':
# the first component is the mailbox owner, the second the folder.
# Split once with the vectorised string accessor instead of running a
# Python-level lambda over every row twice.
path_parts = df_emails['file'].str.split('/')
df_emails['user'] = path_parts.str[0]
df_emails['folder'] = path_parts.str[1]

# Keep only the columns used downstream. Selecting an explicit column list
# already discards everything else, so a separate drop() call is unnecessary.
df_emails = df_emails[['user', 'folder', 'Message-ID', 'From', 'To', 'Subject', 'content']]

In [8]:
# Display the first rows of the parsed frame to verify the extracted columns.
df_emails.head()

Unnamed: 0,user,folder,Message-ID,From,To,Subject,content
0,allen-p,_sent_mail,<18782981.1075855378110.JavaMail.evans@thyme>,phillip.allen@enron.com,tim.belden@enron.com,,Here is our forecast\n\n
1,allen-p,_sent_mail,<15464986.1075855378456.JavaMail.evans@thyme>,phillip.allen@enron.com,john.lavorato@enron.com,Re:,Traveling to have a business meeting takes the...
2,allen-p,_sent_mail,<24216240.1075855687451.JavaMail.evans@thyme>,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,test successful. way to go!!!
3,allen-p,_sent_mail,<13505866.1075863688222.JavaMail.evans@thyme>,phillip.allen@enron.com,randall.gay@enron.com,,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p,_sent_mail,<30922949.1075863688243.JavaMail.evans@thyme>,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,Let's shoot for Tuesday at 11:45.


## Number of messages

In [9]:
# Row count of the parsed frame, i.e. the number of messages before de-duplication.
print("# of messages:", len(df_emails))

# of messages: 517401


### Drop duplicate messages (the same messages that are stored in multiple users' mailboxes)

In [10]:
# Keep the first occurrence of every distinct message body and renumber the
# rows 0..n-1. `drop=True` discards the old index directly, so no temporary
# 'index' column has to be created and removed again.
df_emails = df_emails.drop_duplicates(subset='content').reset_index(drop=True)

In [11]:
# Row count after duplicate message bodies have been removed.
print("# of messages:", len(df_emails))

# of messages: 249025


In [12]:
print("THE CONTENT OF A MESSAGE:\n\n")
i = 18  # row label of the sample message to display
sample_message = df_emails.loc[i]
print("Subject:", sample_message['Subject'])
print(sample_message['content'])

THE CONTENT OF A MESSAGE:


Subject: Westgate
---------------------- Forwarded by Phillip K Allen/HOU/ECT on 10/03/2000 
04:30 PM ---------------------------


"George Richards" <cbpres@austin.rr.com> on 10/03/2000 06:35:56 AM
Please respond to <cbpres@austin.rr.com>
To: "Phillip Allen" <pallen@enron.com>
cc: "Larry Lewter" <retwell@mail.sanmarcos.net> 
Subject: Westgate


Westgate

Enclosed are demographics on the Westgate site from Investor's Alliance.
Investor's Alliance says that these demographics are similar to the package
on San Marcos that you received earlier.
If there are any other questions or information requirements, let me know.
Then, let me know your interest level in the Westgate project?

San Marcos
The property across the street from the Sagewood units in San Marcos is for
sale and approved for 134 units.  The land is selling for $2.50 per square
foot as it is one of only two remaining approved multifamily parcels in West
San Marcos, which now has a moratorium on deve

## Extract the attachments from the email contents

In [13]:
# Map Message-ID -> attachments found in the message body.
# Messages for which the helper finds nothing are left out of the mapping.
#
# The columns are walked positionally with zip() rather than looked up by
# integer label (`df_emails[col][i]`), so the cell also works when the index
# of `df_emails` is not a gap-free 0..n-1 range (e.g. after rows were dropped).
attachments = {}
for message_id, body in zip(df_emails['Message-ID'], df_emails['content']):
    found = parse_message_attachment(body)
    if len(found) > 0:
        attachments[str(message_id)] = found

In [14]:
# One row per message, with the message's attachments held in a single cell.
attachments_per_message = pd.DataFrame(list(attachments.items()),
                                       columns=['Message-ID', 'attachment'])

# Number of attachments in each message; used to repeat the Message-ID.
n_per_message = attachments_per_message['attachment'].str.len()

# Flatten to one row per (Message-ID, attachment) pair.
repeated_ids = np.repeat(attachments_per_message['Message-ID'].values, n_per_message)
flat_attachments = np.concatenate(attachments_per_message['attachment'].values)
df_attachments = pd.DataFrame({'Message-ID': repeated_ids,
                               'attachment': flat_attachments})

In [15]:
# An attachment name is kept only if it occurs more than MIN_OCCURRENCES and
# fewer than MAX_OCCURRENCES times (both bounds exclusive, i.e. 2..34).
MIN_OCCURRENCES = 1
MAX_OCCURRENCES = 35

# Make the cell re-runnable: remove an 'attachment' column left over from a
# previous run. errors='ignore' skips only the "column not present" case
# instead of silently swallowing every possible exception.
df_emails = df_emails.drop(['attachment'], axis=1, errors='ignore')

# One row per (message, attachment) pair; messages without attachments are
# dropped by the inner join.
df_emails = df_emails.merge(df_attachments, on='Message-ID', how='inner')

# Count the occurrences of each attachment name once and reuse the result.
occurrences = df_attachments['attachment'].value_counts()
attachment_counts = occurrences[(occurrences > MIN_OCCURRENCES) &
                                (occurrences < MAX_OCCURRENCES)]

# Keep only rows whose attachment passed the frequency filter, renumber the
# rows, then remove exact duplicate rows. The duplicates are dropped after
# renumbering, so the resulting index may contain gaps.
df_emails = df_emails[df_emails['attachment'].isin(attachment_counts.index)].reset_index(drop=True)
df_emails = df_emails.drop_duplicates()

## Statistics of the resulting corpus

In [19]:
# Rows are (message, attachment) pairs, so one message can appear several times.
n_rows = df_emails.shape[0]
n_unique_messages = len(df_emails['content'].unique())
n_unique_attachments = len(df_emails['attachment'].unique())

print("# of messages:           {} (some messages contains multiple attachments)".format(n_rows))
print("# of unique messages:    {}".format(n_unique_messages))
print("# of unique attachments: {}".format(n_unique_attachments))

# of messages:           6162 (some messages contains multiple attachments)
# of unique messages:    4394
# of unique attachments: 1946


In [23]:
# Summary statistics of how often each retained attachment name occurs.
print("ATTACHMENT STATS:")
attachment_counts.describe()

ATTACHMENT STATS:


count    1946.000000
mean        3.455807
std         2.779164
min         2.000000
25%         2.000000
50%         2.000000
75%         4.000000
max        34.000000
Name: attachment, dtype: float64

In [58]:
# Preview the five rows that come first when ordering by attachment name,
# so that messages sharing an attachment appear next to each other.
preview_columns = ['From', 'To', 'Subject', 'content', 'attachment']
df_emails[preview_columns].sort_values(by=['attachment']).head(5)

Unnamed: 0,From,To,Subject,content,attachment
4697,legal <.hall@enron.com>,"amr.ibrahim@enron.com, ray.alvarez@enron.com",RE: CalPX Collateral Issue,To the best of my knowledge there is no CAISO ...,#150515 v1 - CPS COMMENTS ON BIRCHMAN ORDER.doc
4693,ray.alvarez@enron.com,legal <.hall@enron.com>,CalPX Collateral Issue,"Thanks Steve. With your input, my take on Eli...",#150515 v1 - CPS COMMENTS ON BIRCHMAN ORDER.doc
4695,amr.ibrahim@enron.com,"ray.alvarez@enron.com, legal <.hall@enron.com>",RE: CalPX Collateral Issue,This is a quick note to let the team know that...,#150515 v1 - CPS COMMENTS ON BIRCHMAN ORDER.doc
4691,ray.alvarez@enron.com,legal <.hall@enron.com>,RE:,"Steve, I believe that schedule 2, section 2.2 ...",#150515 v1 - CPS COMMENTS ON BIRCHMAN ORDER.doc
4699,amr.ibrahim@enron.com,"legal <.hall@enron.com>, ray.alvarez@enron.com",RE: CalPX Collateral Issue,The preliminary research supports your conclus...,#150515 v1 - CPS COMMENTS ON BIRCHMAN ORDER.doc


## Let's see the messages that have the attachment file:
       #631941 v2 - Draft letter from Enron to XYZ Corp. - January 20021.doc

In [48]:
# Select the messages that carry the draft-letter attachment.
# The search text is a literal file-name prefix, not a pattern, so it is
# matched with regex=False; this keeps characters in the file name from
# ever being interpreted as regular-expression syntax.
attachment_prefix = '#631941 v2 - Draft letter from Enron to XYZ'
has_attachment = df_emails['attachment'].str.contains(attachment_prefix, regex=False)

# Filter the rows and pick the columns of interest in a single .loc step.
df_att = df_emails.loc[has_attachment,
                       ['Message-ID', 'user', 'From', 'To', 'Subject', 'content', 'attachment']]

In [59]:
# Print every selected message in full, framed by begin/end markers.
print("The messages come from the conversation with the following subject:\n")
print("      'draft letter for calculation of settlement amount/methodology'")
for row_label in df_att.index:
    print("\n\n\n(A MESSAGE BEGINS)\n=====================================================")
    print("Message-ID:  {}".format(df_att.loc[row_label, 'Message-ID']))
    print("Subject:     {}\n".format(df_att.loc[row_label, 'Subject']))
    print(df_att.loc[row_label, 'content'])
    print(df_att.loc[row_label, 'attachment'])
    print("=====================================================")
    print("(END OF THE MESSAGE)")

The messages come from the conversation with the following subject:

      'draft letter for calculation of settlement amount/methodology'



(A MESSAGE BEGINS)
Message-ID:  <27056809.1075861059467.JavaMail.evans@thyme>
Subject:     RE: draft letter for calculation of settlement amount/methodology

thanks

are we keeping a log of these as we send them out -- Bob

 -----Original Message-----
From: 	Shackleton, Sara  
Sent:	Monday, February 25, 2002 11:21 AM
To:	Bruce, Robert
Subject:	draft letter for calculation of settlement amount/methodology

Bob:

The attached letter can be used for terminated or non-terminated parties; just tweak the letter.

 << File: #631941 v2 - Draft letter from Enron to XYZ Corp. - January 20021.doc >> 

Sara Shackleton
Enron Wholesale Services
1400 Smith Street, EB3801a
Houston, TX  77002
Ph:  (713) 853-5620
Fax: (713) 646-3490
#631941 v2 - Draft letter from Enron to XYZ Corp. - January 20021.doc
(END OF THE MESSAGE)



(A MESSAGE BEGINS)
Message-ID:  <141381