# Data preprocessing

In [1]:
# Essential setup: Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Download the data

In [2]:
import requests

url = 'https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz'
response = requests.get(url)

with open('enron_mail_20150507.tar.gz', 'wb') as file:
    file.write(response.content)


Extract the data

In [3]:
import tarfile

# Extract the file
with tarfile.open('enron_mail_20150507.tar.gz', 'r:gz') as tar:
    tar.extractall('enron_data')


The Enron email dataset includes around 500,000 emails from 150 employees of Enron, collected during the investigation into the company's collapse. It's organized into folders and files to reflect the structure of the email accounts of various employees. Here's a breakdown of the structure:



1. Top-Level Folders: Each top-level folder represents an individual employee's email account. For example, folders like allen-p, badeer-r, bailey-s, etc., correspond to different employees.

2. Subfolders: Within each employee's folder, there are subfolders that represent different email categories or folders within their email account. Common subfolders include:

  * inbox: Contains received emails.

  * sent_items: Contains sent emails.

  * deleted_items: Contains deleted emails.

  * all_documents: Contains all documents related to the employee.

  * notes_inbox: Contains notes received.

  * calendar: Contains calendar entries.

  * contacts: Contains contact information.

3. Email Files: Each subfolder contains individual email files, typically in plain text format. These files represent individual email messages.

In [4]:
import os
import pandas as pd

data = []

# Traverse the extracted directory and read email files
for root, dirs, files in os.walk('enron_data/maildir'):
    for file in files:
          file_path = os.path.join(root, file)
          with open(file_path, 'r', encoding='latin1') as f:
              content = f.read()
              data.append(content)

# Create a dataframe from the collected email contents
df = pd.DataFrame(data, columns=['email_content'])
print(df.head())


                                       email_content
0  Message-ID: <5633964.1075841410781.JavaMail.ev...
1  Message-ID: <369679.1075841410758.JavaMail.eva...
2  Message-ID: <18938798.1075841410826.JavaMail.e...
3  Message-ID: <7037757.1075841410849.JavaMail.ev...
4  Message-ID: <32990561.1075841410804.JavaMail.e...


In [5]:
df.shape

(517401, 1)

In [6]:
print(df.iloc[55,0])

Message-ID: <16472585.1075841406361.JavaMail.evans@thyme>
Date: Tue, 29 Jan 2002 07:37:06 -0800 (PST)
From: chris.gaskill@enron.com
To: biliana.pehlivanova@enron.com, bart.burk@enron.com, 
	justin.o'malley@enron.com, adam.bayer@enron.com, 
	vladi.pimenov@enron.com, jozef.lieskovsky@enron.com, 
	dipak.agarwalla@enron.com
Subject: Sachin
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Gaskill, Chris </O=ENRON/OU=NA/CN=RECIPIENTS/CN=CGASKILL>
X-To: Pehlivanova, Biliana </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Bpehliva>, Burk, Bart </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Bburk>, O'Malley, Justin </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jomalley>, Bayer, Adam </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Abayer>, Pimenov, Vladi </O=ENRON/OU=NA/CN=RECIPIENTS/CN=VPIMENOV>, Lieskovsky, Jozef </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Jlieskov>, Agarwalla, Dipak </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Dagarwal>
X-cc: 
X-bcc: 
X-Folder: \ExMerge - Pimenov, Vladi\Inbox
X-Origin: PIMENOV-V
X-FileN

**Information Part**

Let's seperate the "info" and "content" parts, and deal with the information part first.

In [7]:
def info_part(i):
  return i.split('\n\n', 1)[0]

def content_part(i):
  return i.split('\n\n', 1)[1]

#test
info=info_part(df.iloc[12,0])
content=content_part(df.iloc[44,0])

print(f'info: \n{info}')
print('-----------------------')
print(f'content: \n{content}')

info: 
Message-ID: <33077006.1075841409258.JavaMail.evans@thyme>
Date: Tue, 13 Nov 2001 16:21:44 -0800 (PST)
From: chairman.ken@enron.com
To: dl-ga-all_enron_worldwide5@enron.com
Subject: Change of Control Provisions
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Ken Lay - Office of the Chairman </O=ENRON/OU=NA/CN=RECIPIENTS/CN=MBX_KLAYOFFICECHAIR>
X-To: DL-GA-ALL_enron_worldwide5 </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DL-GA-ALL-5>
X-cc: 
X-bcc: 
X-Folder: \ExMerge - Pimenov, Vladi\Inbox
X-Origin: PIMENOV-V
X-FileName: vladi pimenov 6-26-02.PST
-----------------------
content: 
Please send me the time you have worked for the first of the month so that I can fill out your time sheet. 

Thank you, 

Amanda Rybarski
Gas Fundamentals 
Office: (713) 853-4384
Fax: (713) 646-8453
Pager: (877) 482-0772
Cell: (713) 560-0934



In [8]:
new_df=pd.DataFrame()
new_df['info']= df.email_content.map(info_part)
new_df['content']=df.email_content.map(content_part)

new_df.head()

Unnamed: 0,info,content
0,Message-ID: <5633964.1075841410781.JavaMail.ev...,"ok, whatever"
1,Message-ID: <369679.1075841410758.JavaMail.eva...,Bush names businessmen to Pentagon civilian po...
2,Message-ID: <18938798.1075841410826.JavaMail.e...,"Patrick,\n\nwith regard to the deal we did on ..."
3,Message-ID: <7037757.1075841410849.JavaMail.ev...,"Patrick,\n\nwe fixed the deal. Thank you for b..."
4,Message-ID: <32990561.1075841410804.JavaMail.e...,"Rogers Herndon - eto tot muzhiuk, kotoriy el d..."


In [9]:
words2split = ['Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ', 'Cc: ', 'Mime-Version: ', 'Content-Type: ',
               'Content-Transfer-Encoding: ', 'Bcc: ', 'X-From: ', 'X-To: ', 'X-cc: ', 'X-bcc: ', 'X-Folder: ', 'X-Origin: ',
               'X-FileName: ']
features_naming = [i[:-2] for i in words2split]
split_condition = '|'.join(words2split)

In [10]:
split_condition

'Message-ID: |Date: |From: |To: |Subject: |Cc: |Mime-Version: |Content-Type: |Content-Transfer-Encoding: |Bcc: |X-From: |X-To: |X-cc: |X-bcc: |X-Folder: |X-Origin: |X-FileName: '

In [11]:
features_naming

['Message-ID',
 'Date',
 'From',
 'To',
 'Subject',
 'Cc',
 'Mime-Version',
 'Content-Type',
 'Content-Transfer-Encoding',
 'Bcc',
 'X-From',
 'X-To',
 'X-cc',
 'X-bcc',
 'X-Folder',
 'X-Origin',
 'X-FileName']

In [12]:
import re

# let's check how many categories are there in these emails
# Each email includes how many parts of those listed parts
def num_part(i):
    return len(re.split(split_condition, i))
new_df['num_info'] = new_df['info'].map(num_part)

In [13]:
new_df['num_info'].value_counts()

Unnamed: 0_level_0,count
num_info,Unnamed: 1_level_1
16,369198
18,119028
15,20212
19,7389
17,1571
5,2
6,1


So there are so many emails which don't have all parts of info!

In [14]:
print(new_df.loc[new_df['num_info']==6]['info'])

510663    Message-ID: <21985829.1075858967322.JavaMail.e...
Name: info, dtype: object


In [15]:
# Some emails' subject confuse the string-spliting function, so I make a little change
def duplicated_info(i):
    return i.replace(' Date: ', ' Date- ').replace(' Subject: ', ' Subject2: ').replace(' To: ',
                    ' To- ').replace(' (Subject: ', ' (Subject- ')
new_df['info'] = new_df['info'].map(duplicated_info)

In [16]:
new_df['num_info'] = new_df['info'].map(num_part)
new_df['num_info'].value_counts()

Unnamed: 0_level_0,count
num_info,Unnamed: 1_level_1
16,369135
18,126416
15,20380
17,1467
5,2
6,1


In [17]:
# some emails dont have the "Cc:" and "Bcc:" categories, so we add!
temp_condition = (new_df['num_info'] == 16) | (new_df['num_info'] == 15)
def add_bcc(i):
    return i.replace('\nX-From: ', '\nBcc: \nX-From: ')
new_df.loc[temp_condition, 'info'] = new_df.loc[temp_condition, 'info'].map(add_bcc)
def add_cc(i):
    return i.replace('\nMime-Version: ', '\nCc: \nMime-Version: ')
new_df.loc[temp_condition, 'info'] = new_df.loc[temp_condition, 'info'].map(add_cc)

#test
new_df['num_info'] = new_df['info'].map(num_part)
new_df['num_info'].value_counts()

Unnamed: 0_level_0,count
num_info,Unnamed: 1_level_1
18,495551
17,21847
5,2
6,1


In [18]:
# around 20k emails do not have the 'To: ' category, so I add one
def add_to(i):
    return i.replace('\nSubject: ', '\nTo: \nSubject: ')
temp_condition = (new_df['num_info'] == 17)
new_df.loc[temp_condition, 'info'] = new_df.loc[temp_condition, 'info'].map(add_to)

#test
new_df['num_info'] = new_df['info'].map(num_part)
new_df['num_info'].value_counts()

Unnamed: 0_level_0,count
num_info,Unnamed: 1_level_1
18,517398
5,2
6,1


There are still three wrong formatted email! we easily delete them.

In [19]:
print(new_df[new_df['num_info'] <= 6])
new_df.drop(new_df[new_df['num_info'] <= 6].index, inplace=True)

                                                     info  \
172470  Message-ID: <15722007.1075840335489.JavaMail.e...   
173004  Message-ID: <20932070.1075855140688.JavaMail.e...   
510663  Message-ID: <21985829.1075858967322.JavaMail.e...   

                                                  content  num_info  
172470  \n12.\n13.\nMime-Version: 1.0\nContent-Type: t...         5  
173004  \n12.\n13.\nMime-Version: 1.0\nContent-Type: t...         5  
510663  in\n Conference Room EB3872 \n\nto dicuss tari...         6  


In [20]:
global feature_idx
def info_split(i):
    ## split the i th part out and remove \n for the feature
    return re.split(split_condition, i)[feature_idx+1][:-2]
def info_split_last(i):
    ## no need to remove \n for last category -- X-FileName
    return re.split(split_condition, i)[feature_idx+1]
for feature_idx in range(len(words2split)):
    if feature_idx != len(words2split) - 1:
        new_df[features_naming[feature_idx]] = new_df['info'].map(info_split)
    else:
        new_df[features_naming[feature_idx]] = new_df['info'].map(info_split_last)

In [21]:
new_df.head()

Unnamed: 0,info,content,num_info,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName
0,Message-ID: <5633964.1075841410781.JavaMail.ev...,"ok, whatever",18,<5633964.1075841410781.JavaMail.evans@thyme,"Tue, 24 Apr 2001 19:24:00 -0700 (PDT",vladi.pimenov@enron.co,natalie.bondareva@enron.co,Re: go EES!!,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Natalie Bondareva <Natalie Bondareva/HOU/EES@EES,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
1,Message-ID: <369679.1075841410758.JavaMail.eva...,Bush names businessmen to Pentagon civilian po...,18,<369679.1075841410758.JavaMail.evans@thyme,"Tue, 24 Apr 2001 19:09:00 -0700 (PDT",vladi.pimenov@enron.co,nbondare@enron.co,go EES!!,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,nbondare <nbondare@enron.com,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
2,Message-ID: <18938798.1075841410826.JavaMail.e...,"Patrick,\n\nwith regard to the deal we did on ...",18,<18938798.1075841410826.JavaMail.evans@thyme,"Mon, 14 May 2001 14:19:00 -0700 (PDT",vladi.pimenov@enron.co,patrick.mulvany@enron.co,Re: physical trad,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Patrick Mulvany <Patrick Mulvany/ENRON@enronXgate,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
3,Message-ID: <7037757.1075841410849.JavaMail.ev...,"Patrick,\n\nwe fixed the deal. Thank you for b...",18,<7037757.1075841410849.JavaMail.evans@thyme,"Wed, 16 May 2001 18:28:00 -0700 (PDT",vladi.pimenov@enron.co,patrick.mulvany@enron.co,Re: physical deal #78391,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Patrick Mulvany <Patrick Mulvany/ENRON@enronXgate,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
4,Message-ID: <32990561.1075841410804.JavaMail.e...,"Rogers Herndon - eto tot muzhiuk, kotoriy el d...",18,<32990561.1075841410804.JavaMail.evans@thyme,"Fri, 4 May 2001 11:37:00 -0700 (PDT",vladi.pimenov@enron.co,nbondare@enron.co,Organization Announcemen,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,nbondare <nbondare@enron.com,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST


In [22]:
new_df.drop(['num_info', 'info'], axis=1, inplace= True)
new_df.head()

Unnamed: 0,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName
0,"ok, whatever",<5633964.1075841410781.JavaMail.evans@thyme,"Tue, 24 Apr 2001 19:24:00 -0700 (PDT",vladi.pimenov@enron.co,natalie.bondareva@enron.co,Re: go EES!!,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Natalie Bondareva <Natalie Bondareva/HOU/EES@EES,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
1,Bush names businessmen to Pentagon civilian po...,<369679.1075841410758.JavaMail.evans@thyme,"Tue, 24 Apr 2001 19:09:00 -0700 (PDT",vladi.pimenov@enron.co,nbondare@enron.co,go EES!!,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,nbondare <nbondare@enron.com,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
2,"Patrick,\n\nwith regard to the deal we did on ...",<18938798.1075841410826.JavaMail.evans@thyme,"Mon, 14 May 2001 14:19:00 -0700 (PDT",vladi.pimenov@enron.co,patrick.mulvany@enron.co,Re: physical trad,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Patrick Mulvany <Patrick Mulvany/ENRON@enronXgate,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
3,"Patrick,\n\nwe fixed the deal. Thank you for b...",<7037757.1075841410849.JavaMail.evans@thyme,"Wed, 16 May 2001 18:28:00 -0700 (PDT",vladi.pimenov@enron.co,patrick.mulvany@enron.co,Re: physical deal #78391,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,Patrick Mulvany <Patrick Mulvany/ENRON@enronXgate,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST
4,"Rogers Herndon - eto tot muzhiuk, kotoriy el d...",<32990561.1075841410804.JavaMail.evans@thyme,"Fri, 4 May 2001 11:37:00 -0700 (PDT",vladi.pimenov@enron.co,nbondare@enron.co,Organization Announcemen,,1.0,text/plain; charset=us-asci,7bi,,Vladi Pimeno,nbondare <nbondare@enron.com,,,"\ExMerge - Pimenov, Vladi\'Sent Mai",PIMENOV-,vladi pimenov 6-26-02.PST


In [23]:
import os

# Define a correct path
directory = "/content/drive/MyDrive/fraudulent/enron_email"
os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist

# Save the file
new_df.to_csv(os.path.join(directory, "emails_cleaned.csv"))
