## Analysis3: Identify key conversations related to Fraud and Corporate Crime by looking for specific keywords

#### Step 0: Identify risky words

In [1]:
# A pool of words that are considered risky
# because they were either associated with misleading financial statements
# or they were associated with insider trading
risky_words = ['bankruptcy', 'bankrupt', 'stocks', 'shares', 'audit', 'enrn', 'ene', 'nyse', 'nasdaq'
               '401k', 'offshore', 'early', 'notice', 'billion', 'million', 'price', 'warning', 
              'overseas', 'international', 'mark-to-market', 'MTM', 'lawsuit', 'account', 'accounts',
              'shareholder', 'shareholders', 'forum', 'violation']

#### Step 1: Import modules

In [2]:
from email.parser import Parser
import glob
import os
import datetime
import traceback

#### Step 2: Identify directories

In [3]:
# Path to the Enron dataset
path_to_maildir = 'C:/Users/parit/midterm/data/enron/maildir'

In [4]:
# Path to primary convicts mailboxes (Andrew Fastow's mailbox is not present in the dataset)
path_to_kenneth_lay = path_to_maildir + '/lay-k'
path_to_jeff_skilling = path_to_maildir + '/skilling-j'


#### Step 3: Define processing logic

In [5]:
# Function to recursively read mails from folders and sub-folders
def processEmails(path):
    peak_mail_words = []
    dwnfall_mail_words = []
    inner_peak = []
    inner_dwnfall = []
    pathContent = glob.glob(path + '/*')    
    for pathName in pathContent:
        if os.path.isdir(pathName):    
            (inner_peak, inner_dwnfall) = processEmails(pathName)
            peak_mail_words = peak_mail_words + inner_peak
            dwnfall_mail_words = dwnfall_mail_words + inner_dwnfall
        else:
            try:
                with open(pathName, 'r') as filename:
                    email = Parser().parsestr(filename.read())                    
                    mail_date = email['date']
                    mail_subject = email['subject'].split()
                    mail_content = email.get_payload().split()                    
                    mail_words = mail_content + mail_subject
                    mail_date = datetime.datetime.strptime(mail_date[:-6],"%a, %d %b %Y %H:%M:%S %z")
                    if mail_date.year in range(1997,2001):                        
                        peak_mail_words = [r for r in mail_words if str.lower(r) in risky_words]
                    elif mail_date.year == 2001:
                        dwnfall_mail_words = [r for r in mail_words if str.lower(r) in risky_words]
            except:
                traceback.print_exc()
                #print('File open error due to foreign characters:', pathName)
    return (peak_mail_words, dwnfall_mail_words)

#### Step 4: Invoke email processing for Kenneth Lay

In [6]:
# Invoke processEmails() for Kenneth Lay
(before, after) = processEmails(path_to_kenneth_lay)
print('Risky words for Kenneth Lay in first phase:', before)
print('Risky words for Kenneth Lay in second phase:', after)

Risky words for Kenneth Lay in first phase: ['price', 'price', 'million', 'forum', 'early', 'early', 'million', 'account', 'million', 'million', 'million']
Risky words for Kenneth Lay in second phase: ['International', 'price', 'Price', 'price', 'price', 'price', 'price', 'price', 'bankruptcy', 'bankruptcy', 'bankruptcy', 'bankruptcy', 'price']


#### Step 5: Invoke email processing for Jeff Skilling


In [7]:
# Invoke processEmails() for Jeff Skilling
(before, after) = processEmails(path_to_jeff_skilling)
print('Risky words for Jeff Skilling in first phase:', before)
print('Risky words for Jeff Skilling in second phase:', after)

Risky words for Jeff Skilling in first phase: ['million']
Risky words for Jeff Skilling in second phase: ['international', 'INTERNATIONAL', 'International', 'FORUM', 'forum']


### Conclusion for Analysis 3: 

> Judging by the participation levels of Kenneth Lay during both phases, we can say that he was the central piece to the whole puzzle. While Jeff Skilling was primarily involved in spreading fraudulent activities outside of the US.