In [1]:
from __future__ import print_function, unicode_literals, division

from datetime import datetime
from email.parser import Parser
from glob import glob
import numpy as np
import pandas as pd
import string
from sys import exit


import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import sent_tokenize, word_tokenize
stop_words = set(stopwords.words('english'))


def body_analysis(body):
    """ Clean, tokenize, and analyze the body text. """
    
    # Tokenize body content into sentences
    sentences = sent_tokenize(body)

    # Remove punctuation
    sentences = [s.translate(str.maketrans('', '', string.punctuation)) for s in sentences]
    
    # Remove stopwords
    filtered_sentences = []
    
    for s in sentences:
        
        # Tokenize sentences into words
        words = word_tokenize(s)
        
        # Remove stop words
        cleaned = [w for w in words if not w in stop_words] 
        
        cleaned = ' '.join(cleaned)
        
        filtered_sentences.append(cleaned)
    
    # Vader sentiment analysis using a pre-defined lexicon
    sia = SIA()
    
    results = {
        'neg': [],
        'neu': [],
        'pos': [],
        'compound': []
    }

    for line in filtered_sentences:
        
        score = sia.polarity_scores(line)
        
        for key in score.keys():
            results[key].append(float(score[key]))
            
            
    mean_neg  = np.mean(results['neg'])
    mean_neu  = np.mean(results['neu'])
    mean_pos  = np.mean(results['pos'])
    mean_comp = np.mean(results['compound'])
        
       
    return {'neg': mean_neg, 'neu': mean_neu, 'pos': mean_pos, 'compound': mean_comp}


def clean_address_list(address_list):
    
    address_list = address_list.replace("\n", "")
    address_list = address_list.replace("\t", "")
    address_list = address_list.replace(" ", "")
    
    address_list = address_list.split(',')
    
    return address_list


def clean_body(txt):
    
    tmp = txt.split('\n')
    
    lines = []
    
    for line in tmp:
        
        if '- Forwarded by' in line: break
            
        lines.append(line)
        
    txt = '\n'.join(lines)
    
    txt = txt.replace('\n',' ')
    txt = txt.replace('\t',' ')
    
    return txt


def folder_to_df(target_folder):
    """ Load and parse all emails in the specified folder. """
    
    df = pd.DataFrame(columns=['to', 'cc', 'bcc', 'from', 'date', 'subject', 'body', 'reply', 'forward'])

    for person in glob("maildir/*")[:10]:

        for idx, folder in enumerate(glob("{}/{}".format(person, target_folder))):

            emails = glob("{}/*".format(folder))

            for email in emails:

                try:
                    content = parse_email(email)
                    
                    df = df.append(pd.Series(content),ignore_index=True)

                except UnicodeDecodeError as err:
                    print(err)
                    continue
                    
    return df


def parse_date(datestr):
    
    return datetime.strptime(datestr[:-5].rstrip(), '%a, %d %b %Y %H:%M:%S %z')


def parse_email(message):
    """ Parse a single email. """
     
    try:
           
        with open(message, 'r') as f:
            raw = f.read()
        
    except UnicodeDecodeError as err:
        
        print(err)
        return None
    
    
    keys = ['from', 'to', 'cc', 'bcc', 'date', 'subject', 'body',
            'x-from', 'x-to', 'x-cc', 'x-bcc', 'x-folder', 'x-origin',
            'subject_neg', 'subject_neu', 'subject_pos', 'subject_comp',
            'body_neg', 'body_neu', 'body_pos', 'body_comp']
    
    email = {}
    
    for key in keys:
        email[key] = None

    
    txt = Parser().parsestr(raw)

    field_to      = txt['to']
    field_cc      = txt['cc']
    field_bcc     = txt['bcc']
    field_from    = txt['from']
    field_subject = txt['subject']
    field_body    = txt.get_payload()
    field_date    = txt['date']
    
    X_from        = txt['x-from']
    X_to          = txt['x-to']
    X_cc          = txt['x-cc']
    X_bcc         = txt['x-bcc']
    X_folder      = txt['x-folder']
    X_origin      = txt['x-origin']
    
    
    if field_from:
        
        email_from = str(clean_address_list(field_from)[0])
        
        email['from'] = email_from
        
    
    if field_to:
        
        email_to = clean_address_list(field_to)
        
        email['to'] = email_to
    
            
    if field_cc:
        
        email_cc = clean_address_list(field_cc)
        
        email['cc'] = email_cc
        
            
    if field_bcc:
        
        email_bcc = clean_address_list(field_bcc)
        
        email['bcc'] = email_bcc
        
        
    if field_date:
        
        email_date = parse_date(field_date)
        
        email['date'] = email_date
        
        
    if field_subject:
        
        email_subject = field_subject
        
        email['subject'] = email_subject
        
        if 'Re:' in email_subject:
            
            email['reply'] = True
            
        else:
            
            email['reply'] = False
            
        # Sentiment analysis
        score = subject_analysis(email['subject'])
        
        email['subject_neg'] = score['neg']
        email['subject_neu'] = score['neu']
        email['subject_pos'] = score['pos']
        email['subject_comp'] = score['compound']
            
            
    if field_body:
        
        email_body = clean_body(field_body)
        
        email['body'] = email_body
        
        if '-- Forwarded' in email_body:
            
            email['forward'] = True
            
        else:
            
            email['forward'] = False
            
        # Sentiment analysis
        score = body_analysis(email['body'])
        
        email['body_neg'] = score['neg']
        email['body_neu'] = score['neu']
        email['body_pos'] = score['pos']
        email['body_comp'] = score['compound']
            
            
    email['x-from']   = X_from
    email['x-to']     = X_to
    email['x-cc']     = X_cc
    email['x-bcc']    = X_bcc
    if X_folder: email['x-folder'] = parse_folder(X_folder)
    email['x-origin'] = X_origin
            
    
    return email


def parse_folder(folder_path):
    
    folder = folder_path.split('\\')[-1]
    return folder.replace(' ','_')


def subject_analysis(subject):
    
    # Remove punctuation
    subject = subject.translate(str.maketrans('', '', string.punctuation))
    
    # Vader sentiment analysis using a pre-defined lexicon
    sia = SIA()
    results = []

    return sia.polarity_scores(subject)
    

In [2]:
sent_df = folder_to_df('sent')

print(sent_df)

sent_df.to_csv('sent.csv')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


                                                     to  \
0                              [scrapmet@ix.netcom.com]   
1                               [gailg@morgangroup.com]   
2                              [amelia.alder@enron.com]   
3                             [stanmarek@marekbros.com]   
4                         [vanessa.groscrand@enron.com]   
5                              [david.porter@enron.com]   
6                               [ssoles@ziffenergy.com]   
7                               [sherri.sera@enron.com]   
8                              [ranabir.dutt@enron.com]   
9     [kenneth.lay@enron.com, cliff.baxter@enron.com...   
10                                 [ddale@vignette.com]   
11                         [ronald_ridlehuber@agfg.com]   
12                             [kevin.hannon@enron.com]   
13                          [craig_shackleton@amat.com]   
14                                  [kcompton@kpcb.com]   
15                             [maktay@superonline.com] 

In [3]:
df = sent_df[sent_df['body_comp'] < 0]

for idx, row in df.iterrows():
    
    # Get body text
    text = row['body']
    
    # Get compound SA score
    score = row['body_comp']
    
    print(score)
    print(text)
    print('\n\n')

-0.004428571428571433
No problem.     Paul Murphy <pmurphy@swbanktx.com> on 03/26/2001 05:45:15 PM To: "'jskilli@enron.com'" <jskilli@enron.com> cc:    Subject: Late     I am in traffic.? 5-10 mins late  -------------------------    CONFIDENTIALITY NOTICE:  ************************************************************************  The information contained in this ELECTRONIC MAIL transmission is confidential.  It may also be privileged work product or proprietary information. This information is intended for the exclusive use of the addressee(s).  If you are not the intended recipient, you are hereby notified that any use, disclosure, dissemination, distribution [other than to the addressee(s)], copying or taking of any action because of this information is strictly prohibited.  ************************************************************************   



-0.040100000000000004
John, any chance you can sent this e-mail to me?  Apparently it has a  "confidential" stamp on it, so even tho