In [1]:
from __future__ import print_function, unicode_literals, division

from datetime import datetime
from email.parser import Parser
from glob import glob
import numpy as np
import pandas as pd
import string
from sys import exit


import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import sent_tokenize, word_tokenize
# English stop words held in a set for fast membership tests; read by body_analysis
# when filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stop_words = set(stopwords.words('english'))



def clean_address_list(address_list):
    """ Turn a raw, comma-separated address header into a list of addresses.

    All newlines, tabs and spaces are removed before the string is split
    on commas, so folded multi-line headers collapse into clean entries.
    """

    # One translation pass deletes every newline, tab and space character
    whitespace_remover = str.maketrans('', '', '\n\t ')
    compact = address_list.translate(whitespace_remover)

    return compact.split(',')


def folder_to_df(target_folder, maildir="maildir", max_people=10):
    """ Load and parse all emails in the specified folder.

    Args:
        target_folder: Name (or glob pattern) of the per-person sub-folder to
            read, e.g. 'sent'.
        maildir: Root directory holding one sub-directory per person.
        max_people: Maximum number of person directories to scan; None scans
            all of them.

    Returns:
        A DataFrame with one row per successfully parsed email and the columns
        to, cc, bcc, from, date, subject, body, reply, forward. Fields that a
        parsed record does not contain are left as missing values.
    """

    columns = ['to', 'cc', 'bcc', 'from', 'date', 'subject', 'body', 'reply', 'forward']

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []

    # glob returns paths in arbitrary order, so sort before slicing to make the
    # selected subset of people reproducible between runs.
    people = sorted(glob("{}/*".format(maildir)))

    for person in people[:max_people]:

        for folder in glob("{}/{}".format(person, target_folder)):

            for path in glob("{}/*".format(folder)):

                try:
                    content = parse_email(path)

                except UnicodeDecodeError as err:
                    print(err)
                    continue

                # parse_email signals an unreadable file with None; skip it
                # rather than adding an empty row.
                if content is None:
                    continue

                records.append(content)

    return pd.DataFrame(records, columns=columns)


def clean_body(txt):
    """ Flatten body text onto one line by replacing each newline and tab with a space. """

    for separator in ('\n', '\t'):
        txt = txt.replace(separator, ' ')

    return txt


def parse_date(datestr):
    """ Parse an email Date header into a timezone-aware datetime.

    Accepts values such as 'Mon, 14 May 2001 16:39:00 -0700 (PDT)'. The
    optional trailing parenthesised comment may have any length or be absent.

    Raises:
        ValueError: if the remaining text does not match the expected
            '%a, %d %b %Y %H:%M:%S %z' layout.
    """

    # Drop everything from the first '(' onward instead of slicing off a fixed
    # number of characters, so headers without a zone comment (or with a longer
    # zone name) still parse.
    without_comment = datestr.split('(', 1)[0].strip()

    return datetime.strptime(without_comment, '%a, %d %b %Y %H:%M:%S %z')


def parse_email(message):
    """ Parse a single email.

    Args:
        message: Path of the email file to read.

    Returns:
        A dict that always contains the keys 'from', 'to', 'cc', 'bcc',
        'date', 'subject', 'reply', 'body' and 'forward', or None when the
        file cannot be decoded as text.

        'from' is the first cleaned sender address; 'to', 'cc' and 'bcc' are
        lists of cleaned addresses; 'date' is the parsed Date header. Any of
        these, and 'subject' / 'body', is None when the header (or payload)
        is missing or empty. 'reply' is True when the subject contains 'Re:'
        and 'forward' is True when the cleaned body contains '-- Forwarded';
        both are False otherwise.
    """

    try:

        with open(message, 'r') as f:
            raw = f.read()

    except UnicodeDecodeError as err:

        print(err)
        return None

    txt = Parser().parsestr(raw)

    # Start from a complete record so every parsed email exposes the same
    # keys, whether or not a given header is present.
    email = {
        'from': None,
        'to': None,
        'cc': None,
        'bcc': None,
        'date': None,
        'subject': None,
        'reply': False,
        'body': None,
        'forward': False,
    }

    field_from = txt['from']
    if field_from:
        # Only the first address of the From header is kept
        email['from'] = str(clean_address_list(field_from)[0])

    # The recipient headers are all handled the same way
    for header in ('to', 'cc', 'bcc'):
        field_value = txt[header]
        if field_value:
            email[header] = clean_address_list(field_value)

    field_date = txt['date']
    if field_date:
        email['date'] = parse_date(field_date)

    field_subject = txt['subject']
    if field_subject:
        email['subject'] = field_subject
        # NOTE(review): this match is case-sensitive, so subjects using
        # 'RE:' are not flagged — confirm whether that is intended.
        email['reply'] = 'Re:' in field_subject

    # NOTE(review): get_payload() returns a list for multipart messages, which
    # clean_body does not handle — assumes the corpus is plain text; confirm.
    field_body = txt.get_payload()
    if field_body:
        email_body = clean_body(field_body)
        email['body'] = email_body
        email['forward'] = '-- Forwarded' in email_body

    return email


def subject_analysis(subject, sia=None):
    """ Print and return the VADER sentiment scores for an email subject.

    Args:
        subject: Subject line text. Non-string values (e.g. None or NaN for
            an email without a subject) are skipped.
        sia: Optional analyzer object exposing polarity_scores(text). A new
            SentimentIntensityAnalyzer is created when omitted; pass one in
            to reuse it across many calls.

    Returns:
        The result of sia.polarity_scores for the punctuation-stripped
        subject, or None when subject is not a string.
    """

    # Emails without a subject arrive here as None/NaN; there is nothing to score
    if not isinstance(subject, str):
        return None

    # Remove punctuation
    subject = subject.translate(str.maketrans('', '', string.punctuation))

    # Vader sentiment analysis using a pre-defined lexicon
    if sia is None:
        sia = SIA()

    score = sia.polarity_scores(subject)
    print(score)

    return score

In [None]:
# Parse the emails found in each person's 'sent' folder into a single DataFrame.
# The analysis loop further down iterates over sent_df, so this cell has to be
# executed first.
sent_df = folder_to_df('sent')

In [17]:
def body_analysis(body, sia=None):
    """ Clean, tokenize, and analyze the body text.

    Splits the body into sentences, strips punctuation, prints each sentence
    with its VADER sentiment scores, then tokenizes every sentence into words
    and removes stop words.

    Args:
        body: Body text of an email. Non-string values (e.g. None or NaN for
            an email without a body) yield an empty result.
        sia: Optional analyzer object exposing polarity_scores(text). A new
            SentimentIntensityAnalyzer is created when omitted; pass one in
            to reuse it across many calls.

    Returns:
        A list with one entry per sentence, each entry being the list of
        that sentence's word tokens with stop words removed.
    """

    # Emails without a body arrive here as None/NaN; nothing to tokenize
    if not isinstance(body, str):
        return []

    # Tokenize body content into sentences
    sentences = sent_tokenize(body)

    # Remove punctuation (build the translation table once, not per sentence)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sentences = [s.translate(strip_punctuation) for s in sentences]

    # Vader sentiment analysis using a pre-defined lexicon
    if sia is None:
        sia = SIA()

    for line in sentences:
        print('\n')
        print(line)
        score = sia.polarity_scores(line)
        print(score)

    filtered_sentences = []

    for s in sentences:

        # Tokenize sentences into words
        words = word_tokenize(s)

        # Remove stop words. The NLTK English stop-word list is lowercase, so
        # compare on the lowercased token; otherwise capitalised stop words
        # such as a sentence-initial "The" would be kept.
        cleaned = [w for w in words if w.lower() not in stop_words]

        filtered_sentences.append(cleaned)

    return filtered_sentences
    
    

# Preview the analysis on the first sent email only
for idx, row in sent_df.head(1).iterrows():

    # Get body text
    text = row['body']

    # Subject line followed by its sentiment scores
    print(row['subject'])
    subject_analysis(row['subject'])
    print('\n\n')

    # Raw body, then the per-sentence analysis
    print(text)
    print('\n\n')

    body_analysis(text)

Astros Game
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}



Ladies and gentlemen of Class #64,  Jeff Skilling has reserved Drayton McLane's suite at Enron Field for the  October 1 (last game of the regular season) Astros game against the  Milwaukee, and he would like you to be his guests.  First pitch is scheduled  at 7:05p.  Please let me know if you are able to join him.  If you do plan to attend,  please let me know the most convenient way to get a ticket to you.  Don't hesitate to call me should you have any questions.  I look forward to a  positive response.  Regards, Sherri Sera Assistant to Jeff Skilling 713.853.5984 713.646.8381 (fax) sherri.sera@enron.com  PS - if there is anyone from Class #64 that is not on this e-mail  distribution, please forward a copy to them.





Ladies and gentlemen of Class 64  Jeff Skilling has reserved Drayton McLanes suite at Enron Field for the  October 1 last game of the regular season Astros game against the  Milwaukee and he would lik