In [1]:
import numpy as np
import pandas as pd

Used ENRON Email Dataset: https://www.kaggle.com/wcukierski/enron-email-dataset
<br>Code from (Part 1 & Part 2): https://towardsdatascience.com/how-i-used-machine-learning-to-classify-emails-and-turn-them-into-insights-efed37c1e66

In [2]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to', 'subject']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return emails

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel

import pandas as pd
def read_email_bodies():
    emails = pd.read_csv('emails.csv')
    email_df = pd.DataFrame(parse_into_emails(emails.head(100000).message))
#     email_df.drop(email_df.query("body == '' | to == '' | from == ''").index, inplace=True)
#     email_df.drop_duplicates(inplace=True)
    return email_df['body']

def read_emails():
    emails = pd.read_csv('emails.csv')
    email_df = pd.DataFrame(parse_into_emails(emails.head(100000).message))
    return email_df

class EmailDataset: 
     def __init__(self):
        stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
        self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
        self.emails = read_email_bodies() 
        # train on the given email data.
        self.train()
  
     def train(self):
        self.vec_train = self.vec.fit_transform(self.emails)
  
     def query(self, keyword, limit):
        vec_keyword = self.vec.transform([keyword])
        cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
        related_email_indices = cosine_sim.argsort()[::-1]
        return related_email_indices

     def find_email_by_index(self, i):
        return self.emails[i]

In [4]:
ds = EmailDataset()
keywords1 = 'meeting schedule today tomorrow pm am meet lunch dinner next week Monday Tuesday Wednesday Thursday Friday Saturday Sunday'
keywords = ['mon', 'tue', 'wed', 'thur', 'fri', 'schedule', \
				'sat', 'sun', 'tomorrow', 'week', 'today', 'month', \
				'morning', 'night', 'lunch', 'meeting', 'year', 'next', \
				'evening', 'pm', 'am', 'p.m.', 'a.m.', 'dinner', 'breakfast', \
				'conference', 'time', 'agenda', 'appointment', \
				'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', \
				'sep', 'oct', 'nov', 'dec', 'date', 'calender']
str_keywords = ' '.join(keywords)
print(str_keywords)
results = ds.query(str_keywords, 100)
# print out the first result.
print(results[0])
print(ds.find_email_by_index(results[0]))
print("")
print(results[50000])
print(ds.find_email_by_index(results[198]))

mon tue wed thur fri schedule sat sun tomorrow week today month morning night lunch meeting year next evening pm am p.m. a.m. dinner breakfast conference time agenda appointment jan feb mar apr may jun jul aug sep oct nov dec date calender
80883
in office mon, tues, wed.  perhaps houston thur/fri.

52180
a) East Midstream and QF Restructuring - Aug 15;b) Industrials - Aug 14;c) Canada Origination - Aug 16;d) West Midstream - Aug 17;e) HPL, LRC and Upstream Origination - Aug 28.We are setting up the remainder as we speak.  I have some final comments onthe packages you gave me; however, the color printing will not fax so we needto find time to discuss.Please fax the Friday "package" and the final Q2 numbers.RegardsDelainey


In [5]:
new_df = read_emails()
new_df['is_useful'] = 1
new_df.loc[results[:70000], "is_useful"] = 1
new_df.loc[results[30000:], "is_useful"] = 0
new_df.head(30)

Unnamed: 0,from,to,subject,body,is_useful
0,phillip.allen@enron.com,tim.belden@enron.com,,Here is our forecast,0
1,phillip.allen@enron.com,john.lavorato@enron.com,Re,Traveling to have a business meeting takes the...,0
2,phillip.allen@enron.com,leah.arsdall@enron.com,Re,test successful. way to go!!!,0
3,phillip.allen@enron.com,randall.gay@enron.com,,"Randy,Can you send me a schedule of the salary...",1
4,phillip.allen@enron.com,greg.piper@enron.com,Re,,0
5,phillip.allen@enron.com,greg.piper@enron.com,Re,"Greg,How about either next Tuesday or Thursday...",0
6,phillip.allen@enron.com,"david.l.johnson@enron.com, john.shafer@enron.com",,Phillip Allen (pallen@enron.com)Mike Grigsby (...,0
7,phillip.allen@enron.com,joyce.teixeira@enron.com,Re,,0
8,phillip.allen@enron.com,mark.scott@enron.com,Re,I don't think these are required by the ISP2. ...,0
9,phillip.allen@enron.com,"""'Pallen@Enron.com'"" <Pallen@Enron.com>",FW,---------------------- Forwarded by Phillip K ...,0


In [6]:
new_df.to_csv(r'labeled_data_100k.csv', index = False)
# new_df.shape