<a href="https://colab.research.google.com/github/marcoullmann/enron/blob/master/enron_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data downloaded from https://www.kaggle.com/wcukierski/enron-email-dataset

Data preprocessing taken form: https://www.kaggle.com/zichen/explore-enron#1.-Loading-and-cleaning-data

In [0]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/My Drive/Colab Notebooks/enron/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/enron


In [0]:
import os, sys, email, re
import numpy as np 
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set_style('whitegrid')
#import plotly
#plotly.offline.init_notebook_mode()
#import plotly.graph_objs as go
import wordcloud

# Network analysis
import networkx as nx
# NLP
import nltk
nltk.download('stopwords')
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

import glob

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Read the data into a DataFrame
emails_df = pd.read_csv('./data/emails.csv')
print(emails_df.shape)
emails_df.head()

(517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [0]:
# A single message looks like this
print(emails_df['message'][0])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


In [0]:
## Helper functions
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = set(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs

In [0]:
# Parse the emails into a list email objects
messages = list(map(email.message_from_string, emails_df['message']))
emails_df.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]
# Parse content from emails
emails_df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
emails_df['user'] = emails_df['file'].map(lambda x:x.split('/')[0])
del messages

emails_df.head()

Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,allen-p/_sent_mail/1.,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",{phillip.allen@enron.com},{tim.belden@enron.com},,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,allen-p/_sent_mail/10.,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",{phillip.allen@enron.com},{john.lavorato@enron.com},Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,allen-p/_sent_mail/100.,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",{phillip.allen@enron.com},{leah.arsdall@enron.com},Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,allen-p/_sent_mail/1000.,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",{phillip.allen@enron.com},{randall.gay@enron.com},,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,allen-p/_sent_mail/1001.,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",{phillip.allen@enron.com},{greg.piper@enron.com},Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


In [0]:
print('shape of the dataframe:', emails_df.shape)
# Find number of unique values in each columns
for col in emails_df.columns:
    print(col, emails_df[col].nunique())

shape of the dataframe: (517401, 18)
file 517401
Message-ID 517401
Date 224128
From 20328
To 54748
Subject 159290
Mime-Version 1
Content-Type 2
Content-Transfer-Encoding 3
X-From 27980
X-To 73552
X-cc 33701
X-bcc 132
X-Folder 5335
X-Origin 259
X-FileName 429
content 249025
user 150


In [0]:
# Set index and drop columns with two few values
emails_df = emails_df.set_index('Message-ID')\
    .drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
# Parse datetime
emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
emails_df.dtypes

Date          object
From          object
To            object
Subject       object
X-From        object
X-To          object
X-cc          object
X-bcc         object
X-Folder      object
X-Origin      object
X-FileName    object
content       object
user          object
dtype: object

In [0]:
def normalize(text):
  text = clean_text(text)
  tokens = tokenize(text)
  norm = remove_stopwords(tokens)
  return norm

def tokenize(text):
  words = gensim.utils.simple_preprocess(str(text), deacc=True)
  return words

def remove_stopwords(tokens):
  eng_stopwords = set(stopwords.words('english'))
  eng_stopwords.update(("www","com","image", "hou","ect", "pm", "am", "to","cc","subject","http","from","sent","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"))
  non_stop_words = [word for word in tokens if(word not in eng_stopwords)]
  return non_stop_words

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.strip()
    text = text.lower()
    return text

def get_trigrams(words):
    return trigram_mod[bigram_mod[words]]

def lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(" ".join(words)) 
    texts_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return texts_out

In [0]:
# See trigram example
print(trigram_mod[bigram_mod[emails_df['norm_content'][1012]]])

['jacques', 'george', 'finally', 'information', 'please', 'look', 'email', 'wants', 'us', 'buy', 'keith', 'think', 'joke', 'still', 'need', 'speak', 'engineer', 'find', 'soil', 'study', 'determine', 'value', 'going', 'forward', 'believe', 'architect', 'work', 'use', 'us', 'think', 'deserve', 'compensation', 'time', 'due', 'fact', 'intentional', 'project', 'proposing', 'unsupportable', 'market', 'version', 'buyout', 'attached', 'need', 'expert', 'advise', 'ready', 'offer', 'version', 'threaten', 'foreclose', 'case', 'due', 'money', 'time', 'since', 'cost', 'fees', 'hold', 'versus', 'market', 'execute', 'contract', 'think', 'would', 'stand', 'chance', 'time', 'waste', 'want', 'respond', 'offer', 'asap', 'call', 'thoughts', 'phillip', 'forwarded', 'phillip_allen', 'hou', 'ect', 'george_richards_cbpres', 'austin', 'rr', 'com', 'pm', 'please', 'respond_cbpres_austin', 'rr', 'com', 'phillip_allen_pallen', 'enron', 'com', 'keith_holst_kholst', 'enron', 'com', 'larry_lewter_llewter_austin', 'r

In [0]:
# emails_df = emails_df[:10000].copy() #only use 10'000 mails

In [0]:
emails_df.shape

(517401, 17)

In [0]:
emails_df['norm_subject'] = emails_df['Subject'].map(lambda text:normalize(text))
emails_df['norm_content'] = emails_df['content'].map(lambda text:normalize(text))
emails_df['lemma_content'] = emails_df['norm_content'].map(lambda words:lemmatization(words))

In [0]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(emails_df['lemma_content'], min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[emails_df['lemma_content']], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [0]:
emails_df['trigram_lemma_content'] = emails_df['lemma_content'].map(lambda words:get_trigrams(words))

In [0]:
for i,d in enumerate(np.array_split(emails_df, 5)):
     d.to_pickle('./data/emails_df_' + str(i) + ".pkl")

In [0]:
files = glob.glob("./data/emails_df_*.pkl")
emails_df = pd.concat([pd.read_pickle(file) for file in files])

In [0]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')

In [0]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', -1)

In [0]:
emails_df

Unnamed: 0_level_0,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user,norm_subject,norm_content,lemma_content,trigram_lemma_content
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 16:39:00-07:00,{phillip.allen@enron.com},{tim.belden@enron.com},,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p,[],[forecast],[forecast],[forecast]
<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 13:51:00-07:00,{phillip.allen@enron.com},{john.lavorato@enron.com},Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p,[],"[traveling, business, meeting, takes, fun, tri...","[travel, business, meeting, take, fun, trip, e...","[travel, business, meeting, take, fun, trip, e..."
<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 03:00:00-07:00,{phillip.allen@enron.com},{leah.arsdall@enron.com},Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p,[test],"[test, successful, way, go]","[test, successful, way, go]","[test, successful, way, go]"
<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 06:13:00-07:00,{phillip.allen@enron.com},{randall.gay@enron.com},,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p,[],"[randy, send, schedule, salary, level, everyon...","[randy, send, schedule, salary, level, everyon...","[randy, send, schedule, salary, level, everyon..."
<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 05:07:00-07:00,{phillip.allen@enron.com},{greg.piper@enron.com},Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p,[hello],"[let, shoot, tuesday]","[let, shoot, tuesday]","[let, shoot, tuesday]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
<12135969.1075844349791.JavaMail.evans@thyme>,2000-03-29 08:09:00-08:00,{sheila.glover@enron.com},{sara.shackleton@enron.com},ITG - POSIT,Sheila Glover,Sara Shackleton,David J Vitrella,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-S,sshackle.nsf,"Sara,\nDavid is going to follow-up with his co...",shackleton-s,"[itg, posit]","[sara, david, going, follow, contact, posit, t...","[sara, david, go, follow, contact, posit, tomo...","[sara, david, go, follow, contact, posit, tomo..."
<7048674.1075844312445.JavaMail.evans@thyme>,1999-07-06 04:47:00-07:00,{sara.shackleton@enron.com},{marie.heard@enron.com},Hedge Fund provisions for inclusion in the ISD...,Sara Shackleton,Marie Heard,,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-S,sshackle.nsf,"After our meeting, I also agreed to provide su...",shackleton-s,"[hedge, fund, provisions, inclusion, isda, sch...","[meeting, also, agreed, provide, suggested, la...","[meeting, also, agree, provide, suggest, langu...","[meeting, also, agree, provide, suggest, langu..."
<25401260.1075844349819.JavaMail.evans@thyme>,2000-03-29 08:14:00-08:00,{shari.stack@enron.com},"{william.stuart@enron.com, darren.delage@enron...",Westpac,Shari Stack,"Shane Dallmann, William Stuart, Darren Delage,...","Sara Shackleton, Rod Nelson, Gary Hickerson, L...",,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-S,sshackle.nsf,It is still the case that we are having real p...,shackleton-s,[westpac],"[still, case, real, problems, finalizing, isda...","[still, case, real, problem, finalize, isda, m...","[still, case, real, problem, finalize, isda, m..."
<5942884.1075844349842.JavaMail.evans@thyme>,2000-03-29 08:23:00-08:00,{kaye.ellis@enron.com},{dmitchel@cwt.com},Pulp & Paper Disclaimer,Kaye Ellis,dmitchel@cwt.com,,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-S,sshackle.nsf,"David,\n\nI am sending you lastest Pulp & Pape...",shackleton-s,"[pulp, paper, disclaimer]","[david, sending, lastest, pulp, paper, disclai...","[david, send, last, pulp, paper, disclaimer, s...","[david, send, last, pulp_paper, disclaimer, sa..."


In [0]:
emails_df.dtypes

Date                     object
From                     object
To                       object
Subject                  object
X-From                   object
X-To                     object
X-cc                     object
X-bcc                    object
X-Folder                 object
X-Origin                 object
X-FileName               object
content                  object
user                     object
norm_subject             object
norm_content             object
lemma_content            object
trigram_lemma_content    object
dtype: object