In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
file       517401 non-null object
message    517401 non-null object
dtypes: object(2)
memory usage: 7.9+ MB


In [4]:
# Dictionary for abbreviated names/UID to e-mail address mapping
abrv_names = {}
for row in df['file']:
    name = row.split('/')[0]
    if not name in abrv_names.keys():
        abrv_names[name] = "Temp"

In [5]:
print ("Total unique abbreviated name in file: " + str(len(abrv_names.keys())))
for name in abrv_names.keys():
    print (name)

Total unique abbreviated name in file: 150
allen-p
arnold-j
arora-h
badeer-r
bailey-s
bass-e
baughman-d
beck-s
benson-r
blair-l
brawner-s
buy-r
campbell-l
carson-m
cash-m
causholli-m
corman-s
crandell-s
cuilla-m
dasovich-j
davis-d
dean-c
delainey-d
derrick-j
dickson-s
donoho-l
donohoe-t
dorland-c
ermis-f
farmer-d
fischer-m
forney-j
fossum-d
gang-l
gay-r
geaccone-t
germany-c
gilbertsmith-d
giron-d
griffith-j
grigsby-m
guzman-m
haedicke-m
hain-m
harris-s
hayslett-r
heard-m
hendrickson-s
hernandez-j
hodge-j
holst-k
horton-s
hyatt-k
hyvl-d
jones-t
kaminski-v
kean-s
keavey-p
keiser-k
king-j
kitchen-l
kuykendall-t
lavorato-j
lay-k
lenhart-m
lewis-a
linder-e
lokay-m
lokey-t
love-p
lucci-p
maggi-m
mann-k
martin-t
may-l
mccarty-d
mcconnell-m
mckay-b
mckay-j
mclaughlin-e
merriss-s
meyers-a
mims-thurston-p
motley-m
neal-s
nemec-g
panus-s
parks-j
pereira-s
perlingiere-d
phanis-s
pimenov-v
platter-p
presto-k
quenet-j
quigley-d
rapp-b
reitmeyer-j
richey-c
ring-a
ring-r
rodrique-r
rogers-b
ruscitti-k

In [6]:
max_sent = {}

In [7]:
# Build mapping using X-origin as source
# Find max sent from name
for row in df['message']:
    try:
        uid = row.split('\n')[13].split(' ')[1].lower()
        from_ = row.split('\n')[2].split('@')[0].split(' ')[1]

        # Only map existing uids
        if uid in abrv_names.keys():
            # Ignore one-off corner cases for the mapping purposes only
            if from_ != 'info' and \
                from_ != 'activetrader' and \
                from_ != 'trader' and \
                from_ != 'office-chairman' and \
                from_ != 'office.chairman' and \
                from_ != 'customer-service' and \
                from_ != 'enron.announcements' and \
                from_ != 'doctor' and \
                from_.find('..') == -1:
                    
                if not uid in max_sent.keys():
                    max_sent[uid] = {}

                if not from_ in max_sent[uid].keys():
                    max_sent[uid][from_] = 1
                else:
                    max_sent[uid][from_] += 1

    except Exception as e:
        None

In [8]:
 # Associate uid and job title
for uid in abrv_names.keys():
    max_count = 0
    for from_ in max_sent[uid].keys():
        if max_sent[uid][from_] > max_count:
            abrv_names[uid] = from_
            max_count = max_sent[uid][from_]

In [9]:
# Parse Job Titles Dataset
jt_dict = {}
jt_na_list = []
with open('enron_data.html', 'r') as file_:
    for line in file_.readlines():
        if line != '':
            # ID UID FName LName Rank
            details = line.split()
            uid = details[1]
            rank = ' '.join(details[4:])
            if rank != 'N/A' and rank != 'South N/A':
                jt_dict[uid] = rank
            else:
                jt_na_list.append(uid)

In [10]:
# CEO = 0
# C-Suite = 0
c_count = 0
# President = 1
p_count = 0
# VP = 2
vp_count = 0
# Director = 3
d_count = 0
# IHL = 4
ihl_count = 0
# Manager = 5
m_count = 0
# Trader / Specialist / Employee = 6
tse_count = 0

# UID -> Rank
employee_rank = {}

for rank in jt_dict.keys():
    raw_rank = jt_dict[rank]
    if raw_rank.find('CEO') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 0
        c_count += 1
    elif raw_rank.find('Chief') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 0
        c_count += 1
    elif raw_rank.find('Vice') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 2
        vp_count += 1
    elif raw_rank.find('President') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 1
        p_count += 1
    elif raw_rank.find('Director') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 3
        d_count += 1
    elif raw_rank.find('Lawyer') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 4
        ihl_count += 1
    elif raw_rank.find('Manager') != -1:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 5
        m_count += 1
    else:
        #print (jt_dict[rank])
        #print (rank)
        employee_rank[rank] = 6
        tse_count += 1        

In [11]:
print ('C Level: ' + str(c_count))
print ('Presidents: ' + str(p_count))
print ('VPs: ' + str(vp_count))
print ('Directors: ' + str(d_count))
print ('In-House Lawyers: ' + str(ihl_count))
print ('Managers: ' + str(m_count))
print ('T/S/Es: ' + str(tse_count))

C Level: 8
Presidents: 4
VPs: 17
Directors: 16
In-House Lawyers: 3
Managers: 9
T/S/Es: 51


In [34]:
reverse_abrv_names = {v : k for k, v in abrv_names.items()}
reverse_abrv_names

{'alerts': 'lewis-a',
 'andrea.ring': 'ring-a',
 'andy.zipper': 'zipper-a',
 'barry.tycholiz': 'tycholiz-b',
 'benjamin.rogers': 'rogers-b',
 'bert.meyers': 'meyers-a',
 'beth.cherry': 'wolfe-j',
 'bill.iii': 'merriss-s',
 'bill.rapp': 'rapp-b',
 'bill.williams': 'williams-w3',
 'brad.mckay': 'mckay-b',
 'cara.semperger': 'semperger-c',
 'carol.clair': 'stclair-c',
 'charles.weldon': 'weldon-c',
 'chris.dorland': 'dorland-c',
 'chris.germany': 'germany-c',
 'chris.stokley': 'stokley-c',
 'cooper.richey': 'richey-c',
 'dan.hyvl': 'hyvl-d',
 'dana.davis': 'davis-d',
 'danny.mccarty': 'mccarty-d',
 'daren.farmer': 'farmer-d',
 'darrell.schoolcraft': 'schoolcraft-d',
 'darron.giron': 'giron-d',
 'david.delainey': 'delainey-d',
 'debra.perlingiere': 'perlingiere-d',
 'dgagliardi': 'dean-c',
 'diana.scholtes': 'scholtes-d',
 'don.baughman': 'baughman-d',
 'doug.gilbert-smith': 'gilbertsmith-d',
 'drew.fossum': 'fossum-d',
 'dutch.quigley': 'quigley-d',
 'elizabeth.sager': 'sager-e',
 'eric.b

In [58]:
# from_:to_ {message}
#unique to_ for testing
#to_set = set([])

from_to_ = {}
for index, row in df.iterrows():
    row = row['message']
    from_ = row.split('\n')[2].split('@')[0].split(' ')[1]
    to_ = row.split('\n')[3].split('@')[0].split(' ')[1]
    
    #to_set.add(to_)
    if to_ in reverse_abrv_names.keys() and from_ in reverse_abrv_names.keys() and to_ != from_:
        f_uid = reverse_abrv_names[from_]
        t_uid = reverse_abrv_names[to_]
        if f_uid in employee_rank.keys() and t_uid in employee_rank.keys():
            try:
                subject = row.split('\n')[4].split(' ')[1]
            except:
                subject = row.split('\n')[4]

            message = ''.join(row.split('\n')[15:])
            if message.find('Forwarded') != -1:
                message = 'Forwarded'

            rlshp = from_ + ':' + to_
            if rlshp in from_to_.keys():
                from_to_[rlshp] += ' ' + subject + ' ' + message
            else:
                from_to_[rlshp] = subject + ' ' + message        
    

In [59]:
print ("# Relationships: " + str(len(from_to_.keys())))

# Relationships: 719


In [60]:
# Convert to label -> text
pre_prep_format = []
u_count = 0
nu_count = 0

for key in from_to_.keys():
    from_, to_ = key.split(':')
    f_uid = reverse_abrv_names[from_]
    f_rank = employee_rank[f_uid]
    t_uid = reverse_abrv_names[to_]
    t_rank = employee_rank[t_uid]
    
    if f_rank > t_rank:
        pre_prep_format.append(('upward', from_to_[key]))
        u_count += 1
    elif f_rank < t_rank:
        nu_count += 1
        pre_prep_format.append(('not-upward', from_to_[key]))
    #else:
    #    pre_prep_format.append(('neutral', from_to_[key]))

In [61]:
print ("Labelled messages: " + str(len(pre_prep_format)))
print ("Upward messages: " + str(u_count))
print ("Not-upward messages: " + str(nu_count))

Labelled messages: 494
Upward messages: 256
Not-upward messages: 238


In [105]:
# Text prep
# Punctuation removal
# \t \n \\ replacements
# stop word removal
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

post_prep_format = []

for label, message in pre_prep_format:
    processed_message = message.lower()
    
    # \t \n \\
    processed_message = processed_message.replace('\t', ' ')
    processed_message = processed_message.replace('\n', ' ')
    
    # Punctuation
    processed_message = re.sub(r'\W+', ' ', processed_message)
    
    # Stop words
    processed_message = ' '.join([w for w in processed_message.split() if w not in stop])
    
    # Return
    post_prep_format.append((label, processed_message))

In [106]:
print (pre_prep_format[50])

('upward', 's..bradford@enron.com, X-bcc: X-Folder: \\sbeck\\Deleted ItemsX-Origin: BECK-SX-FileName: sally beck 1-28-02.pstToday, the Euros are coming and John is going to be running this negotiation - here\'s the bad news they are not coming until 3:30pm today so we may have to work late.Barry Tycholiz will be running this team. Barry - talk to Andy on form if you need help - you will also need to think about getting some traders available over week-end.The Schedule outline is based on initial overview discussions/presentations  and then more detailed discussion can occur in individual groups the outline is as follows:It may be better to split up earlier but I leave all those decisions to Andy.  4:00\t\tIntroduction\t\t\tJohn Lavorato & Louise Kitchen4:45\t\tFinancial Overview\t\tWes Colwell5:15\t\tRisk Overview\t\t\tBuy / Port5:45\t\tCredit\t\t\t\tBradford6:15\t\tBack Office\t\t\tBeck/Hall\t6:45\t\tSystems (EnronOnline)\t\tJay Webb\t7:15\t            Mid/Back Office Systems\tJay Web

In [107]:
print (post_prep_format[50])

('upward', 'bradford enron com x bcc x folder sbeck deleted itemsx origin beck sx filename sally beck 1 28 02 psttoday euros coming john going running negotiation bad news coming 3 30pm today may work late barry tycholiz running team barry talk andy form need help also need think getting traders available week end schedule outline based initial overview discussions presentations detailed discussion occur individual groups outline follows may better split earlier leave decisions andy 4 00 introduction john lavorato louise kitchen4 45 financial overview wes colwell5 15 risk overview buy port5 45 credit bradford6 15 back office beck hall 6 45 systems enrononline jay webb 7 15 mid back office systems jay webb infrastructure jenny rub please make available meetings take place 2991tammie please organize lunch refreshments etc john card tammie please check availability thanks continued understanding louise bradford enron com x bcc x folder rbuy inboxx origin buy rx filename richard buy 1 30 0

In [125]:
df = pd.DataFrame(post_prep_format, columns=['label', 'message'])

In [126]:
df_train = df[:round(df.shape[0] *.7)]
df_test = df[round(df.shape[0] *.7):]

In [159]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np

tfidf_transformer = TfidfTransformer()
baseline_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),
])

In [160]:
baseline_clf.fit(df_train['message'], df_train['label'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [161]:
predicted = baseline_clf.predict(df_test['message'])
print ("Initial baseline accuracy: " + str(np.mean(predicted == df_test['label'])))

Initial baseline accuracy: 0.5202702702702703
