In [1]:
import numpy as np
import pandas as pd
import re
import random
import email
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics 
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD

from scipy.sparse import coo_matrix, hstack



In [2]:
enron_data = pd.read_csv("emails.csv", header=0, quoting=2)

In [3]:
enron_sent = enron_data[enron_data["file"].str.contains('sent').tolist()]

In [4]:
In [14]: enron_sent['start'], enron_sent['end'], enron_sent['fileno'] = zip(*enron_sent['file'].map(lambda x: x.split('/')))
enron_sent.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,file,message,start,end,fileno
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,allen-p,_sent_mail,1.0
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,allen-p,_sent_mail,10.0
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,allen-p,_sent_mail,100.0
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,allen-p,_sent_mail,1000.0
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,allen-p,_sent_mail,1001.0


In [5]:
enron_sent = enron_sent.assign(sender=enron_sent["file"].map(lambda x: re.search("(.*)/.*sent", x).group(1)).values)
enron_sent.drop("file", axis=1, inplace=True)
enron_sent["sender"].value_counts().head(10)

mann-k          8926
kaminski-v      8644
dasovich-j      5366
germany-c       5128
shackleton-s    4407
jones-t         4123
bass-e          3030
lenhart-m       2759
beck-s          2674
symes-k         2649
Name: sender, dtype: int64

In [6]:
def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result
enron_parsed = pd.DataFrame(list(map(email_from_string, enron_sent.message)))
enron_parsed.head(1)

Unnamed: 0,Bcc,Cc,Content-Transfer-Encoding,Content-Type,Date,From,Message-ID,Mime-Version,Subject,To,X-FileName,X-Folder,X-From,X-Origin,X-To,X-bcc,X-cc,content
0,,,7bit,text/plain; charset=us-ascii,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,<18782981.1075855378110.JavaMail.evans@thyme>,1.0,,tim.belden@enron.com,pallen (Non-Privileged).pst,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Phillip K Allen,Allen-P,Tim Belden <Tim Belden/Enron@EnronXGate>,,,Here is our forecast\n\n


In [419]:
def remove_fwd_org(test_email):
    lines  = test_email.split("\n")
    i = 0
    Fowarded = "-- Forwarded"
    Original = "-----Original Message-----"
    N = len(lines)
    while i < N:
        line  = lines[i]
        if Fowarded in line:
            lines = lines[:i]
            i = N
        if Original in line:
            lines = lines[:i]
            i = N
        if "Sent by:" in line:
            lines = lines[:i]
            i = N
        if "From:" in line:
            lines = lines[:i]
            i = N
        if "Subject:" in line:
            lines = lines[:i-7]
            i = N
        if "To:" in line:
            lines = lines[:i-3]
            i = N
        i+=1

    lines  = [line+"\n" for line in lines]

    new_email = "".join(lines).strip("\n")
    return new_email

In [416]:
df = enron_parsed[enron_parsed.From.str.contains("enron")]
df = df[:][~pd.isnull(df.To)]
df = df[df.To.str.contains("enron")]
df = df[:][pd.isnull(df.Bcc)]
df = df[:][pd.isnull(df.Cc)]
df = df[:][df.To.apply(lambda x: len(str(x).split(","))) == 1]
df.content = df.content.str.strip("\n")
df = df[:][~df.content.str.split("\n").apply(lambda x : "-- Forwarded" in x[0])]
df = df[:][~df.content.str.split("\n").apply(lambda x : "-- Inline" in x[0])]
df["content"] = df.content.apply(remove_fwd_org)
df = df[:][~(df.content == "")]
df["Date"] =  pd.to_datetime(df["Date"], infer_datetime_format=True)
df["Time"] = df["Date"].dt.hour
df["Sender"]=df.From.str.strip("enron.com").str.strip("@")
df["Recipient"]=df.To.str.strip("enron.com").str.strip("@")
df = df[:][~(df.Sender.apply(lambda x: ".." in x))]
df = df[:][~(df.Recipient.apply(lambda x: ".." in x))]
df = df[:][df.Sender.apply(lambda x: "." in x)]
df = df[:][df.Recipient.apply(lambda x: "." in x)]
df = df[:][df.Sender.apply(lambda x: "legal" not in x)]
df = df[:][df.Recipient.apply(lambda x: "legal" not in x)]
df = df[:][df.Sender.apply(lambda x: "trading" not in x)]
df = df[:][df.Recipient.apply(lambda x: "trading" not in x)]
df = df[:][df.Sender.apply(lambda x: "houston" not in x)]
df = df[:][df.Recipient.apply(lambda x: "houston" not in x)]

In [417]:
len(df)

50544

In [418]:
list(np.sort(df.Sender.unique()))

['adhup.kumar',
 'aig.dean',
 'albert.meyers',
 'alhamd.alkhayat',
 'amelia.alder',
 'andall.gay',
 'andrea.ring',
 'andrew.lewis',
 'andy.zipper',
 'angela.mcculloch',
 'ara.semperger',
 'argaret.carson',
 'arie.heard',
 'ark.fisher',
 'ark.guzman',
 'ark.haedicke',
 'ark.koenig',
 'ark.mcconnell',
 'ark.taylor',
 'ark.whitt',
 'arl.tricoli',
 'arol.clair',
 'arol.coats',
 'arol.kincannon',
 'artha.benner',
 'artin.cuilla',
 'ary.fischer',
 'ary.hain',
 'athy.phillips',
 'att.smith',
 'atthew.lenhart',
 'aureen.mcvicker',
 'aureen.raymond',
 'b.gay',
 'b.wilson',
 'barry.tycholiz',
 'benjamin.rogers',
 'bert.badeer',
 'bert.benson',
 'bill.rapp',
 'bill.williams',
 'bin.rodrigue',
 'brad.mckay',
 'brenda.whitehead',
 'd.hayslett',
 'dan.hyvl',
 'dana.davis',
 'danny.mccarty',
 'daren.farmer',
 'darrell.schoolcraft',
 'darron.giron',
 'david.delainey',
 'david.forster',
 'debra.bailey',
 'debra.perlingiere',
 'diana.scholtes',
 'don.baughman',
 'doug.gilbert-smith',
 'drew.fossum',
 'd

In [359]:
rando = random.randint(0, df.shape[0])
print(df.iloc[rando, -1])

Just wanted to let you know,  Royster has closed,  we have fully executed 
agreements.


Dp


Debra Perlingiere
Enron North America Corp.
Legal Department
1400 Smith Street, EB 3885
Houston, Texas 77002
dperlin@enron.com
Phone 713-853-7658
Fax  713-646-3490
