In [1]:
import numpy as np
import pandas as pd
import re
import random
import email
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics 
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD

from scipy.sparse import coo_matrix, hstack

In [2]:
enron_data = pd.read_csv("enron-email-dataset/emails.csv", header=0, quoting=2)

In [3]:
enron_data.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
# filtering only those that contain 'sent' in file name (f.e _sent_mail, sent_mail, sent etc) 

enron_sent = enron_data[enron_data["file"].str.contains('sent').tolist()]

In [5]:
# extracting senders (there might me cases like "orgname/sender" but so far as we need only top 10 senders we are ok)

enron_sent = enron_sent.assign(sender=enron_sent["file"].map(lambda x: re.search("(.*)/.*sent", x).group(1)).values)
enron_sent.drop("file", axis=1, inplace=True)
enron_sent["sender"].value_counts().head(10)

mann-k          8926
kaminski-v      8644
dasovich-j      5366
germany-c       5128
shackleton-s    4407
jones-t         4123
bass-e          3030
lenhart-m       2759
beck-s          2674
symes-k         2649
Name: sender, dtype: int64

In [6]:
# mapping top senders' names to use later as label series
# we work only with top 10 senders

top_senders = enron_sent["sender"].value_counts().head(10).index.values
mapping = dict(zip(top_senders, range(10)))
print mapping

{'beck-s': 8, 'mann-k': 0, 'dasovich-j': 2, 'bass-e': 6, 'jones-t': 5, 'germany-c': 3, 'lenhart-m': 7, 'kaminski-v': 1, 'symes-k': 9, 'shackleton-s': 4}


In [7]:
# info

print enron_sent.shape
print enron_sent[enron_sent.sender.isin(top_senders)].shape

enron_sent = enron_sent[enron_sent.sender.isin(top_senders)]

(126846, 2)
(47706, 2)


In [74]:
# now let's take a look at random email

print enron_sent.iloc[random.randint(0, enron_sent.shape[0]), 0]

Message-ID: <19430372.1075846045376.JavaMail.evans@thyme>
Date: Fri, 20 Apr 2001 00:19:00 -0700 (PDT)
From: kay.mann@enron.com
To: ccampbell@kslaw.com, stephen.thome@enron.com, jake.thomas@enron.com
Subject: RE: GE Guaranty Comments
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Kay Mann
X-To: ccampbell@kslaw.com, Stephen Thome, Jake Thomas
X-cc: 
X-bcc: 
X-Folder: \Kay_Mann_June2001_4\Notes Folders\'sent mail
X-Origin: MANN-K
X-FileName: kmann.nsf

---------------------- Forwarded by Kay Mann/Corp/Enron on 04/20/2001 07:18 
AM ---------------------------


"Shoemaker, Kent (GEAE)" <kent.shoemaker@ae.ge.com> on 04/20/2001 07:11:05 AM
To: "'kay.mann@enron.com'" <kay.mann@enron.com>
cc: "Johnson, Lee L (PS, GE AEP)" <lee.johnson@ps.ge.com> 

Subject: RE: GE Guaranty Comments

This is okay.

-----Original Message-----
From: Shoemaker, Kent (PS, SSEP) 
Sent: Thursday, April 19, 2001 7:03 PM
To: Shoemaker, Kent (GEAE)
Subject: FW: GE Gua

In [9]:
# I use default email library just for simplicity. For real product I would use more complicated parsing tools or write my own
# We extract email artificials and content from raw text

def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result

In [10]:
enron_parsed = pd.DataFrame(list(map(email_from_string, enron_sent.message)))

In [11]:
enron_parsed.head(1)

Unnamed: 0,Bcc,Cc,Content-Transfer-Encoding,Content-Type,Date,From,Message-ID,Mime-Version,Subject,To,X-FileName,X-Folder,X-From,X-Origin,X-To,X-bcc,X-cc,content
0,,,7bit,text/plain; charset=us-ascii,"Fri, 9 Mar 2001 11:24:00 -0800 (PST)",eric.bass@enron.com,<17027752.1075840325838.JavaMail.evans@thyme>,1.0,Rebook - QU0663 Mirant,chance.rabon@enron.com,eric bass 6-25-02.PST,"\ExMerge - Bass, Eric\'Sent Mail",Eric Bass,BASS-E,Chance Rabon <Chance Rabon/ENRON@enronXgate>,,,\n---------------------- Forwarded by Eric Bas...


In [12]:
# cc and bcc stand for carbon copy and blind carbon copy and that may be useful for classification
# Also we might use "To" or any other metadata but I believe the idea of this work is to use simply "content" + "subject" 

enron_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47706 entries, 0 to 47705
Data columns (total 18 columns):
Bcc                          9259 non-null object
Cc                           9259 non-null object
Content-Transfer-Encoding    47706 non-null object
Content-Type                 47706 non-null object
Date                         47706 non-null object
From                         47706 non-null object
Message-ID                   47706 non-null object
Mime-Version                 47706 non-null object
Subject                      47706 non-null object
To                           47640 non-null object
X-FileName                   47706 non-null object
X-Folder                     47706 non-null object
X-From                       47706 non-null object
X-Origin                     47706 non-null object
X-To                         47706 non-null object
X-bcc                        47706 non-null object
X-cc                         47706 non-null object
content                   

In [96]:
#here we do simply two things: 1 remove numbers and 2 remove stowords using nltk stopwords corpus

def content_to_wordlist( content, remove_stopwords=False ):
    content = re.sub("[^a-zA-Z]"," ", content)
    words = content.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return ' '.join(words)

In [14]:
# enron_parsed['To'] = enron_parsed['To'].astype(str) # in case we want to use 'To' as information
data = pd.DataFrame(map(content_to_wordlist, 
                              enron_parsed[['Subject', 'content']].apply(lambda x: ' '.join(x), axis=1)), 
                          columns = ["content"])

In [15]:
data = data.assign(sender=enron_sent["sender"].values)
data = data.replace({'sender': mapping})
data.head()

Unnamed: 0,content,sender
0,rebook qu mirant forwarded by eric bass hou ec...,6
1,for your viewing pleasure forwarded by eric ba...,6
2,re fw christmas i think we are going to stay i...,6
3,re i didn t go either today is legs and lower ...,6
4,fwd the perils of limbo forwarded by eric bass...,6


In [16]:
# now we split data for training and test sets

data_train, data_test, y_train, y_test = train_test_split(data.content.values, data.sender.values, test_size=0.25)

In [23]:
data_splitted  = data.content.map(lambda x: x.split(" "))

In [106]:
##import logging
##logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 6           # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(data_splitted, workers=num_workers, size=num_features, min_count = min_word_count, 
                          window = context, sample = downsampling)

model.init_sims(replace=True)
model_name = "300features_40minwords_6context"
model.save(model_name)

2017-03-31 14:35:58,907 : INFO : collecting all words and their counts
2017-03-31 14:35:58,910 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2017-03-31 14:35:59,319 : INFO : PROGRESS: at sentence #10000, processed 2574100 words, keeping 37174 word types
2017-03-31 14:35:59,573 : INFO : PROGRESS: at sentence #20000, processed 4233185 words, keeping 47351 word types
2017-03-31 14:35:59,960 : INFO : PROGRESS: at sentence #30000, processed 6708828 words, keeping 66923 word types
2017-03-31 14:36:00,270 : INFO : PROGRESS: at sentence #40000, processed 8452332 words, keeping 72943 word types
2017-03-31 14:36:00,477 : INFO : collected 78511 word types from a corpus of 9810816 raw words and 47706 sentences
2017-03-31 14:36:00,479 : INFO : Loading a fresh vocabulary
2017-03-31 14:36:00,562 : INFO : min_count=40 retains 10897 unique words (13% of original 78511, drops 67614)
2017-03-31 14:36:00,562 : INFO : min_count=40 leaves 9398465 word corpus (95% of original 9810816, drops 412351)
2017-03-31 14:36:00,614 : INFO : deleting the raw counts dictionary of 78511 items
2017-03-31 14:36:00,618 : INFO : sample=0.001 downsamples 50 most-c

In [86]:
model.wv.syn0.shape

(10897L, 300L)

In [87]:
def makeFeatureVec(text, model, num_features):
    # Function to average all of the word vectors in a given paragraph
    
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    words = text.split(" ")
    
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)

    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])

    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 

    counter = 0.

    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")

    for review in reviews:
        if counter%1000. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs

In [98]:
X = getAvgFeatureVecs(data.content, model, 300)

Review 0 of 47706
Review 1000 of 47706
Review 2000 of 47706
Review 3000 of 47706
Review 4000 of 47706
Review 5000 of 47706
Review 6000 of 47706
Review 7000 of 47706
Review 8000 of 47706
Review 9000 of 47706
Review 10000 of 47706
Review 11000 of 47706
Review 12000 of 47706
Review 13000 of 47706
Review 14000 of 47706
Review 15000 of 47706
Review 16000 of 47706
Review 17000 of 47706
Review 18000 of 47706
Review 19000 of 47706
Review 20000 of 47706
Review 21000 of 47706
Review 22000 of 47706
Review 23000 of 47706
Review 24000 of 47706
Review 25000 of 47706
Review 26000 of 47706
Review 27000 of 47706
Review 28000 of 47706
Review 29000 of 47706
Review 30000 of 47706
Review 31000 of 47706
Review 32000 of 47706
Review 33000 of 47706
Review 34000 of 47706
Review 35000 of 47706
Review 36000 of 47706
Review 37000 of 47706
Review 38000 of 47706
Review 39000 of 47706
Review 40000 of 47706
Review 41000 of 47706
Review 42000 of 47706
Review 43000 of 47706
Review 44000 of 47706
Review 45000 of 47706
R

In [99]:
X.shape

(47706L, 300L)

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, data.sender.values, test_size=0.25)

In [101]:
print X_train.shape
print X_test.shape

(35779L, 300L)
(11927L, 300L)


In [102]:
notnoneix = ~np.isnan(X_train).any(axis=1)
notnoneix_test = ~np.isnan(X_test).any(axis=1)

In [103]:
print X_train[notnoneix].shape
print X_test[notnoneix_test].shape

(35770L, 300L)
(11923L, 300L)


In [104]:
clf = LinearSVC()
clf.fit(np.nan_to_num(X_train[notnoneix]), y_train[notnoneix])

print metrics.accuracy_score(y_test[notnoneix_test], clf.predict(X_test[notnoneix_test]))

0.892896083201


In [105]:
print(classification_report(y_test, clf.predict(np.nan_to_num(X_test))))

             precision    recall  f1-score   support

          0       0.93      0.93      0.93      2216
          1       0.90      0.95      0.92      2193
          2       0.88      0.92      0.90      1326
          3       0.89      0.86      0.87      1256
          4       0.93      0.89      0.91      1087
          5       0.90      0.84      0.87       956
          6       0.85      0.78      0.82       779
          7       0.73      0.79      0.76       721
          8       0.87      0.91      0.89       679
          9       0.97      0.88      0.92       714

avg / total       0.89      0.89      0.89     11927

