### Import Libraries and Packages

In [1]:
from os import walk
from warnings import filterwarnings
import sys, email, csv
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import codecs
import sklearn as sk
import pickle

In [2]:
%matplotlib inline

### Import the Kaggle-Enron emails file that has been aggregated into one CSV file.

In [3]:
# Load Enron data
# NOTE(review): error_bad_lines=False makes pandas drop lines with too many fields
# instead of raising. The keyword was deprecated in pandas 1.3 in favour of
# on_bad_lines -- confirm the installed pandas version before rerunning.
emails_df = pd.read_csv('emails.csv', parse_dates=True, engine="python", delimiter=",", encoding='utf-8', error_bad_lines=False)

### Inspect the size and initial values.

In [8]:
emails_df.shape

(843222, 2)

In [5]:
emails_df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
# Turn the literal string 'None' into NaN, then drop every row that contains a NaN.
# The result is a new frame; emails_df itself is left untouched.
emails_df_NA = emails_df.replace(to_replace='None', value=np.nan).dropna()

In [7]:
emails_df_NA.shape

(554134, 2)

In [8]:
emails_df_NA.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


# Data Wrangling

### These data cleaning steps for the Kaggle-Enron file extract the text from the message header and create additional features (to, from, content, etc.)

In [5]:
## Helper function 1
def get_text_from_email(msg):
    '''Return the concatenated payloads of every text/plain part of an email message.

    msg is walked with msg.walk(); parts whose content type is not
    'text/plain' are ignored. An empty string is returned when no part matches.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

In [6]:
## Helper function 2
def split_email_addresses(line):
    '''Split a comma-separated address string into a frozenset of stripped addresses.

    Returns None when line is empty or None.
    '''
    if not line:
        return None
    return frozenset(piece.strip() for piece in line.split(','))

In [None]:
# Map Enron email data to be added into pandas dataframe
# Python 2 only: the reload() builtin and sys.setdefaultencoding() do not exist in
# Python 3. Together they switch the interpreter-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Parse the emails into a list of email objects
messages = list(map(email.message_from_string, emails_df_NA['message']))
# The raw text is dropped in place, so this cell cannot be rerun without first
# rebuilding emails_df_NA (the 'message' column read above is gone afterwards).
emails_df_NA.drop('message', axis=1, inplace=True)

# Get fields from parsed email objects
# The header names come from the first message only; every message is then
# looked up with those same names, one new column per header.
keys = messages[0].keys()
for key in keys:
    emails_df_NA[key] = [doc[key] for doc in messages]

# Parse content from emails
emails_df_NA['content'] = list(map(get_text_from_email, messages))

# Split multiple email addresses
# Both columns become frozensets of addresses (or None when the header is empty).
emails_df_NA['From'] = emails_df_NA['From'].map(split_email_addresses)
emails_df_NA['To'] = emails_df_NA['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
# i.e. the text before the first '/' of the file path
emails_df_NA['user'] = emails_df_NA['file'].map(lambda x:x.split('/')[0])
# Release the parsed message objects; they are not needed once the columns exist.
del messages

In [9]:
emails_df_NA.head()

Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,allen-p/_sent_mail/1.,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",(phillip.allen@enron.com),(tim.belden@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,allen-p/_sent_mail/10.,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,allen-p/_sent_mail/100.,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,allen-p/_sent_mail/1000.,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",(phillip.allen@enron.com),(randall.gay@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,allen-p/_sent_mail/1001.,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


### This data cleaning is for the email tags. These tags are then joined to the Kaggle-Enron set based on the "date" field, which is treated as a unique identifier for each email.

In [10]:
# Gather training tags
# Walk the current directory and, for every .csv file that sits under a folder
# named after one of the tag categories, record (category, file name, date).

# Category folder names, tested in this order against the directory path; the
# first one found wins (same precedence as the original if/elif chain).
TAG_CATEGORIES = ("personal", "business", "chain_mails", "enron_online",
                  "general_announcements", "human_resources")

outlst = []

for dirpath, _dirnames, filenames in walk("."):
    # The category depends only on the directory path, so resolve it once per directory.
    tag = next((cat for cat in TAG_CATEGORIES if cat in dirpath), None)
    if tag is None:
        continue

    for fle in filenames:
        if ".csv" not in fle:
            continue
        path = dirpath + "/" + fle

        # Only the first row is needed; the with-block closes the handle right away.
        with codecs.open(path, 'rb') as csvfile:
            first_row = next(csv.reader(csvfile, delimiter='.'), None)
        if first_row is None:
            # Empty file: there is no date line to read, so skip it.
            continue

        # str(first_row) is the repr of a list; the slice drops its first 8 and
        # last 2 characters. Assumes the row looks like ['Date: <date>'] so that
        # only <date> remains -- TODO confirm against the tag files.
        dte = str(first_row)[8:-2]
        outlst.append((tag, fle, dte))
            

In [11]:
# Load training tags into pandas dataframe
tags = pd.DataFrame(list(outlst), columns=['tags','file','date'])

In [12]:
# Join the two datasets
# Inner join on the raw Date header string: only emails whose Date exactly equals
# a tagged date are kept.
# NOTE(review): every email sharing a timestamp receives the same tag (the first
# three rows of the merged output below match one tag file to three emails) --
# confirm this is intended.
emails_merged_full = pd.merge(emails_df_NA, tags, left_on = 'Date', right_on = 'date', how = 'inner')

In [13]:
# Number of training samples
emails_merged_full.shape

(12607, 21)

### Here is our full dataset ready to be used for machine learning modeling.

In [17]:
# Curated dataset ready for scikit-learn tuning
emails_merged_full.head()

Unnamed: 0,file_x,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,...,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user,tags,file_y,date
0,allen-p/_sent_mail/26.,<15164543.1075855378954.JavaMail.evans@thyme>,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re: This morning's Commission meeting delayed,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\n---------------------- Forwarded by Phillip ...,allen-p,enron_online,Ron.williams-w3.timbelden.2.csv,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)"
1,allen-p/sent_items/61.,<24466219.1075858638656.JavaMail.evans@thyme>,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re: This morning's Commission meeting delayed,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,...,,,"\PALLEN (Non-Privileged)\Allen, Phillip K.\Sen...",Allen-P,PALLEN (Non-Privileged).pst,\n---------------------- Forwarded by Phillip ...,allen-p,enron_online,Ron.williams-w3.timbelden.2.csv,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)"
2,williams-w3/timbelden/2.,<12767107.1075840010399.JavaMail.evans@thyme>,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)",(tim.belden@enron.com),"(robert.badeer@enron.com, bill.iii@enron.com, ...",Daily EOL/ICE Summary 4/24,1.0,text/plain; charset=us-ascii,7bit,Tim Belden <Tim Belden/HOU/ECT@ECT>,...,,,"\ExMerge - Williams III, Bill\TimBelden",WILLIAMS-W3,,The last couple of days on eol have been amazi...,williams-w3,enron_online,Ron.williams-w3.timbelden.2.csv,"Wed, 25 Apr 2001 16:52:00 -0700 (PDT)"
3,allen-p/_sent_mail/440.,<29628244.1075855725718.JavaMail.evans@thyme>,"Mon, 19 Mar 2001 00:45:00 -0800 (PST)",(phillip.allen@enron.com),(llewter@austin.rr.com),Re: Buyout,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,...,,,\Phillip_Allen_June2001\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Larrry,\n\nI realize you are disappointed abou...",allen-p,business,Ron.beck-s.apollo__beth.9.csv,"Mon, 19 Mar 2001 00:45:00 -0800 (PST)"
4,allen-p/all_documents/459.,<18425275.1075855696118.JavaMail.evans@thyme>,"Mon, 19 Mar 2001 00:45:00 -0800 (PST)",(phillip.allen@enron.com),(llewter@austin.rr.com),Re: Buyout,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,...,,,\Phillip_Allen_June2001\Notes Folders\All docu...,Allen-P,pallen.nsf,"Larrry,\n\nI realize you are disappointed abou...",allen-p,business,Ron.beck-s.apollo__beth.9.csv,"Mon, 19 Mar 2001 00:45:00 -0800 (PST)"


### Since it would be nice to run the data cleaning process only once, pickle lets you save the data frame and reload it each time the Jupyter notebook is started from scratch.

In [73]:
# Pickle process to save data in case you don't want to rerun the jupyter notebook
# The with-block flushes and closes save.p as soon as the dump finishes.
with open("save.p", "wb") as pickle_out:
    pickle.dump(emails_merged_full, pickle_out)

In [74]:
# Pickle process to load data
# NOTE: unpickling can execute arbitrary code; only load a save.p that this
# notebook wrote itself.
with open("save.p", "rb") as pickle_in:
    emails_merged_full = pickle.load(pickle_in)

In [14]:
# Class labels, spelled exactly like the tags assigned when the training tags
# are gathered, and listed alphabetically -- the order scikit-learn uses for
# string labels, so the list can be passed as target_names to a report.
target_vals = ['business', 'chain_mails', 'enron_online',
               'general_announcements', 'human_resources', 'personal']

### This count vectorizer step takes the content of the emails and converts it into a sparse matrix whose values are the number of times each word appears.

In [15]:
# From http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# Count Vectorize the data

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(emails_merged_full['content'])

In [22]:
X_train_counts.shape

(12607, 38051)

In [23]:
count_vect.vocabulary_.get(u'enron')

14233

### TF-IDF (Term Frequency - Inverse Document Frequency) weights how often a word appears in an email against how many documents that word appears in overall. This down-weights common words (the, it, or, and, etc.) so that they are not perceived as important values by the classifiers below.

In [16]:
# Tf-idf the content
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(12607, 38051)

In [17]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(12607, 38051)

### In order to validate our models later, split the data into a test and training set.

In [19]:
# split train and test set
from sklearn.model_selection import train_test_split

# 80/20 split. The fixed random_state makes the split -- and every score
# computed from it -- reproducible across kernel restarts.
enron_train, enron_test = train_test_split(emails_merged_full, test_size=0.2, random_state=42)

# Email bodies of the held-out rows
emails_test = enron_test.content

In [32]:
enron_test.shape

(2522, 21)

In [33]:
X_new_counts_2.shape

(2522, 36747)

In [34]:
enron_train.shape

(10085, 21)

In [35]:
predicted_2.shape

(2522,)

# Modeling/Machine Learning

### Our first model, Multinomial Naive Bayes, produces a probability for each email in each classification and tags the email with the highest value to that class.

In [21]:
# Predict Values

# Multinomial Naive Bayes: vectorize -> tf-idf -> fit -> predict, written out step by step.
# The vectorizer and transformer are fitted on the training split only and then
# reused, unchanged, on the test split.
from sklearn.naive_bayes import MultinomialNB

# Taking the training data and turning it into a count feature vector
count_vect_2 = CountVectorizer()
X_train_counts_2 = count_vect_2.fit_transform(enron_train.content)

# Turning the vector into a weighted feature vector
tfidf_transformer_2 = TfidfTransformer()
X_train_tfidf_2 = tfidf_transformer_2.fit_transform(X_train_counts_2)

# Using a specific algo to train the model
clf_2 = MultinomialNB().fit(X_train_tfidf_2, enron_train.tags)

# Prepare the test data into a count vector
X_new_counts_2 = count_vect_2.transform(enron_test.content)
# Turning the vector into a weighted feature vector for the test data
X_new_tfidf_2 = tfidf_transformer_2.transform(X_new_counts_2)

# Making a prediction for the test data
predicted_2 = clf_2.predict(X_new_tfidf_2)

# Measure the accuracy: fraction of test emails whose predicted tag equals the true tag
np.mean(predicted_2 == enron_test.tags)

0.74900872323552736

### Our second model, a linear SVC, finds hyperplanes that separate the training data so that each classification is clearly divided from the others.

In [22]:
#LinearSupportVectorClassifier
# Same steps as the Naive Bayes cell, with LinearSVC as the classifier.
from sklearn.svm import LinearSVC

# Word counts, fitted on the training emails only
count_vect_3 = CountVectorizer()
X_train_counts_3 = count_vect_3.fit_transform(enron_train.content)

# Tf-idf weighting of the training counts
tfidf_transformer_3 = TfidfTransformer()
X_train_tfidf_3 = tfidf_transformer_3.fit_transform(X_train_counts_3)

# Train on the weighted training matrix and the training tags
clf_3 = LinearSVC().fit(X_train_tfidf_3, enron_train.tags)

# Apply the already-fitted vectorizer and transformer to the test emails
X_new_counts_3 = count_vect_3.transform(enron_test.content)
X_new_tfidf_3 = tfidf_transformer_3.transform(X_new_counts_3)

# Predict a tag for every test email
predicted_3 = clf_3.predict(X_new_tfidf_3)

# Accuracy: fraction of test emails whose predicted tag equals the true tag
np.mean(predicted_3 == enron_test.tags)

0.94448850118953209

### Our third model, stochastic gradient descent, fits a linear model to the data by continuously updating the importance (weight) of each word in the data set. After enough iterations, updating these weights produces a fit that allows the model to accurately place new data points and correctly classify the test data.

In [23]:
#StochasticGradientDescentClassifier
# Same steps as the Naive Bayes cell, with SGDClassifier as the classifier.
# NOTE(review): no random_state is passed to SGDClassifier, so the score below
# may differ between runs -- confirm whether that is acceptable.
from sklearn.linear_model import SGDClassifier

# Word counts, fitted on the training emails only
count_vect_4 = CountVectorizer()
X_train_counts_4 = count_vect_4.fit_transform(enron_train.content)

# Tf-idf weighting of the training counts
tfidf_transformer_4 = TfidfTransformer()
X_train_tfidf_4 = tfidf_transformer_4.fit_transform(X_train_counts_4)

# Train on the weighted training matrix and the training tags
clf_4 = SGDClassifier().fit(X_train_tfidf_4, enron_train.tags)

# Apply the already-fitted vectorizer and transformer to the test emails
X_new_counts_4 = count_vect_4.transform(enron_test.content)
X_new_tfidf_4 = tfidf_transformer_4.transform(X_new_counts_4)

# Predict a tag for every test email
predicted_4 = clf_4.predict(X_new_tfidf_4)

# Accuracy: fraction of test emails whose predicted tag equals the true tag
np.mean(predicted_4 == enron_test.tags)

0.93021411578112612

# Reporting

### For each classification model, a report and confusion matrix was printed to show the precision and accuracy of each model (F1-score), as well as which classifications the models were correctly (or incorrectly) assigning emails to.

In [24]:
#Multinomial Naive Bayes
# Python 2 only: reload() and sys.setdefaultencoding() do not exist in Python 3.
# sys.stdout is saved before reload(sys) and put back afterwards, presumably so
# that cell output keeps going to the notebook after the reload -- confirm.
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

from sklearn import metrics
# Per-class precision / recall / F1 for the Naive Bayes predictions on the test split
report_2 = metrics.classification_report(enron_test.tags, predicted_2)
print(report_2)
# , target_names = target_vals

In [25]:
metrics.confusion_matrix(enron_test.tags, predicted_2)

array([[698,   0,   0,  90,   0,   7],
       [  2,   0,   0,   0,   0,   4],
       [ 35,   0,   9,  12,   0,   0],
       [ 52,   0,   0, 815,   0,   1],
       [156,   0,   0,  70,  18,   2],
       [161,   0,   0,  41,   0, 349]])

In [26]:
#SVC
report_3 =  metrics.classification_report(enron_test.tags, predicted_3)
print(report_3)

In [27]:
metrics.confusion_matrix(enron_test.tags, predicted_3)

array([[752,   0,   1,  11,   8,  23],
       [  1,   5,   0,   0,   0,   0],
       [  6,   0,  44,   6,   0,   0],
       [ 22,   0,   0, 840,   1,   5],
       [ 16,   0,   0,   4, 222,   4],
       [ 23,   0,   0,   6,   3, 519]])

In [42]:
#SGD
report_4 =  metrics.classification_report(enron_test.tags, predicted_4)
print(report_4)

In [43]:
metrics.confusion_matrix(enron_test.tags, predicted_4)

array([[764,   0,   1,  24,   8,  17],
       [  0,   5,   0,   0,   0,   1],
       [ 10,   0,  48,   0,   0,   1],
       [ 25,   0,   9, 865,   5,   1],
       [ 36,   0,   0,   4, 186,   8],
       [ 35,   0,   0,   6,   4, 459]])

In [None]:
# Sorting the indices shows that there's no overlapping data in both the 
# training and test sets. This confirms that from sklearn.model_selection import train_test_split
# worked and validates our training runnings.
enron_train.sort_index()

In [None]:
enron_test.sort_index()

In [44]:
# This will match the word index back to the particular class
clf_2.feature_log_prob_


array([[ -7.83492143,  -7.94785047, -11.02964212, ..., -11.02964212,
        -10.94883206, -11.02964212],
       [-10.48615122, -10.52820499, -10.52820499, ..., -10.52820499,
        -10.52820499, -10.52820499],
       [ -9.51673258,  -9.70873553, -10.56976036, ..., -10.56976036,
        -10.56976036, -10.56976036],
       [ -7.93176251,  -9.11840093, -10.93987481, ..., -11.12588365,
        -11.12588365, -11.12588365],
       [ -7.79185651,  -8.80525667, -10.71273363, ..., -10.71273363,
        -10.71273363, -10.71273363],
       [ -7.94593507,  -9.34947906, -10.67568843, ..., -10.68552128,
        -10.87273204, -10.86306173]])

In [28]:
count_vect_2.vocabulary_

{u'sowell': 31537,
 u'chudson': 9256,
 u'woods': 36299,
 u'paolis': 25426,
 u'hanging': 16949,
 u'woody': 36308,
 u'lenci': 20987,
 u'5989': 3100,
 u'hermans': 17358,
 u'caney': 8432,
 u'5981': 3098,
 u'5984': 3099,
 u'rtariff': 29561,
 u'hermann': 17357,
 u'dswerst': 12867,
 u'hermani': 17356,
 u'capoeira': 8477,
 u'kimmorrell': 20213,
 u'rawhide': 27924,
 u'bringing': 7850,
 u'four': 15464,
 u'energy_plans_comparison': 13746,
 u'grueling': 16650,
 u'wednesday': 35843,
 u'coordinate': 10447,
 u'inevitably': 18444,
 u'tumey': 34369,
 u'0057': 82,
 u'270': 1678,
 u'271': 1683,
 u'272': 1687,
 u'273': 1689,
 u'274': 1691,
 u'275': 1695,
 u'276': 1698,
 u'277': 1700,
 u'278': 1703,
 u'279': 1705,
 u'd5452': 11159,
 u'gmommyclark': 16273,
 u'peitiion': 25765,
 u'deferred': 11585,
 u'cooking': 10427,
 u'hockenbery': 17565,
 u'slotting': 31206,
 u'warmongering': 35682,
 u'designing': 11878,
 u'3d00000433a006e2ba7': 2262,
 u'27a': 1707,
 u'crouch': 10856,
 u'9257': 4086,
 u'mailings': 21811,


### Naive Bayes Reporting

### Although Naive Bayes did not produce the most accurate or precise model compared to the SVC or SGD ones, it does allow for the engineer to see which words were assigned the highest log probability values. These values help determine which classification to give the email, so they can be interesting to review and give insight into the dataset. 

#### Note: You must run this for each category in order to see the log probability values.

In [76]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 0 of the Naive
# Bayes model (rows follow clf_2.classes_; row 0 should be 'business' -- confirm),
# sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_business = sorted(
    ((word, clf_2.feature_log_prob_[0, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [77]:
# Show sorted word output.
tple_lst_business

[(u'the', -5.2344390447471332),
 (u'to', -5.5326198225148131),
 (u'and', -5.9118128902144278),
 (u'ect', -5.9885707257438465),
 (u'of', -6.0590707933703145),
 (u'enron', -6.1221837255157627),
 (u'you', -6.1926160393274081),
 (u'in', -6.2502001944603141),
 (u'for', -6.3572072269973452),
 (u'is', -6.4207707714190683),
 (u'on', -6.4789012429524124),
 (u'that', -6.5070746354992046),
 (u'we', -6.5413514223621947),
 (u'com', -6.5448994667459468),
 (u'this', -6.5619840519833668),
 (u'hou', -6.6197132934115857),
 (u'have', -6.6640093676177647),
 (u'please', -6.6659553649840637),
 (u'be', -6.6688923358369445),
 (u'will', -6.6700940261297568),
 (u'with', -6.7273921653662683),
 (u'your', -6.7641778852759886),
 (u'at', -6.8037497118668808),
 (u'or', -6.8133979863195311),
 (u'if', -6.8234351516471161),
 (u'attached', -6.841712624004324),
 (u'from', -6.8681620399160481),
 (u'as', -6.871087667521163),
 (u'it', -6.8816331019500376),
 (u'me', -6.9042790027807319),
 (u'20', -6.9094433186336772),
 (u'200

In [78]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 1 of the Naive
# Bayes model (rows follow clf_2.classes_; row 1 should be 'chain_mails' -- confirm),
# sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_cm = sorted(
    ((word, clf_2.feature_log_prob_[1, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [79]:
tple_lst_cm

[(u'the', -9.2715167371058023),
 (u'you', -9.4911095288583631),
 (u'to', -9.5894030139134969),
 (u'your', -9.6740329583020266),
 (u'and', -9.7002726600961644),
 (u'com', -9.7100551505311508),
 (u'of', -9.7539218706547111),
 (u'mail', -9.7561803858792775),
 (u'boerne', -9.7700742963378069),
 (u'tourists', -9.7835471540967038),
 (u'mustard', -9.7946916397650021),
 (u'that', -9.7956851372571396),
 (u'it', -9.8296059607407518),
 (u'dagogo', -9.860192848397368),
 (u'rebels', -9.860192848397368),
 (u'kalu', -9.860192848397368),
 (u'men', -9.8607600376048996),
 (u'avoided', -9.8646085976637412),
 (u'who', -9.881056721130383),
 (u'interesting', -9.9070799781674506),
 (u'in', -9.9135297223382022),
 (u'they', -9.9537066050934317),
 (u'them', -9.9610203912215436),
 (u'diamond', -9.9687365549142903),
 (u'is', -9.9720260765370767),
 (u'for', -9.9758230513417079),
 (u'am', -9.9838465816707309),
 (u'this', -9.98951150049629),
 (u'are', -9.9982270398085902),
 (u'glad', -10.003077908166993),
 (u'not', 

In [None]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 2 of the Naive
# Bayes model (rows follow clf_2.classes_; row 2 should be 'enron_online' -- confirm),
# sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_imo = sorted(
    ((word, clf_2.feature_log_prob_[2, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [None]:
tple_lst_imo

In [80]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 3 of the Naive
# Bayes model (rows follow clf_2.classes_; row 3 should be
# 'general_announcements' -- confirm), sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_ga = sorted(
    ((word, clf_2.feature_log_prob_[3, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [81]:
tple_lst_ga

[(u'20', -4.7810020020779875),
 (u'the', -4.9325254336238746),
 (u'and', -5.2304013315184568),
 (u'to', -5.3091571978161367),
 (u'enron', -5.364592331533772),
 (u'of', -5.5016847159071176),
 (u'will', -5.7089595216371176),
 (u'in', -5.7598478677892659),
 (u'for', -6.0143773222161423),
 (u'you', -6.183595529067909),
 (u'01', -6.2290358428208341),
 (u'your', -6.2848132508752146),
 (u'as', -6.3002473770226679),
 (u'on', -6.3408682594484311),
 (u'be', -6.3481403877438121),
 (u'our', -6.4252967405773944),
 (u'this', -6.4360917825275248),
 (u'we', -6.4498917156191613),
 (u'that', -6.4685134529503188),
 (u'is', -6.5381187220034409),
 (u'are', -6.5692915697662722),
 (u'business', -6.5865521519219143),
 (u'global', -6.5917612112146973),
 (u'new', -6.5928365306032397),
 (u'with', -6.625169566871258),
 (u'at', -6.6552918464898045),
 (u'markets', -6.7063688963457739),
 (u'have', -6.7152760811598968),
 (u'or', -6.7535632194169573),
 (u'has', -6.7697848638567875),
 (u'please', -6.7781797531648866),


In [None]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 4 of the Naive
# Bayes model (rows follow clf_2.classes_; row 4 should be 'human_resources' --
# confirm), sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_hr = sorted(
    ((word, clf_2.feature_log_prob_[4, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [None]:
tple_lst_hr

In [None]:
# Iterate and sort values
# Pair every vocabulary word with its log probability in row 5 of the Naive
# Bayes model (rows follow clf_2.classes_; row 5 should be 'personal' -- confirm),
# sorted from highest to lowest log probability.
# The vocabulary and the log probabilities are read straight from the fitted
# count_vect_2 / clf_2 objects so the cell also runs on a fresh kernel.
tple_lst_personal = sorted(
    ((word, clf_2.feature_log_prob_[5, col])
     for word, col in count_vect_2.vocabulary_.items()),
    key=lambda pair: -pair[1])

In [None]:
tple_lst_personal