In [1]:
import numpy as np
import pandas as pd
import re
import gensim
from gensim import corpora, models, similarities

In [4]:
# Read the raw export and keep only the id and body columns, discarding
# any row where one of those two values is missing.
emails = (
    pd.read_csv('Emails.csv')
    .loc[:, ['Id', 'ExtractedBodyText']]
    .dropna()
)

In [10]:
# clean the email texts
def clean_text(text):
    """Normalise the body of one email into space-separated alphabetic words.

    Steps, in order:
      1. line breaks become spaces;
      2. web addresses and email addresses are removed while they are still
         intact (i.e. before hyphens are rewritten);
      3. hyphens become spaces, then dates (``d/m/y``) and clock times
         (``hh:mm``) are removed;
      4. every character that is neither alphabetic nor a space is dropped;
      5. single-character words are discarded.

    Parameters
    ----------
    text : str
        Raw email body.

    Returns
    -------
    str
        The cleaned text, words separated by single spaces (may be empty).
    """
    text = text.replace('\n', " ")  # remove line breaks
    # Strip addresses first: the previous pattern was a JavaScript-style
    # "/.../i" literal that never matched a real URL in Python, and replacing
    # hyphens beforehand split hyphenated URLs / addresses into pieces.
    text = re.sub(r"(?:(?:https?|ftp)://|www\.)\S+", "", text, flags=re.IGNORECASE)  # remove web addresses
    text = re.sub(r"[\w.+\-]+@[\w.\-]+", "", text)  # remove email addresses
    text = re.sub(r"-", " ", text)  # replace hyphens with space
    text = re.sub(r"\d+/\d+/\d+", "", text)  # remove dates
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # remove times
    # Keep letters and spaces only; join once instead of growing a string
    # one character at a time.
    letters_only = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    return ' '.join(word for word in letters_only.split() if len(word) > 1)

In [11]:
# Run the cleaner over every email body; the result keeps the original index.
email_body = emails['ExtractedBodyText'].apply(clean_text)

In [12]:
# Preview the first few cleaned bodies as a sanity check.
email_body.head(n=5)

1    Thursday March PM Latest How Syria is aiding Q...
2                                                  Thx
4    Friday March PM Huma Abedin Fw Latest How Syri...
5    Pis print Wednesday September PM Fw Meet The R...
7    Friday March PM Huma Abedin Fw Latest How Syri...
Name: ExtractedBodyText, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is only used here as a source of scikit-learn's English
# stop-word list; no text is vectorized with it in this section.
tfidf = TfidfVectorizer(stop_words="english")
stopwords = tfidf.get_stop_words()

In [9]:
def email_text_standardize(email):
    """Lower-case one cleaned email, split it on whitespace and drop stop words.

    Returns the remaining words as a list, in their original order.
    """
    return [word for word in email.lower().split() if word not in stopwords]


# Build a real list rather than calling map(): on Python 3 map() yields a
# one-shot iterator, and email_texts is iterated more than once downstream
# (once to build the dictionary, once to build the corpus).
email_texts = [email_text_standardize(email) for email in email_body.values]
                  

In [16]:
# Build the token <-> id mapping from all tokenised emails, then encode each
# email as a bag-of-words using that mapping.
dictionary = corpora.Dictionary(email_texts)
corpus = list(map(dictionary.doc2bow, email_texts))

In [36]:
def build_topic_model(num_topics, passes=5, random_state=None):
    """Train an LDA topic model on the module-level ``corpus`` / ``dictionary``.

    Parameters
    ----------
    num_topics : int
        Number of latent topics to fit.
    passes : int, optional
        Number of training passes over the corpus (default 5, the value that
        was previously hard-coded).
    random_state : optional
        Seed forwarded to gensim for reproducible topics. When left as None
        the argument is not passed at all, so behaviour is exactly as before.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The trained model. Its ``show_topics()`` summary is printed as a side
        effect.
    """
    lda_kwargs = dict(corpus=corpus, id2word=dictionary,
                      num_topics=num_topics, passes=passes)
    if random_state is not None:
        # Only forward the seed when requested, so the default call is
        # unchanged from the original.
        lda_kwargs['random_state'] = random_state
    email_lda = gensim.models.ldamodel.LdaModel(**lda_kwargs)
    # Parenthesised single-argument print works identically on Python 2 and 3.
    print(email_lda.show_topics())
    return email_lda

In [37]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

# Fit a 5-topic model and pass it to pyLDAvis for visual inspection.
pyLDAvis.gensim.prepare(topic_model=build_topic_model(5), corpus=corpus, dictionary=dictionary)

[(0, u'0.005*"state" + 0.005*"people" + 0.004*"new" + 0.004*"government" + 0.004*"security" + 0.004*"american" + 0.004*"states" + 0.003*"united" + 0.003*"said" + 0.003*"work"'), (1, u'0.033*"fyi" + 0.025*"pm" + 0.017*"pls" + 0.016*"ok" + 0.012*"print" + 0.012*"fw" + 0.012*"cheryl" + 0.009*"huma" + 0.009*"mills" + 0.009*"thx"'), (2, u'0.070*"pm" + 0.037*"office" + 0.028*"secretarys" + 0.021*"meeting" + 0.019*"room" + 0.014*"state" + 0.013*"department" + 0.013*"arrive" + 0.013*"route" + 0.012*"en"'), (3, u'0.009*"just" + 0.009*"know" + 0.008*"state" + 0.008*"im" + 0.007*"tomorrow" + 0.006*"today" + 0.006*"good" + 0.006*"like" + 0.006*"talk" + 0.006*"want"'), (4, u'0.007*"obama" + 0.006*"said" + 0.006*"mr" + 0.006*"party" + 0.005*"president" + 0.005*"new" + 0.004*"white" + 0.004*"house" + 0.003*"republican" + 0.003*"senate"')]


In [38]:
# Same visualisation with 10 topics.
pyLDAvis.gensim.prepare(topic_model=build_topic_model(10), corpus=corpus, dictionary=dictionary)

[(0, u'0.015*"mr" + 0.011*"said" + 0.009*"obama" + 0.007*"president" + 0.007*"new" + 0.007*"party" + 0.005*"white" + 0.005*"david" + 0.005*"clinton" + 0.005*"secretary"'), (1, u'0.034*"pls" + 0.023*"print" + 0.016*"doc" + 0.014*"state" + 0.013*"pis" + 0.010*"huma" + 0.010*"abedin" + 0.010*"date" + 0.008*"press" + 0.008*"add"'), (2, u'0.006*"republican" + 0.006*"senate" + 0.005*"obama" + 0.005*"care" + 0.005*"democrats" + 0.005*"republicans" + 0.005*"health" + 0.004*"right" + 0.004*"party" + 0.004*"vote"'), (3, u'0.083*"pm" + 0.047*"office" + 0.034*"secretarys" + 0.026*"meeting" + 0.024*"room" + 0.020*"state" + 0.017*"department" + 0.015*"arrive" + 0.015*"route" + 0.015*"en"'), (4, u'0.017*"israel" + 0.014*"israeli" + 0.008*"jewish" + 0.007*"netanyahu" + 0.007*"palestinian" + 0.006*"peace" + 0.006*"deal" + 0.005*"said" + 0.005*"minister" + 0.005*"palestinians"'), (5, u'0.037*"pm" + 0.015*"cheryl" + 0.014*"fw" + 0.013*"sunday" + 0.012*"mills" + 0.012*"sullivan" + 0.011*"monday" + 0.011*"

In [39]:
# Same visualisation with 15 topics.
pyLDAvis.gensim.prepare(topic_model=build_topic_model(15), corpus=corpus, dictionary=dictionary)

[(3, u'0.009*"obama" + 0.007*"american" + 0.007*"president" + 0.006*"new" + 0.006*"said" + 0.005*"war" + 0.005*"afghanistan" + 0.005*"military" + 0.005*"political" + 0.004*"administration"'), (0, u'0.024*"print" + 0.023*"pls" + 0.014*"pis" + 0.011*"qddr" + 0.007*"speech" + 0.007*"sbwhoeop" + 0.006*"new" + 0.006*"read" + 0.006*"thx" + 0.005*"add"'), (2, u'0.012*"state" + 0.009*"united" + 0.009*"states" + 0.008*"security" + 0.006*"people" + 0.006*"government" + 0.005*"diplomacy" + 0.005*"department" + 0.005*"secretary" + 0.005*"nuclear"'), (1, u'0.012*"know" + 0.007*"let" + 0.006*"late" + 0.005*"food" + 0.005*"good" + 0.005*"statement" + 0.005*"going" + 0.004*"iii" + 0.004*"plane" + 0.004*"thx"'), (10, u'0.019*"percent" + 0.015*"israeli" + 0.015*"israel" + 0.011*"party" + 0.010*"labour" + 0.008*"poll" + 0.008*"jewish" + 0.008*"palestinian" + 0.007*"obama" + 0.007*"voters"'), (7, u'0.093*"pm" + 0.049*"office" + 0.037*"secretarys" + 0.027*"meeting" + 0.026*"room" + 0.024*"state" + 0.018*"d

In [40]:
# Same visualisation with 20 topics.
pyLDAvis.gensim.prepare(topic_model=build_topic_model(20), corpus=corpus, dictionary=dictionary)

[(6, u'0.039*"pm" + 0.038*"pls" + 0.027*"print" + 0.026*"cheryl" + 0.024*"fw" + 0.019*"mills" + 0.018*"pis" + 0.015*"sullivan" + 0.013*"fyi" + 0.013*"thx"'), (18, u'0.050*"release" + 0.020*"kurdistan" + 0.018*"melanne" + 0.015*"verveer" + 0.013*"sent" + 0.012*"blackberry" + 0.010*"sudan" + 0.010*"astoria" + 0.010*"arturo" + 0.009*"mail"'), (12, u'0.014*"mcchrystal" + 0.010*"taliban" + 0.010*"afghanistan" + 0.008*"said" + 0.008*"bloomberg" + 0.007*"al" + 0.006*"troops" + 0.006*"mayor" + 0.006*"holbrooke" + 0.005*"military"'), (4, u'0.018*"message" + 0.016*"secretary" + 0.014*"memo" + 0.010*"statement" + 0.010*"hillary" + 0.009*"copy" + 0.009*"sheet" + 0.009*"letter" + 0.009*"lauren" + 0.008*"follow"'), (8, u'0.017*"diplomacy" + 0.010*"state" + 0.009*"development" + 0.009*"treaty" + 0.007*"department" + 0.006*"states" + 0.005*"marie" + 0.005*"policy" + 0.005*"postconflict" + 0.005*"officials"'), (17, u'0.009*"people" + 0.008*"security" + 0.007*"united" + 0.007*"states" + 0.006*"work" + 0