In [1]:
import pandas as pd
import numpy as np
import mailbox
import email.utils

In [2]:
# Directory that holds the exported .mbox files.
# The trailing slash matters: the loading loop builds each path by plain
# string concatenation (datapath + filename), not with a path-join helper.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
datapath = '/Users/mjschillawski/Desktop/Miscellaneous Data/'

In [3]:
# Mailbox exports to analyse; each name is appended to `datapath` when loading.
files = [
    'Cornell Alum.mbox',
    'Blog Emails.mbox',
    'Cornell.mbox',
    'Data.mbox',
    'Go West, Young Man, Go West.mbox',
    'Promotions-Outdoor Retailer.mbox',
    'Soccer.mbox',
    'The Thread.mbox',
    'Housing.mbox',
]

In [4]:
# Collect the Subject header of every message, producing one list per mailbox
# file (same order as `files`). The per-file count is printed as a sanity check.
subjects = []
for f in files:
    mbox = mailbox.mbox(datapath+f)
    subject = []
    skipped = 0
    try:
        for message in mbox:
            try:
                # A message without a Subject header contributes None here;
                # those entries are dealt with later during text cleaning.
                subject.append(message['Subject'])
            except Exception:
                # Best effort: an unreadable header should not abort the whole
                # load, but it is counted so the loss is visible.
                skipped += 1
    finally:
        # Release the underlying file handle even if iteration fails.
        mbox.close()
    print(len(subject))
    if skipped:
        print('skipped %d unreadable message(s) in %s' % (skipped, f))
    subjects.append(subject)

29580
309
1892
1936
126
758
1025
3645
1453


In [5]:
# Flatten the per-mailbox subject lists into one list, printing the running
# total after each additional mailbox is appended.
data = subjects[0]
for mailbox_subjects in subjects[1:]:
    data = data + mailbox_subjects
    print(len(data))

29889
31781
33717
33843
34601
35626
39271
40724


In [6]:
# Total number of subject entries across all mailboxes; should equal the last
# running total printed by the flattening loop.
len(data)

40724

In [7]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora,models
import pyLDAvis.gensim
import re

# Seed NumPy's global random generator before any model fitting.
# NOTE(review): presumably meant to make the LDA fit repeatable -- confirm that
# the model actually draws from this global state in the installed gensim.
np.random.seed(20180313)
# Turn on pyLDAvis's notebook display mode.
pyLDAvis.enable_notebook()

In [8]:
# Tokens that are e-mail reply/forward artefacts rather than content.
_EMAIL_NOISE_TOKENS = frozenset(('re', 'fwd', 'r', 'n', 't'))
# Filled on first use so the stopword corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return NLTK's English stopwords as a frozenset, loading them on first use.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text, max_chars=25):
    '''
    Turn one subject line into a list of stemmed content tokens.

    Steps:
        1. Reject anything that is not a string (None, NaN, header objects,
           lists, ...) and any string of `max_chars` characters or more.
           Note the length test applies to the WHOLE subject, not to
           individual words.
        2. Tokenize on runs of word characters, which drops punctuation.
        3. Lower-case and remove English stopwords and e-mail artefacts
           ('re', 'fwd', 'r', 'n', 't').
        4. Reduce the remaining tokens to Porter stems.

    Parameters:
        text      -- the raw subject; may be None or a non-string value.
        max_chars -- subjects with len(text) >= max_chars are discarded
                     (default 25, the original hard-coded limit).

    Returns:
        A list of stemmed tokens; empty when the input is rejected or when
        nothing survives filtering.

    Raises:
        LookupError if the NLTK stopword corpus is not installed. This used
        to be swallowed, which silently produced an empty result for every
        subject.
    '''
    # Explicit type guard replaces the old catch-all `except`; it also avoids
    # calling pd.isnull on list-like input, whose truth value is ambiguous.
    if not isinstance(text, str):
        return []

    # Overlong subjects are treated as garbage and dropped wholesale.
    if len(text) >= max_chars:
        return []

    stop_tokens = _english_stopwords()
    stemmer = PorterStemmer()
    words = RegexpTokenizer(r'\w+').tokenize(text)

    lowered = (word.lower() for word in words)
    return [stemmer.stem(word) for word in lowered
            if word not in stop_tokens and word not in _EMAIL_NOISE_TOKENS]

In [9]:
# Run the cleaning function over every collected subject, keeping input order.
clean_subjects = list(map(clean_text, data))

In [10]:
# Keep only subjects that still have at least one token after cleaning.
clean_subjects = [tokens for tokens in clean_subjects if tokens]

In [11]:
# Preview the first ten cleaned subjects.
clean_subjects[:10]

[['addit', 'packag'],
 ['addit', 'packag'],
 ['favor'],
 ['favor'],
 ['favor'],
 ['favor'],
 ['poll', 'resum'],
 ['poll', 'resum'],
 ['nice', 'tail', 'art'],
 ['nice', 'tail', 'art']]

In [12]:
# Model settings, named so they are easy to find and tune.
NUM_TOPICS = 20
NUM_PASSES = 5
MIN_TOPIC_PROBABILITY = .05

# Map every token to an integer id, then encode each subject as bag-of-words.
dictionary = corpora.Dictionary(clean_subjects)
corpus = list(map(dictionary.doc2bow, clean_subjects))

# NOTE(review): the recorded run of this cell emitted RuntimeWarnings and the
# following pyLDAvis cell failed on infs/NaNs -- check the fitted model for
# non-finite values before relying on it.
lda = models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=NUM_PASSES,
    minimum_probability=MIN_TOPIC_PROBABILITY,
)

  result = psi(alpha) - psi(np.sum(alpha))
  return umr_maximum(a, axis, None, out, keepdims)


In [13]:
# Build the pyLDAvis view from the fitted model, the bag-of-words corpus and
# the token dictionary.
# NOTE(review): the saved output of this cell ends in
# "LinAlgError: Array must not contain infs or NaNs", so the call did not
# complete in the recorded run -- the model needs to be checked for
# non-finite values first.
pyLDAvis.gensim.prepare(lda,corpus,dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


LinAlgError: Array must not contain infs or NaNs

In [None]:
# Dense document-by-topic probability matrix, one row per document and one
# column per topic id.
# Indexing the model directly (lda[bow]) omits every topic below the model's
# minimum_probability, so rows come back with different lengths and a value's
# position no longer matches its topic id. Asking for all topics and writing
# each probability into its own column keeps the matrix rectangular; topics
# that are still omitted stay at 0.
doc_x_topic = np.zeros((len(corpus), lda.num_topics))
for doc_idx, bow in enumerate(corpus):
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0):
        doc_x_topic[doc_idx, topic_id] = prob

In [None]:
from sklearn.manifold import TSNE

# Project the document-topic matrix to two dimensions with t-SNE.
tsne = TSNE(random_state=42, perplexity=30)
tsne_embedding = pd.DataFrame(tsne.fit_transform(doc_x_topic), columns=['x','y'])
# Colour key: the column index with the highest value in each document's row.
tsne_embedding['hue'] = doc_x_topic.argmax(axis=1)

In [None]:
# Bokeh pieces used by the scatter-plot cell below.
# NOTE(review): Slider and column are imported but not referenced in the
# visible cells.
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
# Direct Bokeh output to the notebook.
output_notebook()

In [None]:

# Per-point data for the scatter plot: t-SNE coordinates, a colour taken from
# the 20-colour Category20 palette by topic index, and constant alpha/size.
source = ColumnDataSource(
        data=dict(
            x = tsne_embedding.x,
            y = tsne_embedding.y,
            colors = [all_palettes['Category20'][20][i] for i in tsne_embedding.hue],
            topic = [i for i in tsne_embedding.hue],
            alpha = [0.7] * tsne_embedding.shape[0],
            size = [5] * tsne_embedding.shape[0]
        )
    )

# Hover tooltip showing the topic index; restricted to the glyph named "train".
hover_tsne = HoverTool(names=["train"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Topic:</span>
            <span style="font-size: 12px">@topic</span>
        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
# NOTE(review): the title reads 'Newsgroups' although the points come from
# e-mail subjects -- confirm the intended label.
plot_tsne = figure(plot_width=700, plot_height=700, tools=tools_tsne, title='Newsgroups')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="train")

# Resets every point to the default alpha and size.
# NOTE(review): this callback is not attached to any widget in the visible
# cells, so it never runs as written.
# The JS declares its variables (no implicit globals) and signals the update
# with source.change.emit() rather than the old trigger('change') call.
callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var alpha = data['alpha'];
    var size = data['size'];
    for (var i = 0; i < alpha.length; i++) {
        alpha[i] = 0.7;
        size[i] = 5;
    }
    source.change.emit();
""")

show(plot_tsne)