In [1]:
from __future__ import print_function

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import string
printable = set(string.printable)

In [2]:
# load product descriptions
data = pd.read_pickle("./Data/abstracts")

data = data[["abstract"]].values.flatten().tolist()

clean_data = [filter(lambda x: x in printable, s) for s in data]

print(clean_data[0])

docs_raw = [i.decode('utf-8') for i in clean_data]


We have extensively mapped a sample of dense molecular clouds (L1512, TMC-1C,
L1262, Per 7, L1389, L1251E) in lines of HC3N, CH3OH, SO and C^{18}O. We
demonstrate that a high degree of chemical differentiation is present in all of
the observed clouds. We analyse the molecular maps for each cloud,
demonstrating a systematic chemical differentiation across the sample, which we
relate to the evolutionary state of the cloud. We relate our observations to
the cloud physical, kinematical and evolutionary properties, and also compare
them to the predictions of simple chemical models. The implications of this
work for understanding the origin of the clumpy structures and chemical
differentiation observed in dense clouds are discussed.


In [3]:
# Initilise CountVectorizer and fit to the descriptions data
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 1.0,
                                min_df = 100,
                                ngram_range = (2, 3))
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

In [4]:
# Transform the product descpritons using term-frequency-inverse-document-frequency
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

In [5]:
# Train LDA on the vectorised descriptions
lda_tfidf = LatentDirichletAllocation(n_topics=10, random_state=0, n_jobs=8,  max_iter=20, learning_method="online",batch_size=5000)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=5000, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_jobs=8, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [6]:
# plot the visualisation
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

In [7]:
# Train LDA using a higher number of topics
lda_tfidf_large = LatentDirichletAllocation(n_topics=25, random_state=0, n_jobs=8,  max_iter=20, learning_method="online", batch_size=5000)
lda_tfidf_large.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=5000, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_jobs=8, n_topics=25, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [8]:
# plot the visualisation
pyLDAvis.sklearn.prepare(lda_tfidf_large, dtm_tfidf, tfidf_vectorizer)

In [9]:
pyLDAvis.sklearn.prepare(lda_tfidf_large, dtm_tfidf, tfidf_vectorizer, mds='mmds')

In [10]:
# for TF DTM (not inverse document frequency)
lda_tf = LatentDirichletAllocation(n_topics=25, random_state=0, n_jobs=4,max_iter=20, learning_method="online",batch_size=5000)
lda_tf.fit(dtm_tf)

LatentDirichletAllocation(batch_size=5000, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_jobs=4, n_topics=25, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [11]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [12]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [13]:
lda_tfidf_large.fit(dtm_tf)

n_top_words = 4
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tfidf_large, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
large scale, early type, main sequence, scale structure
Topic #1:
power spectrum, microwave background, cosmic microwave, cosmic microwave background
Topic #2:
dark energy, lambda cdm, equation state, young stellar
Topic #3:
angular momentum, black hole, accretion rate, accretion disk
Topic #4:
low mass, emission line, emission lines, mass stars
Topic #5:
black hole, black holes, gravitational wave, supermassive black
Topic #6:
light curves, light curve, radial velocity, long term
Topic #7:
milky way, magnetic fields, power law, dwarf galaxies
Topic #8:
dark matter, standard model, cross section, cold dark
Topic #9:
monte carlo, low frequency, cosmological constant, type supernovae
Topic #10:
magnetic field, gamma ray, high energy, cosmic ray
Topic #11:
active galactic, galactic nuclei, active galactic nuclei, high redshift
Topic #12:
non thermal, radio emission, galactic center, time scale
Topic #13:
numerical simulations, luminosity function, spectral 

In [14]:
print type()

SyntaxError: invalid syntax (<ipython-input-14-e6ecb3b8531b>, line 1)