# `pyLDAvis.sklearn`

pyLDAvis now also supports LDA models from scikit-learn. Let's take a closer look at how this works. We will be using the 20 newsgroups dataset as provided by scikit-learn.

In [1]:
from __future__ import print_function

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
# Turn on pyLDAvis's notebook integration before any visualization is built.
# NOTE(review): presumably this is what lets a prepared visualization render
# inline when it is a cell's last expression — confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()

  from imp import reload


In [4]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Load 20 newsgroups dataset

First, the 20 newsgroups dataset available in sklearn is loaded. As always, the headers, footers and quotes are removed.

In [5]:
# Fetch the corpus with the headers, footers and quoted replies removed,
# then keep the raw message texts for vectorization.
newsgroups = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
)
docs_raw = newsgroups.data

# Report how many documents were loaded.
print(len(docs_raw))

11314


## Convert to document-term matrix

Next, the raw documents are converted into a document-term matrix, either as raw counts or in TF-IDF form.

In [6]:
# Vectorizer settings gathered in one mapping so they can be read at a glance.
vectorizer_options = dict(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # tokens are runs of 3+ ASCII letters
    max_df=0.5,
    min_df=10,
)

# Build the raw term-count document-term matrix from the raw documents.
tf_vectorizer = CountVectorizer(**vectorizer_options)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

# Show the matrix dimensions.
print(dtm_tf.shape)

(11314, 9144)


In [7]:
# Reuse the count vectorizer's parameters so both vectorizers are configured
# identically.
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)

# Build the TF-IDF document-term matrix from the same raw documents.
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)



(11314, 9144)


## Fit Latent Dirichlet Allocation models

Finally, the LDA models are fitted.

In [8]:
# Both models share the same hyperparameters: 20 components and a fixed seed.
lda_options = dict(n_components=20, random_state=0)

# One model per document-term matrix.
lda_tf = LatentDirichletAllocation(**lda_options)        # raw term counts
lda_tfidf = LatentDirichletAllocation(**lda_options)     # TF-IDF weights

lda_tf.fit(dtm_tf)
lda_tfidf.fit(dtm_tfidf)

## Visualizing the models with pyLDAvis

In [14]:
# Prepare the visualization data for the LDA model fitted on raw term counts.
vis_tf = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

# Leave the prepared object as the cell's last expression so the notebook
# displays it. The previous version printed `json.dumps(str(...))`, which only
# emits an escaped copy of the text repr and relied on `json`, a module this
# notebook never imports (NameError on a fresh kernel).
vis_tf

daksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljda

  by='saliency', ascending=False).head(R).drop('saliency', 1)


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
6      0.080525  0.084967       1        1  10.698914
14     0.196207  0.062185       2        1  10.077951
0     -0.102869 -0.170439       3        1   9.104501
18     0.054788 -0.103025       4        1   7.036540
13     0.125707  0.045466       5        1   6.281324
8     -0.128318 -0.186213       6        1   5.411451
19    -0.012272 -0.152626       7        1   5.387479
12     0.094636  0.071910       8        1   5.377476
9     -0.124044  0.010609       9        1   4.547461
7     -0.030445 -0.011852      10        1   4.498512
5      0.142672  0.071664      11        1   4.342297
1     -0.061581 -0.055212      12        1   4.146615
15     0.069750 -0.060217      13        1   4.078443
2      0.110436 -0.003365      14        1   3.673889
11    -0.138638 -0.110874      15        1   3.391038
3     -0.176262 -0.024068      16        1   3.2400

In [23]:
# Prepare the visualization data for the LDA model fitted on the TF-IDF matrix.
vis_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

# Serialize the prepared data itself with pyLDAvis's own JSON writer. The
# previous version dumped `str(...)`, which stores one JSON string holding the
# text repr rather than the data, and relied on `json`, a module this notebook
# never imports.
# NOTE(review): the MMDS cell further down writes to this same path and will
# overwrite this file — confirm whether that is intended.
with open('data_sklearn.json', 'w') as f:
    pyLDAvis.save_json(vis_tfidf, f)

daksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljda

  by='saliency', ascending=False).head(R).drop('saliency', 1)


### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and t-SNE, can be used for plotting if the default PCoA is not satisfactory.

In [30]:
# Prepare the raw-count model again, this time laying topics out with metric
# MDS instead of the default.
vis_tf_mmds = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

# Serialize the prepared data itself with pyLDAvis's own JSON writer. The
# previous version dumped `repr(...)`, which stores one JSON string holding
# the text repr rather than the data, and relied on `json`, a module this
# notebook never imports.
# NOTE(review): this reuses the path written by the TF-IDF cell above, so the
# earlier file is overwritten — confirm whether that is intended.
with open('data_sklearn.json', 'w') as f:
    pyLDAvis.save_json(vis_tf_mmds, f)

# Last expression: let the notebook display the visualization rather than
# printing its text repr.
vis_tf_mmds

daksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljda

  by='saliency', ascending=False).head(R).drop('saliency', 1)


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
6     -0.099949 -0.213273       1        1  10.698914
14    -0.061287 -0.120387       2        1  10.077951
0     -0.204806  0.220991       3        1   9.104501
18    -0.248929 -0.049583       4        1   7.036540
13     0.130947  0.032302       5        1   6.281324
8     -0.286411  0.172089       6        1   5.411451
19    -0.357969 -0.001664       7        1   5.387479
12    -0.200551 -0.291682       8        1   5.377476
9      0.002478 -0.374588       9        1   4.547461
7      0.092395 -0.255058      10        1   4.498512
5      0.108063 -0.102476      11        1   4.342297
1      0.057300  0.198581      12        1   4.146615
15    -0.138635  0.048003      13        1   4.078443
2     -0.024170  0.006722      14        1   3.673889
11    -0.111985  0.340480      15        1   3.391038
3      0.082585  0.355001      16        1   3.2400



In [12]:
# Prepare the raw-count model using the t-SNE layout option.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: perplexity must be less than n_samples". Presumably the t-SNE
# perplexity in use is not smaller than the 20 topics being embedded — TODO
# confirm against the installed scikit-learn / pyLDAvis versions and fix, so
# the notebook survives Restart & Run All.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

daksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljdaskdjaskjdaksjdaksdjaskdjdaksljda

  by='saliency', ascending=False).head(R).drop('saliency', 1)


ValueError: perplexity must be less than n_samples