<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Topic Modeling</H1></u></center>

## Latent Dirichlet Allocation (LDA)

In [1]:
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [3]:
categories = ['rec.autos', 'sci.electronics', 'comp.graphics']
data = fetch_20newsgroups(subset='all', categories=categories,
                          shuffle=True, random_state=2017)

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data.data)

In [5]:
topics = 3
lda = LatentDirichletAllocation(n_topics = topics,
                                max_iter=100,
                               learning_method='online',
                               learning_offset=50.,
                               random_state=2017)

In [6]:
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_jobs=1, n_topics=3, perp_tol=0.1, random_state=2017,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [7]:
feature_names = np.array(vectorizer.get_feature_names())
feature_names

array([u'00', u'000', u'0000', ..., u'\xaal', u'\xb3ation', u'\xfd\xe9'],
      dtype='<U81')

In [8]:
lda.components_

array([[ 6.67772719,  8.8261096 ,  0.40601553, ...,  0.43649435,
         0.65684313,  0.65684313],
       [ 0.3341942 ,  0.33412802,  0.33416326, ...,  0.33448886,
         0.33440928,  0.33440928],
       [ 0.33419323,  0.33412709,  0.33416216, ...,  0.33448744,
         0.334408  ,  0.334408  ]])

In [9]:
for idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic #0:
edu com subject lines car organization university writes article posting
Topic #1:
eehp22 hl7204 gstrf choung qedbbs qed lampman howlin unlv goofy
Topic #2:
aantal_snijpunten vinge wessels cornelis vector2d _the atr bso clibsucces puntbinnenpolygoon


## Non-negative Matrix Factorization

In [10]:
from sklearn.decomposition import NMF

In [11]:
nmf = NMF(n_components=topics,
         random_state=2017,
         alpha=.1,
         l1_ratio=.5)

In [12]:
nmf.fit(X)

NMF(alpha=0.1, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=3, nls_max_iter=2000, random_state=2017, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [13]:
for idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic #0:
edu com car article writes lines subject just organization like
Topic #1:
image graphics bit file images uk ac jpeg 24 files
Topic #2:
uiuc cso opel illinois manta uxa urbana cka52397 edu oriolefan


## Reference:

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html