In [1]:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

In [2]:
from time import time
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Number of posts kept from the dataset (used to slice `data` after loading).
n_samples = 2000
# Vocabulary cap, passed as `max_features` to both vectorizers.
n_features = 1000
# Number of topics, passed as `n_components` to the NMF and LDA models.
n_components = 10
# How many top-weighted words `print_top_words` lists per topic.
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-feature weights.
    feature_names : sequence of str
        Term associated with each column index of ``components_``.
    n_top_words : int
        Number of words to list per topic, highest weight first.

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + " ".join(words))
    print()


# Fetch the 20 newsgroups posts with headers, footers and quoted replies
# stripped. Further term filtering (English stop words, min_df=2, max_df=0.95)
# is applied by the vectorizers, not here.

print("Loading dataset...")
t0 = time()
data, something = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,
)
# Only the first n_samples posts are used for topic extraction.
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Loading dataset...
done in 73.999s.


In [6]:
# `something` is the second value returned by fetch_20newsgroups(return_X_y=True);
# presumably the target labels for the whole dataset -- TODO confirm.
display(len(something))
# First post kept in the working sample.
display(data_samples[0])

11314

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [7]:
# Both vectorizers share the same vocabulary-pruning settings.
vectorizer_options = dict(max_df=0.95, min_df=2,
                          max_features=n_features,
                          stop_words='english')

# tf-idf weighted features feed the NMF models.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Raw term counts (tf) feed the LDA model.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(**vectorizer_options)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf-idf features for NMF...
done in 0.466s.
Extracting tf features for LDA...
done in 0.417s.



In [19]:
# Shape of the tf-idf matrix produced by the vectorizer.
display(tfidf.shape)
# Mapping built by the fitted vectorizer from each term to an integer
# (presumably its column index in `tfidf` -- TODO confirm).
tfidf_vectorizer.vocabulary_

(2000, 1000)

{'sure': 881,
 'story': 867,
 'did': 286,
 'statement': 862,
 'media': 587,
 'pro': 717,
 'israeli': 488,
 'world': 988,
 'having': 432,
 'letter': 527,
 'try': 923,
 'think': 904,
 'reason': 748,
 'report': 764,
 'clearly': 206,
 'soldiers': 841,
 'received': 750,
 'government': 412,
 'makes': 567,
 'away': 130,
 'look': 548,
 'jews': 493,
 'got': 410,
 'power': 708,
 'expect': 345,
 'people': 676,
 'read': 744,
 'faq': 355,
 'actually': 71,
 'accept': 66,
 'hard': 429,
 'atheism': 121,
 'need': 624,
 'little': 540,
 'faith': 353,
 'runs': 784,
 'sorry': 843,
 'oh': 646,
 'just': 497,
 'end': 330,
 'maybe': 582,
 'start': 858,
 'new': 629,
 'newsgroup': 631,
 'alt': 86,
 'won': 979,
 'big': 142,
 'don': 306,
 'forget': 381,
 'points': 697,
 'like': 533,
 'know': 506,
 'ask': 117,
 'question': 735,
 'sort': 844,
 'countries': 247,
 'want': 956,
 'continue': 238,
 'israel': 487,
 'stop': 866,
 'asking': 119,
 'questions': 736,
 'work': 984,
 'bad': 132,
 'begin': 136,
 'attack': 122,
 '

In [22]:
# Shape of the raw term-count matrix that is fed to LDA.
display(tf.shape)

(2000, 1000)

In [None]:
# Fit NMF with the default (Frobenius norm) loss on the tf-idf matrix.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NOTE(review): recent scikit-learn releases replaced NMF's `alpha` with
# `alpha_W`/`alpha_H` -- confirm against the installed version.
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# View the most common words for each topic

In [23]:
print("\nTopics in NMF model (Frobenius norm):")
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# when it exists and fall back on older releases; list() keeps the result a
# plain list in both cases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.417s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp mail cc pub article information hope program mac email home contact blood
Topic #6: file problem files format win sound ftp pub read save sit

In [25]:
# Peek at the first ten extracted feature names, then count them all.
display(tfidf_feature_names[:10])
display(len(tfidf_feature_names))

['00', '000', '10', '100', '11', '12', '128', '13', '130', '14']

1000

In [28]:
# Last 15 terms of the count vectorizer's vocabulary.
# `get_feature_names` was removed in scikit-learn 1.2, so use its replacement
# when available; list() keeps the displayed value a plain list.
(
    list(tf_vectorizer.get_feature_names_out())
    if hasattr(tf_vectorizer, "get_feature_names_out")
    else tf_vectorizer.get_feature_names()
)[-15:]

['worked',
 'working',
 'works',
 'world',
 'worse',
 'worth',
 'wouldn',
 'write',
 'written',
 'wrong',
 'xfree86',
 'year',
 'years',
 'yes',
 'young']

# Using Kullback-Leibler divergence

In [29]:
# Fit NMF again on the tf-idf matrix, this time with the generalized
# Kullback-Leibler loss and the multiplicative-update solver.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NOTE(review): recent scikit-learn releases replaced NMF's `alpha` with
# `alpha_W`/`alpha_H` -- confirm against the installed version.
# This rebinds `nmf`, so the previously fitted model is no longer reachable.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# when it exists and fall back on older releases; list() keeps the result a
# plain list in both cases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...
done in 1.250s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: people don just like think did say time make know really right said things way ve course didn question probably
Topic #1: windows help thanks using hi looking info video dos pc does anybody ftp appreciated mail know advance available use card
Topic #2: god does jesus true book christian bible christians religion faith believe life church christ says know read exist lord people
Topic #3: thanks know bike interested mail like new car edu heard just price list email hear want cars thing sounds reply
Topic #4: 10 00 sale time power 12 new 15 year 30 offer condition 14 16 model 11 monitor 100 old 25
Topic #5: space government number public data states earth security water research nasa general 1993 phone information science technology provide blood internet
Topic #6: edu file com 

# LDA model - takes significantly longer to fit

In [30]:
# Fit LDA on the raw term-count matrix using the online learning method.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# when it exists and fall back on older releases; list() keeps the result a
# plain list in both cases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 5.403s.

Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil i

There seem to be large overlaps in the words associated with each topic.

We have 2000 articles and a vocabulary of 1000 words.
We model them using 10 topics.

In [36]:
# Sanity-check the dimensions of the fitted factors and the input matrices.
display(nmf.components_.shape)  # components of the most recently fitted NMF model
display(len(data_samples))  # number of posts in the working sample
display(tf.shape)  # raw-count matrix fed to LDA
display(tfidf.shape)  # tf-idf matrix fed to NMF

(10, 1000)

2000

(2000, 1000)

(2000, 1000)

In [40]:
# Rescale every row of lda.components_ by its own total so that each topic's
# word weights sum to 1.
lda_normalized = lda.components_ / lda.components_.sum(axis=1, keepdims=True)
lda_normalized.shape

(10, 1000)

In [46]:
# Ten largest normalized word weights of the first topic, in descending order.
np.sort(lda_normalized[0])[-10:][::-1]

array([0.05201296, 0.02330147, 0.01875768, 0.01785303, 0.01595098,
       0.0142104 , 0.01345274, 0.01195269, 0.01168612, 0.01058328])

In [50]:
# Topic weights the fitted LDA model assigns to the first document.
res = lda.transform(tf[0])
display(res.shape[1])
display(res)

10

array([[0.00344893, 0.6285982 , 0.00344908, 0.00344877, 0.00344865,
        0.34381098, 0.00344842, 0.00344869, 0.00344944, 0.00344884]])

# Conclusion
NMF and LDA are unsupervised methods, based on the frequency or the count of words across documents.
LDA can output the distribution of topics for each document.