In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=20, max_df=.04)
vectorizer.fit(newsgroups_train.data)

X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 5877)

In [5]:
from tqdm import tqdm
def LDA (X, topics, a, b, n_iter ):
    n_kw = np.zeros((topics, X.shape[1]))
    n_dk = np.zeros((X.shape[0], topics))
    n_k = np.zeros(topics)

    docs, words = X.nonzero()
    z = np.random.choice(topics, len(docs))
    
    for word, doc, z_ in zip(words, docs, z):
        n_kw[z_,word] += 1
        n_dk[doc,z_] += 1
        n_k[z_] +=1
  
    for i in tqdm(range(n_iter)):
        for j in range(len(docs)):
            n_kw[z[j]][words[j]]-=1
            n_dk[docs[j]][z[j]]-=1
            n_k[z[j]]-=1
            
            p = (n_dk[docs[j], :] + a)*(n_kw[:, words[j]] + b[words[j]])/(n_k + b.sum())
            p /= p.sum()
            z[j] = np.random.choice(np.arange(topics), p = p)
            
            n_kw[z[j]][words[j]]+=1
            n_dk[docs[j]][z[j]]+=1
            n_k[z[j]]+=1
            
    return n_kw, n_dk, n_k, z


topics = 20
n_kw, n_dk, n_k, z = LDA( X_train,topics,  1*np.ones(topics), 1*np.ones(X_train.shape[1]), 50)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [33:36<00:00, 39.39s/it]


Top-10 words in 20 topics:

In [7]:
top_10 = np.argsort(n_kw, axis=1)[:, :-11:-1]

for t in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_10[t]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(t, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	control	crime	gun	guns	law	laws	police	rights	self	weapons
Topic 1:	1993	april	contact	major	national	press	research	states	university	washington
Topic 2:	anybody	couldn	deleted	guess	nice	oh	says	sorry	sounds	stuff
Topic 3:	card	computer	disk	dos	mac	memory	monitor	pc	speed	video
Topic 4:	country	israel	israeli	jewish	jews	killed	land	peace	today	war
Topic 5:	bible	christ	christian	christians	death	jesus	john	love	man	says
Topic 6:	game	games	hockey	league	play	players	season	team	teams	win
Topic 7:	11	12	13	14	16	17	18	23	24	25
Topic 8:	100	asking	buy	condition	offer	original	price	sale	sell	shipping
Topic 9:	cause	common	disease	effect	experience	large	results	similar	small	usually
Topic 10:	came	days	happened	home	left	saw	started	told	took	went
Topic 11:	article	change	comes	couple	doubt	hear	ok	simply	unless	wonder
Topic 12:	cheers	comes	couple	exactly	goes	guess	haven	net	sort	wouldn
Topic 13:	application	code	file	files	ftp	running	server	user	version	window
Topic 14:	

In [8]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

Можно соотнести полученные топики с топиками из датасета. Так например, 8 полученный топик явно относится к хоккею, 17 топик возможно к космосу, 3, 13 и 14 к электронике и ОС, 16 к машинам, 5 к религии