In [1]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
from time import time
from gensim.corpora import Dictionary

Считываем данные из файла. Заметим, что первое слово статьи соответсвует ее теме.

In [2]:
data_train = open('r8-train-stemmed.txt', 'r')

In [3]:
texts = []
themes = []

Извлекаем из статьи ее тему

In [4]:
for line in data_train:
    words = line.split()
    theme = words.pop(0)
    if theme not in themes:
        themes.append(theme)
    text = ' '.join(words)
    texts.append(text)

In [5]:
print "The themes are", themes
print "Total number of themes is", len(themes)

The themes are ['earn', 'acq', 'trade', 'ship', 'grain', 'crude', 'interest', 'money-fx']
Total number of themes is 8


Приводим данные к нужному виду.

In [6]:
vectorizer = CountVectorizer(stop_words='english', analyzer='word', min_df=2, max_df=0.4)
vec = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names()

In [7]:
id2word = dict([(word_id, word) for word_id, word in enumerate(vocab)])
corpus = matutils.Sparse2Corpus(vec, documents_columns=False)

Обучаем

In [8]:
start = time()
lda = LdaModel(corpus, id2word=id2word, num_topics=8, alpha='auto', update_every=1, passes=30)
finish = time()
print 'Training time: {}'.format((finish - start) / 60)

Training time: 4.68663599888


In [9]:
lda.print_topics()

[(0,
  u'0.095*ct + 0.056*record + 0.056*april + 0.052*dividend + 0.041*div + 0.036*pai + 0.032*prior + 0.031*march + 0.031*qtly + 0.026*set'),
 (1,
  u'0.107*ct + 0.107*loss + 0.105*net + 0.083*shr + 0.045*profit + 0.042*rev + 0.041*year + 0.038*qtr + 0.035*oper + 0.024*note'),
 (2,
  u'0.024*unit + 0.021*compani + 0.020*sale + 0.019*corp + 0.014*sell + 0.011*oper + 0.011*complet + 0.010*acquisit + 0.009*industri + 0.009*subsidiari'),
 (3,
  u'0.041*share + 0.026*compani + 0.022*offer + 0.018*pct + 0.014*stock + 0.013*corp + 0.012*group + 0.010*stake + 0.010*merger + 0.009*sharehold'),
 (4,
  u'0.029*trade + 0.015*japan + 0.010*offici + 0.010*japanes + 0.010*market + 0.008*foreign + 0.008*govern + 0.008*state + 0.007*countri + 0.007*dollar'),
 (5,
  u'0.059*stock + 0.042*split + 0.033*share + 0.023*compani + 0.017*common + 0.015*pct + 0.012*sharehold + 0.012*approv + 0.012*board + 0.011*dividend'),
 (6,
  u'0.045*oil + 0.022*price + 0.013*franc + 0.012*crude + 0.012*barrel + 0.012*ope

Сохраним модель.

In [10]:
lda.save('к8.lda')
#lda = gensim.models.LdaModel.load('newsgroups.lda')

1) Возьмем произвольный документ из обучающей выборки и посмотрим на качество машинной кластеризации.

In [11]:
#theme earn
exp1 = texts[54]
exp1 = exp1.split()
#theme crude
exp2 = texts[90]
exp2 = exp2.split()
#theme money-fx
exp3 = texts[93]
exp3 = exp3.split()

In [12]:
exp1_id2word = Dictionary()
exp1_id2word.merge_with(id2word)
exp2_id2word = Dictionary()
exp2_id2word.merge_with(id2word)
exp3_id2word = Dictionary()
exp3_id2word.merge_with(id2word)

<gensim.models.VocabTransform at 0xaffdd44c>

In [13]:
exp1 = exp1_id2word.doc2bow(exp1)
exp2 = exp2_id2word.doc2bow(exp2)
exp3 = exp3_id2word.doc2bow(exp3)

In [14]:
lda[exp1]

[(0, 0.87127952752497706), (2, 0.091186234223992607)]

In [15]:
lda[exp2]

[(4, 0.4728323268259027), (6, 0.19633799020593204), (7, 0.32906257540184952)]

In [16]:
lda[exp3]

[(3, 0.16403105064462001),
 (4, 0.26928216183096176),
 (6, 0.21371359151391128),
 (7, 0.34412049822864327)]

Качество неидельное, но сложно было ожидать чего-то выдающегося, когда все 8 тем настолько близки.

Теперь попробуем посмотреть качество на тестовой выборке.

In [17]:
#trade
exp1 = "japan warn retali trade disput japan warn unit state retaliatori measur unit state impos plan trade sanction april senior govern offici shinji fukukawa vice minist intern trade and industri ministri statement japan measur gener agreement tariff and trade and action unit state impos pct tariff japanes export plan week fukukawa japan readi continu trade talk unit state despit failur convinc america call threaten tariff two dai emerg talk end washington yesterdai last month presid reagan announc sanction retali for call japan failur honour juli agreement stop dump comput microchip market unit state and open home market american good fukukawa unit state had regrett not listen japan explan effort live pact and washington had not detail explan plan impos tariff reuter"

In [18]:
#earn
exp2 = "morgan crucibl prospect encourag morgan crucibl plc mgcr prospect for encourag order and sale significantli last year divis good opportun for growth exist and recent acquir busi for growth acquisit relat area earlier announc mln stg rise pre tax profit mln stg for year decemb turnov rose mln mln compani perform despit slowdown and australian economi half currenc fluctuat reduc pretax profit mln stg note morgan profit electron sector improv mln stg stg previous result nonetheless disappoint sale lower expect due delai defenc order and cancel had remedi action obtain busi and proceed deliveri major delai order morgan share firm two penc gmt fridai close reuter"

In [19]:
#money-fx
exp3 = "monei market deficit revis downward bank england revis estim todai shortfal mln stg mln take account mln stg morn assist reuter"

In [20]:
#interest
exp4 = "lawson countri cut rate nigel lawson britain chancellor exchequ countri cut interest rate aim maintain exchang rate stabil speak journalist dai group countri reaffirm goal set pari six week ago central bank continu interven and countri concern that japan stimul domest demand and welcom measur outlin japanes financ minist kiichi miyazawa yesterdai lawson worri risk simultan recess unit state japan and west germani gave march budget speech british parliament bit concern risk ask unit state increas interest rate support dollar for rel interest rate doesn rise interest rate unit state lawson concern express yesterdai meet slow progress unit state had made reduc budget deficit will worthwhil progress reduc deficit thi year import thing that continu year year lawson februari louvr accord call for effort stabil currenc current exchang rate six week that japanes yen continu rise dollar despit massiv central bank intervent ask whether thi intervent sign weak louvr accord don had intervent you call that sign weak intervent inflat lawson world not inflationari mode vigil yesterdai statement affirm that current level exchang rate had carefulli word mean and mean thing lawson financi market that japanes measur outlin louvr accord sourc weak for that agreement countri welcom miyazawa present plan for supplement budget stimul domest demand welcom goal increas public work spend lawson packag involv second stage increas expenditur second half thi year reuter"

In [21]:
exp1 = exp1.split()
exp2 = exp2.split()
exp3 = exp3.split()
exp4 = exp4.split()

In [22]:
exp1_id2word = Dictionary()
exp1_id2word.merge_with(id2word)
exp2_id2word = Dictionary()
exp2_id2word.merge_with(id2word)
exp3_id2word = Dictionary()
exp3_id2word.merge_with(id2word)
exp4_id2word = Dictionary()
exp4_id2word.merge_with(id2word)

<gensim.models.VocabTransform at 0xaffdde2c>

In [23]:
exp1 = exp1_id2word.doc2bow(exp1)
exp2 = exp2_id2word.doc2bow(exp2)
exp3 = exp3_id2word.doc2bow(exp3)
exp4 = exp4_id2word.doc2bow(exp4)

In [24]:
lda[exp1]

[(3, 0.02172657282124113), (4, 0.97383943951103702)]

In [25]:
lda[exp2]

[(2, 0.078983904807677016),
 (4, 0.033404170240665505),
 (6, 0.016800941863671648),
 (7, 0.86691158638031596)]

In [26]:
lda[exp3]

[(7, 0.96988722889239309)]

In [27]:
lda[exp4]

[(4, 0.84345780852819485), (7, 0.14697400655972523)]

Проблема все та же: все 8 тем довольно близки по смыслу + выборка довольно небольшая => Качество классификации оставляет желать лучшего.