In [1]:
import artm

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid", {'axes.grid' : False})


In [3]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
batch_vectorizer = artm.BatchVectorizer(data_path='output.txt', data_format='vowpal_wabbit',
                                        target_folder='artm-model/batches', batch_size=250)

In [6]:
T = 26   # количество тем
topic_names=["sbj"+str(i) for i in range(T-1)]+["bcg"]
# Ваш код
model = artm.ARTM(num_topics=T, topic_names=topic_names, num_processors=2, class_ids={'text':1, 'author':1},
                  reuse_theta=True, cache_theta=True)

In [7]:
np.random.seed(1)
dictionary = artm.Dictionary('dict')
dictionary.gather(batch_vectorizer.data_path)
model.initialize(dictionary=dictionary)

In [8]:
model.scores.add(artm.TopTokensScore(name='top_tokens_score_mod1', class_id='text', num_tokens=15))
model.scores.add(artm.TopTokensScore(name='top_tokens_score_mod2', class_id='author', num_tokens=15))

In [9]:
model.regularizers.add(artm.SmoothSparsePhiRegularizer(tau=1e5, class_ids='text', dictionary='dict', topic_names='bcg'))

In [10]:
model.num_document_passes = 1
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=26)

In [11]:
topic_names_cleared = list(topic_names).remove('bcg')
model.regularizers.add(artm.SmoothSparsePhiRegularizer(tau=-1e5, class_ids='text', dictionary='dict',
                                                       topic_names=topic_names_cleared))

In [12]:
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

In [13]:
tokens = model.score_tracker['top_tokens_score_mod1'].last_tokens
for topic_name in model.topic_names:
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    print

sbj0:  скидка промокод акция товар май распродажа действовать купить июнь успевать оставаться участвовать последний бонус дарить
sbj1:  коляска игрушка одежда купить получать детский обувь распродажа скидка письмо рассылка подгузник смотреть питание автокресло
sbj2:  заказ доставка товар магазин оплата ваш москва наш информация заявка цена сумма номер получение адрес
sbj3:  москва код скидка малый получать письмо адрес головин час изменять товар частота суммироваться центр проходить
sbj4:  корзина магазин товар получать техника письмо покупка ребенок бытовой смотреть популярный новинка цена акция хотеть
sbj5:  корзина мебель рассылка новинка получать светильник цена являться доставка сообщение кровать удобный адрес распродажа поддержка
sbj6:  письмо товар получать каталог магазин россия вольт инструмент адрес рассылка цена кидка бесплатный сумка информация
sbj7:  пересылать весь адрес сайт служба размещать третий прямой отвечать доступ лицо сообщение личный находиться автоматически
sbj

In [14]:
authors = model.score_tracker['top_tokens_score_mod2'].last_tokens
for topic_name in model.topic_names:
    print topic_name + ': ',
    for author in authors[topic_name]:    
        print author,
    print

sbj0:  stolplit@e.stolplit.ru mvideo@sender.mvideo.ru 15311659480000002767 15313178070000002800 15240506190000001196 15242030470000001234 15244767860000001298 15247271910000001357 15247492390000001365 15248155090000001374 15239422690000001154 15247218210000001354 15244694930000001292 15311291750000002718 15311995750000002768
sbj1:  info@e.dochkisinochki.ru hello@sndr.babadu.ru news@list.komus.ru 15238924950000001149 15238928990000001150 15240377050000001189 15240563510000001199 15242128930000001240 15242897020000001260 15242916410000001261 15244868340000001303 15246708670000001348 15242044550000001236 15312389440000002783 15310326960000002706
sbj2:  noreply@pleer.ru info_zakaz@vstroykasolo.ru oldi@oldi.ru sales@regard.ru inform@list.komus.ru shop@euroset.ru robot@my-shop.ru mail@notik.ru info@wildberries.ru shop@bookvoed.ru support@mamsy.ru order@info.mediamarkt.ru no-reply@isoluxgroup.ru No-reply@ulmart.ru delivery@av.ru
sbj3:  news@laredoute.ru 15240456700000001192 152420917400000012

In [15]:
model.theta_columns_naming = "title" # включает именование столбцов Theta их названиями-ссылками, а не внутренними id 
# Ваш код
theta = model.get_theta()
print('Theta shape: %s' % str(theta.shape))
phi_a = model.get_phi(class_ids='author')
print('Phi_a shape: %s' % str(phi_a.shape))

Theta shape: (26, 49683)


ValueError: Empty data passed with indices specified.

In [16]:
theta

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
sbj0,0.0,0.0,0.4157799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.896982e-08,0.0,0.1097168,0.0342569,3.171464e-11,3.819872e-08,0.1089922,0.0,2.762699e-16,0.03744733
sbj1,0.258478,0.0,3.072495e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.793377e-05,3.129591e-11,8.292159e-13,4.419388e-09,6.752014e-16,5.739427e-15,5.440222e-13,4.616804e-08,4.594998e-09,0.926068
sbj2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0007047028,0.06367952,0.0,3.118431e-13,0.0,0.0,0.0004444825,0.3555622,0.000167146,1.453846e-11
sbj3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.37888e-12,3.652e-13,4.29711e-13,0.0,0.9656018,0.02232034,0.03888558,0.01131665,0.01761947
sbj4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.163078e-09,0.0002282637,5.01156e-16,6.664641e-12,5.840752e-14,3.961076e-13,0.00201176,2.991842e-13,0.0001221437,6.037805e-14
sbj5,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,...,0.0001907332,0.01004446,0.0,0.0,0.0,4.422189e-14,4.724057e-13,0.0,0.1100265,0.0
sbj6,1.739547e-12,0.0,0.0,0.0,2.570519e-13,0.0,0.0,0.0,0.259571,0.0,...,0.006692785,0.07557919,0.0,0.0001114346,3.79863e-14,2.021364e-06,0.001739774,2.195038e-13,0.02759598,5.213164e-10
sbj7,3.189521e-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01467585,0.001816383,0.0,6.96812e-13,0.0,0.0,0.0,0.007421751,0.01271592,1.172401e-09
sbj8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001912444,0.293231,0.0,6.156491e-11,0.0,0.0,1.029114e-07,8.838714e-10,0.06414693,0.0
sbj9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0007310075,0.1367714,0.0,3.765403e-13,1.512409e-12,2.234729e-05,7.968353e-05,0.0,0.0002692679,5.224376e-16


In [26]:
test_batch = artm.BatchVectorizer(data_path='test.txt', target_folder='test-model/batches', data_format='vowpal_wabbit', batch_size=250)
model.transform(batch_vectorizer=test_batch)

Unnamed: 0,15280932170000002084
sbj0,0.091728
sbj1,0.030733
sbj2,0.017385
sbj3,0.017467
sbj4,0.081108
sbj5,0.300645
sbj6,0.023495
sbj7,0.023186
sbj8,0.015929
sbj9,0.026914
