In [1]:
import artm
import os
import glob
import sys

In [2]:
# Make the project-local hierarchy helpers importable.
# The membership check keeps sys.path free of duplicate entries when this cell
# is re-run in the same kernel (the bare append added one copy per run).
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
HIERARCHY_CODE_DIR = "D:/PTM/code/BigAltHier"
if HIERARCHY_CODE_DIR not in sys.path:
    sys.path.append(HIERARCHY_CODE_DIR)

In [86]:
# Project-local helper module; presumably found through the directory appended
# to sys.path earlier in the notebook.
import hierarchy_utils
# Re-import the module so that edits made to it on disk are picked up without
# restarting the kernel (`reload` is the Python 2 builtin).
# NOTE(review): the recorded output of this cell names a module called
# 'hierarchy_functions', not 'hierarchy_utils' -- the saved output looks stale;
# confirm which module is actually intended.
reload(hierarchy_utils)

<module 'hierarchy_functions' from 'D:/PTM/code/BigAltHier\hierarchy_functions.py'>

In [57]:
# Folder holding the UCI bag-of-words files and folder holding BigARTM batches.
data_path_UCI = 'ngramm/pymorphy/UCI_trimmed/'
data_path_batches = 'ngramm/pymorphy/batches/'

# Batches already present in the target folder are reused as-is; only when none
# are found is the 'postnauka' UCI collection converted (200 documents per batch).
existing_batches = glob.glob(data_path_batches + "/*.batch")
if existing_batches:
    batch_vectorizer = artm.BatchVectorizer(
        data_path=data_path_batches,
        data_format='batches',
    )
else:
    batch_vectorizer = artm.BatchVectorizer(
        collection_name='postnauka',
        data_format='bow_uci',
        data_path=data_path_UCI,
        batch_size=200,
        target_folder=data_path_batches,
    )

In [6]:
def get_topic_names(topicNum, n_bckgrnd=0):
    """Build the ordered list of topic names for a model with ``topicNum`` topics.

    Parameters
    ----------
    topicNum : int
        Total number of topics in the model.
    n_bckgrnd : int, optional
        How many of those topics are background topics.  Defaults to 0, the
        value that used to be hard-coded, so existing one-argument calls
        return exactly what they returned before.

    Returns
    -------
    list of unicode
        Background names (``background_000``, ...) followed by subject names
        (``subj_000``, ...).  Indices are zero-padded to three digits.  A
        non-positive count simply contributes no names, because ``range`` is
        empty for it.
    """
    n_subj = topicNum - n_bckgrnd
    topics_bckgrnd = [u'background_' + str(i).zfill(3) for i in range(n_bckgrnd)]
    topics_subj = [u'subj_' + str(i).zfill(3) for i in range(n_subj)]
    return topics_bckgrnd + topics_subj

In [7]:
# Weight of each modality (class id); this dict is passed as `class_ids` when
# the models are constructed.  All weights are written as floats -- 'category'
# used to be the lone int literal.
class_ids = {
    'ngramm': 1.0,
    'projects': 1.0,
    'category': 1.0,
    'author': 5.0,
    'post_tag': 5.0,
}

In [10]:
# Regularizers handed to the models built later in the notebook.
# None of them is given `topic_names`; the restriction to a `topics_subj`
# subset that the original cell carried as commented-out arguments stays disabled.
regularizers_list = [
    # Smooth/sparse Phi regularizers, one per auxiliary modality.
    artm.SmoothSparsePhiRegularizer(name='SPPhiCatReg', class_ids=['category'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiAuthorReg', class_ids=['author'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiTagReg', class_ids=['post_tag'], tau=0.05),
    # Decorrelation of topics over the n-gram modality.
    artm.DecorrelatorPhiRegularizer(name='DecorrPhiReg', class_ids=['ngramm'], tau=50000),
    # Smooth/sparse regularizers for the n-gram Phi and for Theta.
    artm.SmoothSparsePhiRegularizer(
        name='SPPhiNgrammRegSubj',
        class_ids=['ngramm'],
        tau=0.01,
    ),
    artm.SmoothSparseThetaRegularizer(
        name='SmoothThetaRegSubj',
        tau=0.01,
    ),
]

In [13]:
# Quality measures tracked for every model.
# As in the original cell, no score is restricted through `topic_names`.
scores_list = [
    # Sparsity of Phi on the n-gram modality and sparsity of Theta.
    artm.SparsityPhiScore(name='SparsityScoreNgrammSubj', class_id='ngramm'),
    artm.SparsityThetaScore(name='SparsityScoreThetaSubj'),
    # Sparsity of Phi on the remaining modalities.
    artm.SparsityPhiScore(name='SparsityScoreCat', class_id='category'),
    artm.SparsityPhiScore(name='SparsityScoreAuthor', class_id='author'),
    artm.SparsityPhiScore(name='SparsityScoreTag', class_id='post_tag'),
    # Ten top tokens per topic, for n-grams and for tags.
    artm.TopTokensScore(name='TopTokensScoreNgramm', class_id='ngramm', num_tokens=10),
    artm.TopTokensScore(name='TopTokensTag', class_id='post_tag', num_tokens=10),
    # Topic kernel statistics and perplexity on the n-gram modality.
    artm.TopicKernelScore(
        name='TopicKernelNgramm',
        class_id='ngramm',
        probability_mass_threshold=0.25,
    ),
    artm.PerplexityScore(name='PerplexityScore', class_ids=['ngramm']),
]

In [58]:
# Text dump of the collection dictionary; it doubles as an on-disk cache.
dict_path = 'ngramm/pymorphy/UCI_trimmed/dictionary.txt'

dictionary = artm.Dictionary('dictionary')
if os.path.isfile(dict_path):
    # A dump from an earlier run exists: load it instead of re-gathering.
    dictionary.load_text(dict_path)
else:
    # First run: gather the dictionary from the batches, then save it for next time.
    dictionary.gather(batch_vectorizer.data_path)
    dictionary.save_text(dict_path)

In [59]:
# Top level of the hierarchy: a flat ARTM model with 11 topics.
# NOTE(review): no `seed` is passed here, while the level1 model further down
# is built with seed=124 -- confirm whether level0 should be seeded as well.
topicNum0 = 11
level0_topic_names = get_topic_names(topicNum0)
level0 = artm.ARTM(
    num_topics=topicNum0,
    topic_names=level0_topic_names,
    class_ids=class_ids,
    regularizers=regularizers_list,
    scores=scores_list,
)
level0.initialize(dictionary=dictionary)

In [60]:
# Train the top-level model with 30 passes over the whole collection.
level0.fit_offline(
    batch_vectorizer=batch_vectorizer,
    num_collection_passes=30,
)

In [61]:
tokens = level0.score_tracker["TopTokensTag"].last_tokens
for topic_name in level0.topic_names:
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    print

subj_000:  медицина мозг биология нейробиология нейрон психология онкология нейрофизиология биомедицина старение
subj_001:  биология эволюция психология антропология человек когнитивная_психология мышление палеонтология этология антропогенез
subj_002:  экономика Россия политика общество государство США политология СССР социология история
subj_003:  история культура история_России Средневековье христианство Россия религия Европа Русь Франция
subj_004:  язык лингвистика филология культура русский_язык литература фольклор литературоведение мифология фольклористика
subj_005:  астрономия астрофизика звезды космос Вселенная галактика химия Земля черные_дыры Солнце
subj_006:  образование наука история_науки университет культура Россия философия математика школа кино
subj_007:  физика физика_элементарных_частиц квантовая_физика элементарная_частица кварк бозон_Хиггса атом Большой_адронный_коллайдер Стандартная_модель космология
subj_008:  социология философия культура общество город культуроло

In [87]:
# Second level of the hierarchy: 50 topics, built by the project-local
# ARTM_Level class.  level0 is passed as the first positional argument --
# presumably the parent level; confirm against hierarchy_utils.
topicNum1 = 50
level1 = hierarchy_utils.ARTM_Level(
    level0,
    phi_batch_weight=10.0 ** 3,
    num_topics=topicNum1,
    topic_names=get_topic_names(topicNum1),
    class_ids=class_ids,
    regularizers=regularizers_list,
    scores=scores_list,
    num_document_passes=1,
    cache_theta=True,
    seed=124,
)

In [88]:
# Initialize the second-level model from the same dictionary object used for level0.
level1.initialize(dictionary=dictionary)
# Train it on the same batches, with 30 passes over the collection.
level1.fit_offline(batch_vectorizer, num_collection_passes=30)

In [89]:
tokens = level1.score_tracker["TopTokensTag"].last_tokens
for topic_name in level1.topic_names:
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    print

subj_000:  Вселенная физика Земля кварк астрофизика гравитация нейтрино темная_материя космос антиматерия
subj_001:  Россия США общество принятие_решений клиодинамика психология_масс кооперация поведенческая_экономика идентичность гендер
subj_002:  история Средневековье христианство история_России Русь ислам католицизм Скандинавия Украина Петр_I
subj_003:  культура искусство фольклор фольклористика искусствоведение театр театроведение авангард народная_культура мода
subj_004:  образование наука университет школа академическая_среда ЕГЭ история_науки Гумбольдт_Вильгельм Российская_академия_наук социология_образования
subj_005:  философия массовая_культура культурология история_философии кинематограф кино культура аналитическая_философия Фуко_Мишель русская_философия
subj_006:  химия Солнечная_система экзопланета Земля планета Марс климат метеорит атмосфера марсоход
subj_007:  физика элементарная_частица бозон_Хиггса Стандартная_модель Большой_адронный_коллайдер физика_элементарных_части

In [90]:
# Fetch the psi matrix from the second-level model.  Judging by the table shown
# in the next cell, rows are level1 topics and columns are the 11 level0 topics;
# presumably each entry links a child topic to a parent -- confirm in hierarchy_utils.
psi = level1.get_psi()

In [91]:
# Display the psi table as the cell's output.
psi

Unnamed: 0,subj_000,subj_001,subj_002,subj_003,subj_004,subj_005,subj_006,subj_007,subj_008,subj_009,subj_010
subj_000,0.005378,0.008814,0.004411,0.004161,0.004542,0.113191,0.003239,0.056206,0.004229,0.004498,0.010606
subj_001,0.006478,0.007936,0.039409,0.008202,0.005215,0.00481,0.060375,0.005452,0.010159,0.027551,0.003829
subj_002,0.003547,0.005197,0.005098,0.149823,0.005767,0.003573,0.005198,0.004743,0.00543,0.00299,0.002898
subj_003,0.005949,0.006615,0.005992,0.009302,0.102447,0.005545,0.01116,0.00527,0.011118,0.008917,0.003796
subj_004,0.005728,0.004934,0.007335,0.002803,0.005423,0.003017,0.168576,0.003084,0.00566,0.024998,0.002176
subj_005,0.002666,0.002522,0.004743,0.009786,0.013155,0.002321,0.111549,0.001943,0.054946,0.002749,0.001864
subj_006,0.008604,0.007769,0.004895,0.004822,0.005679,0.136953,0.003622,0.008023,0.004732,0.0059,0.011528
subj_007,0.006866,0.005618,0.005142,0.005128,0.004413,0.009987,0.003062,0.158651,0.006046,0.006082,0.008093
subj_008,0.005311,0.007002,0.182521,0.016047,0.005972,0.003403,0.003446,0.004341,0.011553,0.002874,0.002545
subj_009,0.014612,0.184755,0.009129,0.007665,0.00625,0.005461,0.004173,0.005102,0.008848,0.004752,0.004234


In [97]:
tokens0 = level0.score_tracker["TopTokensTag"].last_tokens
tokens1 = level1.score_tracker["TopTokensTag"].last_tokens
for t, topic_name in enumerate(level0.topic_names):
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    print
    for s, topic_name1 in enumerate(level1.topic_names):
        if psi.values[s, t] > 0.05:
            print "\t", topic_name1 + ': ',
            for word in tokens[topic_name1]:    
                print word,
            print  

subj_000:  Вселенная физика Земля кварк астрофизика гравитация нейтрино темная_материя космос антиматерия
	subj_017:  мозг нейробиология память нейрон психология стресс сознание сон нейрофизиология неврология
	subj_023:  медицина биология онкология биомедицина клетка иммунитет генетика стволовые_клетки кардиология рак
	subj_025:  биология человек медицина физиология психология психика биоэтика мотивация питание биомедицина
	subj_040:  нейрофизиология нейрон мозг нейропсихология эмоции нейробиология музыка психофизиология нервная_система синапс
	subj_043:  медицина старение биофизика сознание философия_сознания биомедицина биология кардиология смерть иммунология
subj_001:  Россия США общество принятие_решений клиодинамика психология_масс кооперация поведенческая_экономика идентичность гендер
	subj_009:  антропология психология когнитивная_психология человек мышление расоведение антропогенез внимание память психология_мышления
	subj_027:  биология эволюция этология антропология зоология 