In [1]:
import artm
import os
import glob
import sys

In [2]:
# Directory that holds the project-local `hierarchy_functions` module.
# The historical location remains the default, but it can be overridden with the
# BIGALTHIER_PATH environment variable so the notebook is not tied to one machine.
hierarchy_code_dir = os.environ.get("BIGALTHIER_PATH", "D:/PTM/code/BigAltHier")
# Guard against piling up duplicate sys.path entries when this cell is re-run.
if hierarchy_code_dir not in sys.path:
    sys.path.append(hierarchy_code_dir)

In [31]:
# Project-local helper module; it is found through the sys.path entry appended earlier.
import hierarchy_functions
# Re-import so that edits to hierarchy_functions.py are picked up without restarting
# the kernel (bare `reload` is the Python 2 builtin).
reload(hierarchy_functions)

<module 'hierarchy_functions' from 'D:/PTM/code/BigAltHier\hierarchy_functions.py'>

In [57]:
# Locations of the UCI bag-of-words source files and of the converted batches.
data_path_UCI = 'ngramm/pymorphy/UCI_trimmed/'
data_path_batches = 'ngramm/pymorphy/batches/'

if glob.glob(data_path_batches + "/*.batch"):
    # At least one *.batch file already exists: reuse the stored batches.
    batch_vectorizer = artm.BatchVectorizer(data_path=data_path_batches,
                                            data_format='batches')
else:
    # No batches found: create them from the UCI collection, writing into data_path_batches.
    batch_vectorizer = artm.BatchVectorizer(collection_name='postnauka',
                                            data_format='bow_uci',
                                            data_path=data_path_UCI,
                                            batch_size=200,
                                            target_folder=data_path_batches)

In [6]:
def get_topic_names(topicNum, n_bckgrnd=0):
    """Build the ordered list of topic names for a model with ``topicNum`` topics.

    The first ``n_bckgrnd`` names are ``background_000``, ``background_001``, ...
    and the remaining ``topicNum - n_bckgrnd`` names are ``subj_000``, ``subj_001``, ...

    Parameters
    ----------
    topicNum : int
        Total number of topics (background + subject).
    n_bckgrnd : int, optional
        How many of the topics are background topics. Defaults to 0, which
        reproduces the previous hard-coded behaviour (subject topics only).

    Returns
    -------
    list
        Unicode topic names, background topics first, zero-padded to 3 digits.

    Raises
    ------
    ValueError
        If ``n_bckgrnd`` is negative or larger than ``topicNum``.
    """
    # max(..., 0) keeps the old "non-positive topicNum -> empty list" result for
    # the default n_bckgrnd=0 instead of turning it into an error.
    if not 0 <= n_bckgrnd <= max(topicNum, 0):
        raise ValueError("n_bckgrnd must be between 0 and topicNum, got %r" % (n_bckgrnd,))
    n_subj = topicNum - n_bckgrnd
    topics_bckgrnd = [u'background_' + str(i).zfill(3) for i in range(n_bckgrnd)]
    topics_subj = [u'subj_' + str(i).zfill(3) for i in range(n_subj)]
    return topics_bckgrnd + topics_subj

In [7]:
# Modality name -> weight; handed to the models below through their `class_ids` argument.
class_ids = {
    'ngramm': 1.0,
    'projects': 1.0,
    'category': 1,
    'author': 5.0,
    'post_tag': 5.0,
}

In [10]:
# Regularizers shared by the models built below; order matches the original append order.
regularizers_list = [
    # Smooth/sparse Phi regularizers, one per metadata modality.
    artm.SmoothSparsePhiRegularizer(name='SPPhiCatReg', class_ids=['category'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiAuthorReg', class_ids=['author'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiTagReg', class_ids=['post_tag'], tau=0.05),
    # Decorrelation of topics on the 'ngramm' modality.
    artm.DecorrelatorPhiRegularizer(name='DecorrPhiReg', class_ids=['ngramm'], tau=50000),
    # topic_names=topics_subj stays disabled for the next two regularizers:
    # no global `topics_subj` is defined in the visible cells.
    artm.SmoothSparsePhiRegularizer(name='SPPhiNgrammRegSubj',
                                    class_ids=['ngramm'],
                                    tau=0.01),
    artm.SmoothSparseThetaRegularizer(name='SmoothThetaRegSubj',
                                      tau=0.01),
]

In [13]:
# Quality measures tracked for the models; order matches the original append order.
scores_list = [
    # Sparsity of Phi on 'ngramm' and of Theta. topic_names=topics_subj stays disabled:
    # no global `topics_subj` is defined in the visible cells.
    artm.SparsityPhiScore(name='SparsityScoreNgrammSubj', class_id='ngramm'),
    artm.SparsityThetaScore(name='SparsityScoreThetaSubj'),
    # Phi sparsity for each metadata modality.
    artm.SparsityPhiScore(name='SparsityScoreCat', class_id='category'),
    artm.SparsityPhiScore(name='SparsityScoreAuthor', class_id='author'),
    artm.SparsityPhiScore(name='SparsityScoreTag', class_id='post_tag'),
    # Top-10 tokens per topic for the 'ngramm' and 'post_tag' modalities.
    artm.TopTokensScore(name='TopTokensScoreNgramm', class_id='ngramm', num_tokens=10),
    artm.TopTokensScore(name='TopTokensTag', class_id='post_tag', num_tokens=10),
    artm.TopicKernelScore(name='TopicKernelNgramm', class_id='ngramm',
                          probability_mass_threshold=0.25),
    artm.PerplexityScore(name='PerplexityScore', class_ids=['ngramm']),
]

In [58]:
# Text file used to cache the collection dictionary between runs.
dict_path = 'ngramm/pymorphy/UCI_trimmed/dictionary.txt'

dictionary = artm.Dictionary('dictionary')
if os.path.isfile(dict_path):
    # A saved dictionary exists: load it.
    dictionary.load_text(dict_path)
else:
    # No saved dictionary yet: gather it from the batches folder and save it for next time.
    dictionary.gather(batch_vectorizer.data_path)
    dictionary.save_text(dict_path)

In [59]:
# Top-level model: 11 topics, using the shared modalities, regularizers and scores.
topicNum0 = 11
level0_topic_names = get_topic_names(topicNum0)
model = artm.ARTM(
    num_topics=topicNum0,
    topic_names=level0_topic_names,
    class_ids=class_ids,
    regularizers=regularizers_list,
    scores=scores_list,
)
model.initialize(dictionary=dictionary)

In [60]:
# Train the top-level model with 30 offline passes over the collection.
model.fit_offline(batch_vectorizer, num_collection_passes=30)

In [61]:
# Print, for every topic of `model`, the last tracked top tokens of the "TopTokensTag"
# score (defined on the 'post_tag' modality with num_tokens=10).
tokens = model.score_tracker["TopTokensTag"].last_tokens
for topic_name in model.topic_names:
    # Python 2 print statements: the trailing comma keeps the output on the same line.
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    # Bare print terminates the line for this topic.
    print

subj_000:  медицина мозг биология нейробиология нейрон психология онкология нейрофизиология биомедицина старение
subj_001:  биология эволюция психология антропология человек когнитивная_психология мышление палеонтология этология антропогенез
subj_002:  экономика Россия политика общество государство США политология СССР социология история
subj_003:  история культура история_России Средневековье христианство Россия религия Европа Русь Франция
subj_004:  язык лингвистика филология культура русский_язык литература фольклор литературоведение мифология фольклористика
subj_005:  астрономия астрофизика звезды космос Вселенная галактика химия Земля черные_дыры Солнце
subj_006:  образование наука история_науки университет культура Россия философия математика школа кино
subj_007:  физика физика_элементарных_частиц квантовая_физика элементарная_частица кварк бозон_Хиггса атом Большой_адронный_коллайдер Стандартная_модель космология
subj_008:  социология философия культура общество город культуроло

In [62]:
# Second-level model: 50 topics, built on top of `model` through the project-local
# hierarchy_functions.ARTM_Level, with the same modalities, regularizers and scores.
topicNum1 = 50
level1 = hierarchy_functions.ARTM_Level(
    model,
    phi_batch_weight=10**3,
    num_topics=topicNum1,
    topic_names=get_topic_names(topicNum1),
    class_ids=class_ids,
    regularizers=regularizers_list,
    scores=scores_list,
    num_document_passes=1,
    cache_theta=True,
    seed=124,
)

In [63]:
# Initialize the level-1 model from the shared dictionary, then train it with
# 30 offline passes over the collection.
level1.initialize(dictionary=dictionary)
level1.fit_offline(batch_vectorizer, num_collection_passes=30)

In [48]:
# Print, for every topic of `level1`, the last tracked top tokens of the "TopTokensTag"
# score (defined on the 'post_tag' modality with num_tokens=10).
tokens = level1.score_tracker["TopTokensTag"].last_tokens
for topic_name in level1.topic_names:
    # Python 2 print statements: the trailing comma keeps the output on the same line.
    print topic_name + ': ',
    for word in tokens[topic_name]:    
        print word,
    # Bare print terminates the line for this topic.
    print

subj_000:  геология социология_права палеоботаника логика компьютерное_моделирование судопроизводство ботаника полиция судебная_система сейсмология
subj_001:  математика сон геометрия топология кибернетика алгебра нейрон статистика сомнология атомная_энергетика
subj_002:  история история_России СССР историография Первая_мировая_война Вторая_мировая_война культурная_политика Великая_Отечественная_война Древняя_Ирландия империя
subj_003:  психология литература фольклор когнитивная_психология литературоведение фольклористика внимание народная_культура сказка зрение
subj_004:  информационные_технологии информатика computer_science интеллект социальные_сети информационная_безопасность атмосфера машинное_обучение хранение_данных компьютерные_сети
subj_005:  массовая_культура культура культурология кино гендер кинематограф феминизм постмодернизм паракинематограф Жижек_Славой
subj_006:  звезды астрономия галактика астрофизика черные_дыры нейтронные_звезды космос звездное_скопление радиотелеско

In [51]:
# Switch off theta caching on level1 (it was constructed with cache_theta=True).
level1.cache_theta = False

In [52]:
# Retrieve psi from the level-1 model via the project-local ARTM_Level.get_psi().
# NOTE(review): presumably the matrix linking level-1 topics (rows) to the parent
# topics of `model` (columns) -- confirm against hierarchy_functions.
psi = level1.get_psi()

In [53]:
# Show psi as the cell output.
psi

Unnamed: 0,subj_000,subj_001,subj_002,subj_003,subj_004,subj_005,subj_006,subj_007,subj_008,subj_009,subj_010
subj_000,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_001,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_002,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_003,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_004,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_005,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_006,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_007,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_008,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
subj_009,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
