# NumTopics: all datasets

In [1]:
%load_ext autoreload
%autoreload 2

import time
import sys

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
sys.path.insert(0, '../..')
from topicnet.cooking_machine.models import TopicModel
from topicnet.cooking_machine.dataset import Dataset

In [3]:

sys.path.insert(0, '..')

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.search_methods.optimize_scores_method import OptimizeScoresMethod

from topnum.utils import (
    read_corpus_config, split_into_train_test, 
    build_every_score, monotonity_and_std_analysis, 
    trim_config, plot_everything_informative
)


from topnum.model_constructor import KnownModel, PARAMS_EXPLORED


In [4]:
import warnings

# Ignore every warning whose category is UserWarning for the rest of the
# kernel session; other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning)


## Experiments

In [10]:
import glob
import os

# Every *.yml file in the package's configs directory is read as a corpus config.
configs_dir = os.path.join('..', 'topnum', 'configs')
configs_mask = os.path.join(configs_dir, '*.yml')

# For each corpus print its name, the configured number of restarts and,
# on the next line, the number of lines in its dataset file.
for config_file in glob.glob(configs_mask):
    config = read_corpus_config(config_file)
    print(config['name'], config['num_restarts'])
    with open(config['dataset_path'], 'r') as dataset_file:
        numlines = sum(1 for _ in dataset_file)
    print(numlines)


StackOverflow 6
895621
RuWikiGood 4
8603
Brown 6
89830
Reuters 6
202715
WikiRef220 6
221
PostNauka 6
102428
20NewsGroups 6
616191


In [9]:
# Display whatever `config` is currently bound to; in file order that is the
# config read during the final iteration of the listing loop above.
config

OrderedDict([('name', '20NewsGroups'),
             ('batches_prefix', '20NG'),
             ('dataset_path', '/data/datasets/20_News_dataset/20NG_BOW.csv'),
             ('word', '@word'),
             ('min_num_topics', 10),
             ('max_num_topics', 30),
             ('num_topics_interval', 3),
             ('num_fit_iterations', 40),
             ('num_restarts', 6)])

In [6]:
# Shell escape: list the files in `configs_dir`.
!ls $configs_dir

20NG.yml   PN.yml	ruwikigood.yml	WikiRef220.yml
Brown.yml  Reuters.yml	SO.yml


In [7]:
# Debug setup: load a single corpus config (ruwikigood) and override its
# number of restarts with 3.
debug_config_path = configs_dir + "/ruwikigood.yml"
config = read_corpus_config(debug_config_path)

config['num_restarts'] = 3




In [8]:
import os

# Store the dataset internals in "<batches_prefix>_internals" under the current
# working directory rather than in a path hard-coded to one user's home folder.
# The other relative paths in this notebook ('..', '../topnum/configs') already
# assume the working directory is the notebook's own folder, so for the
# original author this resolves to the same location as before.
internals_folder_path = os.path.abspath(f'{config["batches_prefix"]}_internals')

dataset = Dataset(
    config['dataset_path'],
    internals_folder_path=internals_folder_path,
)

# Split according to the corpus config, then wrap the training part as a
# Vowpal Wabbit text collection keyed on the config's main modality.
train_dataset, test_dataset = split_into_train_test(dataset, config)

text_collection = VowpalWabbitTextCollection.from_dataset(
    train_dataset, main_modality=config['word']
)


In [10]:
import itertools


In [11]:
# Preview the hyperparameter combinations explored for each model family,
# printing one dict per combination.
for model_family in KnownModel:
    template = PARAMS_EXPLORED[model_family]

    # One axis per hyperparameter: a list of [name, value] pairs.
    the_grid = [
        [[param_name, value] for value in values]
        for param_name, values in template.items()
    ]
    for combination in itertools.product(*the_grid):
        # A family with no explored parameters yields one empty combination,
        # for which nothing is printed.
        if combination:
            model_params = dict(combination)
            print(model_params)

{'prior': 'symmetric'}
{'prior': 'asymmetric'}
{'prior': 'small'}
{'prior': 'heuristic'}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1}
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05}
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.1}
{'decorrelation_tau': 0.02}
{'decorrelation_tau': 0.05}
{'decorrelation_tau': 0.1}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.02}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.05}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.1}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.02}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.05}
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.1}
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.02}
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.05}
{'smooth_bcg_tau': 0.1, 'sparse_sp_

In [None]:
experiment_name_template = "num_topics_{}_{}"
experiment_directory = f"/data_mil/vbulatov/{config['batches_prefix']}_test1"

# Run one search per (model family, hyperparameter combination) and print the
# wall-clock duration of each run in hours.
for model_family in KnownModel:
    template = PARAMS_EXPLORED[model_family]

    # One axis per hyperparameter: a list of [name, value] pairs.
    the_grid = [
        [[param_name, value] for value in values]
        for param_name, values in template.items()
    ]
    for idx, combination in enumerate(itertools.product(*the_grid)):
        # An empty combination (family with no explored parameters) becomes {}
        # and is not printed.
        model_params = dict(combination)
        if model_params:
            print(model_params)
        # NOTE(review): model_params is only printed here; it is not passed to
        # OptimizeScoresMethod below, so every combination of a family appears
        # to be run with the same settings — confirm whether the constructor
        # should receive it.

        experiment_name = experiment_name_template.format(model_family.value, idx)
        optimizer = OptimizeScoresMethod(
            scores=build_every_score(train_dataset, test_dataset, config),
            model_family=model_family,
            experiment_name=experiment_name,
            experiment_directory=experiment_directory,
            one_model_num_processors=6,
            **trim_config(config, OptimizeScoresMethod)
        )

        t_start = time.time()
        optimizer.search_for_optimum(text_collection)
        elapsed_hours = (time.time() - t_start) / 60 / 60
        print(model_family, elapsed_hours)


{'prior': 'symmetric'}


 33%|███▎      | 1/3 [27:14:40<54:29:21, 98080.75s/it]

In [None]:
# Display the mapping of explored hyperparameters (indexed by model family in
# the loops above).
PARAMS_EXPLORED

In [None]:
# The experiment name template has two placeholders (model family and
# combination index). Both must be wildcarded: formatting with a single
# argument raises IndexError.
all_models_mask = os.path.join(
    experiment_directory, experiment_name_template.format("*", "*"), "*"
)

# NOTE(review): estimate_num_iterations_for_convergence is not imported in any
# visible cell; it has to be brought into scope before this cell can run.
# For each saved model: print its path, its number of topics and the estimated
# number of iterations needed for convergence.
for entry in glob.glob(all_models_mask):
    print(entry)
    tm = TopicModel.load(entry)
    print(len(tm.topic_names), estimate_num_iterations_for_convergence(tm))



In [None]:
# Run the monotonicity / std analysis over the experiments identified by the
# name template and directory defined above.
monotonity_and_std_analysis(
    experiment_name_template=experiment_name_template,
    experiment_directory=experiment_directory,
)

In [None]:

# Plot scores for the experiments above.
# NOTE(review): presumably the list selects scores whose names contain
# "diversity" — confirm against plot_everything_informative.
plot_everything_informative(experiment_directory, experiment_name_template, ["diversity"])


In [None]:
# Same plot call, this time with "_sparsity" as the selector
# (presumably matching sparsity-related scores — TODO confirm).
plot_everything_informative(experiment_directory, experiment_name_template, ["_sparsity"])


In [None]:
# Same plot call with 'renyi' as the selector
# (presumably matching Renyi-entropy scores — TODO confirm).
plot_everything_informative(experiment_directory, experiment_name_template, ['renyi'])


In [None]:
# Three separate plot calls, one per selector: 'arun', 'calhar', 'silh'.
# NOTE(review): these look like abbreviated score names (Arun, Calinski-Harabasz,
# silhouette) — confirm against the score definitions.
plot_everything_informative(experiment_directory, experiment_name_template, ['arun'])
plot_everything_informative(experiment_directory, experiment_name_template, ['calhar'])
plot_everything_informative(experiment_directory, experiment_name_template, ['silh'])


In [None]:
# Here the third argument is empty and a fourth list names every selector used
# in the cells above.
# NOTE(review): looks like the fourth argument excludes matching scores, i.e.
# this plots everything not shown so far — confirm against
# plot_everything_informative.
plot_everything_informative(experiment_directory, experiment_name_template, [], 
                            ["diversity", "_sparsity", 'renyi', 'arun', 'calhar', 'silh'])
