# NumTopics: all datasets

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import itertools
import os
import time
import sys

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
import warnings

# Ignore every warning whose category is UserWarning from here on
# (presumably to keep the long experiment logs below readable).
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
# Put the local checkouts at the very front of the import path:
# '..' (topnum) takes precedence over '../..' (develop topicnet).
sys.path[:0] = ['..', '../..']

In [5]:
from topicnet.cooking_machine.models import TopicModel
from topicnet.cooking_machine.dataset import Dataset

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.search_methods.optimize_scores_method import OptimizeScoresMethod
from topnum.utils import (
    read_corpus_config, split_into_train_test, 
    build_every_score, monotonity_and_std_analysis, 
    trim_config, plot_everything_informative
)
from topnum.model_constructor import KnownModel, PARAMS_EXPLORED
from topnum.utils import estimate_num_iterations_for_convergence

## Experiments

In [6]:
# Folder holding one YAML config per corpus, and the glob mask matching them.
configs_dir = os.path.join('..', 'topnum', 'configs')
configs_mask = os.path.join(configs_dir, '*.yml')

# Print the corpus name stored in every config next to the file it came from.
for config_file in glob.iglob(configs_mask):
    config = read_corpus_config(config_file)
    print(config['name'], config_file)

20NewsGroups ../topnum/configs/20NG.yml
RuWikiGood ../topnum/configs/ruwikigood.yml
StackOverflow ../topnum/configs/SO.yml
WikiRef220 ../topnum/configs/WikiRef220.yml
PostNauka ../topnum/configs/PN.yml
Reuters ../topnum/configs/Reuters.yml
Brown ../topnum/configs/Brown.yml


In [7]:
# Use a dataset that keeps the natural order of words (if possible)!
wikiref_config_path = os.path.join(configs_dir, 'WikiRef220.yml')
config = read_corpus_config(wikiref_config_path)

# Debug overrides: these values replace whatever the YAML file specified.
config.update(
    num_restarts=1,
    num_fit_iterations=5,
    num_topics_interval=5,
)

In [8]:
# Display the effective configuration, including the debug overrides.
config

OrderedDict([('name', 'WikiRef220'),
             ('dataset_path',
              '/data_mil/datasets/WikiRef220/wiki_ref220_natural_order.csv'),
             ('batches_prefix', 'WRef'),
             ('word', '@lemmatized'),
             ('min_num_topics', 2),
             ('max_num_topics', 20),
             ('num_topics_interval', 5),
             ('num_fit_iterations', 5),
             ('num_restarts', 1)])

In [9]:
# Location of the WikiRef220 CSV on this machine.
DATASET_PATH = os.path.join(
    '/',
    'data',
    'datasets',
    'WikiRef220',
    'wiki_ref220_natural_order.csv',
)

# Stop right here if the corpus file is not present at that location.
assert os.path.isfile(DATASET_PATH)

# Point the config at the local file instead of the path read from the YAML.
config['dataset_path'] = DATASET_PATH

# Folder passed to Dataset as its internals folder: './<batches_prefix>__internals'.
DATASET_INTERNALS_FOLDER_PATH = os.path.join(
    '.', '{}__internals'.format(config['batches_prefix'])
)

In [10]:
# Build the Dataset from the CSV at DATASET_PATH; DATASET_INTERNALS_FOLDER_PATH
# is handed over as its internals folder.
dataset = Dataset(
    data_path=DATASET_PATH, 
    internals_folder_path=DATASET_INTERNALS_FOLDER_PATH,
)

In [11]:
# Passed as `min_df_rate` when the train/test dictionaries are filtered.
MIN_DF_RATE = 0.001

In [12]:
# Split the corpus into train and test parts as directed by `config`.
train_dataset, test_dataset = split_into_train_test(dataset, config)

for d in [train_dataset, test_dataset]:
    # Replace the dataset's cached dictionary with a filtered one
    # (min_df_rate / min_df are presumably document-frequency thresholds).
    # NOTE(review): `_cached_dict` is a private attribute of Dataset — confirm
    # there is no public way to install a filtered dictionary.
    d._cached_dict = d.get_dictionary().filter(min_df_rate=MIN_DF_RATE, min_df=2)

    # Disabled steps, kept for reference only.
    #for modality in d.get_possible_modalities():
    #    if modality != config['word']:
    #        d._cached_dict.filter(class_id=modality, max_df=0, inplace=True)

    #dict_path = os.path.join(d._internals_folder_path, 'dict.dict')
    #os.remove(dict_path)
    #d._cached_dict.save(dict_path)

In [13]:
# Wrap the training split into a VowpalWabbitTextCollection; the modality named
# by config['word'] is passed as the main modality.
text_collection = VowpalWabbitTextCollection.from_dataset(
    train_dataset,
    main_modality=config['word'],
)

In [14]:
# Inspect the dictionary of the dataset behind `text_collection`.
# NOTE(review): this goes through the private `_to_dataset()` helper.
text_collection._to_dataset().get_dictionary()

artm.Dictionary(name=ea5924d9-df5e-4f73-b2a1-515edfb12307, num_entries=6148)

In [18]:
# Delete a previously saved experiment folder.
# NOTE(review): EXPERIMENT_DIRECTORY is only assigned in a later cell, so on a
# fresh kernel executed top to bottom the name does not exist yet when this
# command runs — confirm which path actually gets removed before re-running.
! rm -rf $EXPERIMENT_DIRECTORY/WRef_test_TARTM_0_0

In [20]:
# Delete a previously saved experiment folder.
# NOTE(review): EXPERIMENT_DIRECTORY is only assigned in a later cell, so on a
# fresh kernel executed top to bottom the name does not exist yet when this
# command runs — confirm which path actually gets removed before re-running.
! rm -rf $EXPERIMENT_DIRECTORY/WRef_test_decorrelation_0_0

In [19]:
# Experiment names look like '<batches_prefix>_test_<model family>_<grid index>'.
EXPERIMENT_NAME_TEMPLATE = f"{config['batches_prefix']}_test" + '_{}_{}'
# NOTE(review): absolute, user-specific scratch location — adjust per machine.
EXPERIMENT_DIRECTORY = os.path.join(
    '/', 'data', '_tmp_bulatov', f"{config['batches_prefix']}"
)
ONE_MODEL_NUM_PROCESSORS = 3


for model_family in KnownModel:
    # Maps every explored parameter name to the values to try for it.
    template = PARAMS_EXPLORED[model_family]

    # One axis per parameter; each axis lists [name, value] pairs.
    the_grid = [
        [[key, one_value] for one_value in values]
        for key, values in template.items()
    ]

    print(model_family)

    # The Cartesian product of the axes enumerates every parameter combination;
    # with no axes at all it yields exactly one empty combination.
    for idx, model_params in enumerate(itertools.product(*the_grid)):

        print(model_params)

        # A tuple of [name, value] pairs converts straight into a dict
        # (the empty tuple becomes an empty dict).
        model_params = dict(model_params)

        print(model_params)

        experiment_name = EXPERIMENT_NAME_TEMPLATE.format(model_family.value, idx)

        # Skip combinations whose '<experiment_name>_0' folder already exists.
        if os.path.isdir(os.path.join(EXPERIMENT_DIRECTORY, experiment_name + "_0")):
            print(f"skipping {experiment_name}...")
            continue

        built_scores = build_every_score(train_dataset, test_dataset, config)
        optimizer = OptimizeScoresMethod(
            scores=built_scores,
            model_family=model_family,
            experiment_name=experiment_name,
            experiment_directory=EXPERIMENT_DIRECTORY,
            one_model_num_processors=ONE_MODEL_NUM_PROCESSORS,
            model_params=model_params,
            **trim_config(config, OptimizeScoresMethod)
        )

        t_start = time.time()

        optimizer.search_for_optimum(text_collection)

        t_end = time.time()

        # Elapsed wall-clock time, converted from seconds to hours.
        print(model_family, (t_end - t_start) / 60 / 60)

  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.LDA
(['prior', 'symmetric'],)
{'prior': 'symmetric'}
skipping WRef_test_LDA_0...
(['prior', 'asymmetric'],)
{'prior': 'asymmetric'}
skipping WRef_test_LDA_1...
(['prior', 'heuristic'],)
{'prior': 'heuristic'}
skipping WRef_test_LDA_2...
KnownModel.PLSA
()
{}
skipping WRef_test_PLSA_0...
KnownModel.SPARSE
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.05])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05}
skipping WRef_test_sparse_0...
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.1])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1}
skipping WRef_test_sparse_1...
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.05])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05}
skipping WRef_test_sparse_2...
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.1])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.1}
skipping WRef_test_sparse_3...
KnownModel.TLESS
()
{}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:43<00:00, 103.02s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.TLESS 0.028642453087700737
KnownModel.DECORRELATION
(['decorrelation_tau', 0.02],)
{'decorrelation_tau': 0.02}
skipping WRef_test_decorrelation_0...
(['decorrelation_tau', 0.05],)
{'decorrelation_tau': 0.05}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:36<00:00, 96.39s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.DECORRELATION 0.026823001239034864
(['decorrelation_tau', 0.1],)
{'decorrelation_tau': 0.1}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:35<00:00, 95.84s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.DECORRELATION 0.0266510374016232
KnownModel.ARTM
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.02])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.02}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:38<00:00, 98.07s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.02727388898531596
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.05])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.05}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:42<00:00, 102.29s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.028442016111479864
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.1])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.1}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:41<00:00, 101.91s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.02833658708466424
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.02])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.02}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:40<00:00, 100.14s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.027844164437717863
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.05])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.05}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:42<00:00, 102.29s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.028439755903349982
(['smooth_bcg_tau', 0.05], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.1])
{'smooth_bcg_tau': 0.05, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.1}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:43<00:00, 103.08s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.028666461176342434
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.02])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.02}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:41<00:00, 101.79s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.028301187886132135
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.05])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.05}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:42<00:00, 102.03s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.028366706437534757
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.05], ['decorrelation_tau', 0.1])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.05, 'decorrelation_tau': 0.1}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:36<00:00, 96.37s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.026801313161849975
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.02])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.02}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:42<00:00, 102.64s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.02853917333814833
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.05])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.05}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:46<00:00, 106.47s/it]
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  0%|          | 0/1 [00:00<?, ?it/s]

KnownModel.ARTM 0.029601087503963047
(['smooth_bcg_tau', 0.1], ['sparse_sp_tau', -0.1], ['decorrelation_tau', 0.1])
{'smooth_bcg_tau': 0.1, 'sparse_sp_tau': -0.1, 'decorrelation_tau': 0.1}
Num documents for coherence: 43, 24274 words


100%|██████████| 1/1 [01:45<00:00, 105.58s/it]

KnownModel.ARTM 0.029353886180453828



  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [None]:
# Mask matching every entry inside every experiment folder created above.
all_models_mask = os.path.join(
    EXPERIMENT_DIRECTORY,
    EXPERIMENT_NAME_TEMPLATE.format("*", "*"),
    "*",
)

num_models_to_load = 5

# glob returns matches in a file-system dependent order; sort them first so the
# same `num_models_to_load` entries are picked on every run.
for entry in sorted(glob.glob(all_models_mask))[:num_models_to_load]:
    print(entry)

    tm = TopicModel.load(entry)
    num_iters = estimate_num_iterations_for_convergence(tm)

    print(
        f'Num topics: {len(tm.topic_names):3}. Num iters for convergence: {num_iters:3}'
    )

In [None]:
# Run the analysis helper with the same directory and name template that the
# experiments used.
# NOTE(review): what the helper reports is not visible here — see topnum.utils.
monotonity_and_std_analysis(
    experiment_name_template=EXPERIMENT_NAME_TEMPLATE,
    experiment_directory=EXPERIMENT_DIRECTORY,
)

In [None]:
# Plot the saved experiments for "diversity".
# NOTE(review): the third argument is presumably a list of score-name fragments
# to select — confirm against plot_everything_informative.
plot_everything_informative(
    EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ["diversity"]
)

In [None]:
# Plot the saved experiments for "_sparsity".
# NOTE(review): the third argument is presumably a list of score-name fragments
# to select — confirm against plot_everything_informative.
plot_everything_informative(
    EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ["_sparsity"]
)

In [None]:
# Plot the saved experiments for 'renyi'.
# NOTE(review): the third argument is presumably a list of score-name fragments
# to select — confirm against plot_everything_informative.
plot_everything_informative(EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ['renyi'])

In [None]:
# Same plotting helper, called once each for 'arun', 'calhar' and 'silh'.
# NOTE(review): the third argument is presumably a list of score-name fragments
# to select — confirm against plot_everything_informative.
plot_everything_informative(EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ['arun'])
plot_everything_informative(EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ['calhar'])
plot_everything_informative(EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE, ['silh'])

In [None]:
# Final plotting call: an empty third argument plus a fourth list naming every
# score plotted by the earlier cells.
# NOTE(review): presumably the fourth argument excludes the listed scores, so
# this shows everything not plotted above — TODO confirm against the helper.
plot_everything_informative(
    EXPERIMENT_DIRECTORY, EXPERIMENT_NAME_TEMPLATE,
    [], 
    ["diversity", "_sparsity", 'renyi', 'arun', 'calhar', 'silh']
)