# Additional Scores Computation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import itertools
import os
import time
import sys
import tqdm

from typing import List

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
sys.path.insert(0, '../..')  # develop topicnet
sys.path.insert(0, '..')     # topnum

In [None]:
from topicnet.cooking_machine.models import TopicModel
from topicnet.cooking_machine.dataset import Dataset
from topicnet.cooking_machine.models import scores as tn_scores
from topicnet.cooking_machine.models.base_score import BaseScore as BaseTopicNetScore

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.search_methods.optimize_scores_method import OptimizeScoresMethod
from topnum.utils import (
    read_corpus_config, split_into_train_test, 
    build_every_score, monotonity_and_std_analysis, 
    trim_config, plot_everything_informative
)
from topnum.model_constructor import KnownModel, PARAMS_EXPLORED
from topnum.scores import (
    HoldoutPerplexityScore,
    MeanLiftScore,
    UniformThetaDivergenceScore,
)
from topnum.scores.base_score import BaseScore
from topnum.utils import estimate_num_iterations_for_convergence

## Building All Custom Scores (Auxiliary Step). Initializing Datasets

In [None]:
configs_dir = os.path.join('..', 'topnum', 'configs')

In [None]:
! ls $configs_dir

20NG.yml   PN.yml	ruwikigood.yml	WikiRef220.yml
Brown.yml  Reuters.yml	SO.yml


In [None]:
config = read_corpus_config(configs_dir + "/PN.yml")

In [None]:
config

OrderedDict([('name', 'PostNauka'),
             ('batches_prefix', 'PN'),
             ('dataset_path',
              '/data_mil/datasets/postnauka/PostNauka_natural_order.csv'),
             ('word', '@word'),
             ('min_num_topics', 5),
             ('max_num_topics', 50),
             ('num_topics_interval', 3),
             ('num_fit_iterations', 40),
             ('num_restarts', 3)])

In [None]:
# Override the dataset location read from the YAML config: the config displayed
# above points to '/data_mil/...', while this run reads the corpus from '/data/...'.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
config['dataset_path'] = '/data/datasets/postnauka/PostNauka_natural_order.csv'

In [None]:
config["batches_prefix"]

'PN'

In [None]:
# Build the TopicNet Dataset from the corpus file given in the config.
# The internals folder name is derived from the config's batches prefix
# ('PN' for this corpus, giving '.../demos/PN_internals').
# NOTE(review): the internals folder is a hard-coded absolute path inside a
# user's home directory — adjust when running on another machine.
dataset = Dataset(
    config['dataset_path'], 
    internals_folder_path=f'/home/alekseev/OptimalNumberOfTopics/demos/{config["batches_prefix"]}_internals'
)

In [None]:
dataset._data_path

'/data/datasets/postnauka/PostNauka_natural_order.csv'

In [None]:
# Auxiliary step: the same dataset is passed as both the first and the second
# argument, and the returned value is deliberately discarded (bound to `_`).
# NOTE(review): presumably the call is made for its side effects (preparing what
# the custom scores need) — confirm against topnum.utils.build_every_score.
_ = build_every_score(dataset, dataset, config)

Num documents for coherence: 44, 23410 words


In [None]:
# Threshold passed as `min_df_rate` to every dictionary filter call in this cell.
# NOTE(review): presumably tokens with a document-frequency rate below this value
# are dropped — confirm against the BigARTM Dictionary.filter documentation.
MIN_DF_RATE = 0.01

d = dataset.get_dictionary()

# The return value is ignored here, so this relies on filter() modifying `d` in place.
d.filter(min_df_rate=MIN_DF_RATE)

# Overwrites a private attribute of the dataset with the filtered dictionary.
dataset._cached_dict = d

train_dataset, test_dataset = split_into_train_test(dataset, config)

# Same filtering for both parts, but here the *return value* of filter() is stored.
# NOTE(review): this agrees with the in-place usage above only if filter() both
# mutates the dictionary and returns it — TODO confirm.
train_dataset._cached_dict = train_dataset.get_dictionary().filter(min_df_rate=MIN_DF_RATE)
test_dataset._cached_dict = test_dataset.get_dictionary().filter(min_df_rate=MIN_DF_RATE)

# Text collection built from the training part only, using the modality
# named under the 'word' key of the config ('@word' for this corpus).
text_collection = VowpalWabbitTextCollection.from_dataset(
    train_dataset, main_modality=config['word']
)

In [None]:
text_collection._to_dataset().get_dictionary()

artm.Dictionary(name=fb4abb1f-b336-4b30-970a-956aa723ee09, num_entries=5214)

Folder with the experiment results (one subfolder per dataset):

In [None]:
! ls /data/_tmp_alekseev/OptNumExperiments/AllDatasets

20NG_20NG_NEW	 PN_PN_NEW	      SO_SO_NEW
Brown_Brown_NEW  Reuters_Reuters_NEW  WRef_NEW


In [None]:
# Root folder holding one subfolder of experiment results per dataset
# (see the `ls` output above).
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
ALL_DATASETS_FOLDER_PATH = '/data/_tmp_alekseev/OptNumExperiments/AllDatasets'
# Subfolder of the dataset processed in this notebook.
DATASET_FOLDER_NAME = 'PN_PN_NEW'

# Full path to the results of the chosen dataset; the scoring loop iterates over
# its subfolders.
DATASET_FOLDER_PATH = os.path.join(
    ALL_DATASETS_FOLDER_PATH,
    DATASET_FOLDER_NAME,
)

In [None]:
! ls $DATASET_FOLDER_PATH

PN_ARTM_0_0   PN_ARTM_3_0  PN_ARTM_8_0		 PN_LDA_0_0	PN_sparse_1_0
PN_ARTM_0_1   PN_ARTM_3_1  PN_ARTM_8_1		 PN_LDA_0_1	PN_sparse_1_1
PN_ARTM_0_2   PN_ARTM_3_2  PN_ARTM_8_2		 PN_LDA_0_2	PN_sparse_1_2
PN_ARTM_1_0   PN_ARTM_4_0  PN_ARTM_9_0		 PN_LDA_1_0	PN_sparse_2_0
PN_ARTM_10_0  PN_ARTM_4_1  PN_ARTM_9_1		 PN_LDA_1_1	PN_sparse_2_1
PN_ARTM_10_1  PN_ARTM_4_2  PN_ARTM_9_2		 PN_LDA_1_2	PN_sparse_2_2
PN_ARTM_10_2  PN_ARTM_5_0  PN_decorrelation_0_0  PN_LDA_2_0	PN_sparse_3_0
PN_ARTM_1_1   PN_ARTM_5_1  PN_decorrelation_0_1  PN_LDA_2_1	PN_sparse_3_1
PN_ARTM_11_0  PN_ARTM_5_2  PN_decorrelation_0_2  PN_LDA_2_2	PN_sparse_3_2
PN_ARTM_11_1  PN_ARTM_6_0  PN_decorrelation_1_0  PN_PLSA_0_0	PN_TARTM_0_0
PN_ARTM_11_2  PN_ARTM_6_1  PN_decorrelation_1_1  PN_PLSA_0_1	PN_TARTM_0_1
PN_ARTM_1_2   PN_ARTM_6_2  PN_decorrelation_1_2  PN_PLSA_0_2	PN_TARTM_0_2
PN_ARTM_2_0   PN_ARTM_7_0  PN_decorrelation_2_0  PN_sparse_0_0
PN_ARTM_2_1   PN_ARTM_7_1  PN_decorrelation_2_1  PN_sparse_0_1
PN_ARTM_2_2   PN_ART

In [None]:
! ls $DATASET_FOLDER_PATH/PN_LDA_0_0

2563d7b9-bb6e-4607-8cb4-3ff4d2ee74f5  a42ff850-1d89-4d5f-a39d-d1be33286523
29fdd77e-078b-4b9c-bcd9-9c335c9e0479  b8ba10bd-d642-4628-be84-00e713c886aa
324ac57c-9be2-43c9-a403-1e444db066b0  b9e29ff5-d85f-48f2-93de-dd34355a1460
42d74265-6669-4ddc-a771-e0095c4f6191  c389c2a7-56e8-44e5-86d2-f2da86de4cc6
56ea63e6-82ac-4bce-a92d-71835367a980  ca8d1cf5-5b5f-49cb-92fc-61c03a51bb35
88491c14-7780-45cd-987a-7360cb07cb5a  ddee9ad3-3c10-49f5-9cd4-cfe49d113dc3
8ebd8257-bfa3-4edd-be89-7a00052df860  e58c2991-b630-4aa7-8943-2585ac26621b
9ec8edee-9ca9-48ce-a379-2f41783d5149  fc36eff9-49d5-4b63-9596-d30883332fca


In [None]:
def initialize_additional_scores() -> List[BaseScore]:
    """Create fresh instances of the additional scores to compute.

    Every score is bound to the notebook-level ``test_dataset``; the two
    validation-based scores are restricted to the modality stored under the
    ``'word'`` key of the notebook-level ``config``.

    Returns:
        A new list with, in this order, the holdout perplexity score
        (``'new_holdout_perp'``), the mean lift score (``'lift'``) and the
        uniform theta divergence score (``'uni_theta_divergence'``).
    """
    holdout_perplexity = HoldoutPerplexityScore(
        name='new_holdout_perp',
        test_dataset=test_dataset,
    )
    mean_lift = MeanLiftScore(
        name='lift',
        validation_dataset=test_dataset,
        modalities=[config['word']],
    )
    theta_divergence = UniformThetaDivergenceScore(
        name='uni_theta_divergence',
        validation_dataset=test_dataset,
        modalities=[config['word']],
    )

    return [holdout_perplexity, mean_lift, theta_divergence]

In [None]:
def delete_score_files(model_folder_path: str, scores: List[BaseScore]) -> None:
    """Remove previously saved files of the given scores from a model folder.

    A file belongs to a score when its name starts with ``<score.name>.``
    (score files are saved as ``<score name>.<class name>.p``).

    Args:
        model_folder_path: Folder of a saved model whose score files are removed.
        scores: Scores whose saved files should be deleted; only ``name`` is used.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be removed.
    """
    # Built once instead of once per directory entry;
    # str.startswith accepts a tuple of prefixes (an empty tuple matches nothing).
    score_file_prefixes = tuple(score.name + '.' for score in scores)

    for entry_name in os.listdir(model_folder_path):
        if not entry_name.startswith(score_file_prefixes):
            continue

        entry_path = os.path.join(model_folder_path, entry_name)

        # Model folders also contain subfolders; os.remove would raise on a
        # directory, so only regular files are deleted.
        if os.path.isfile(entry_path):
            os.remove(entry_path)

In [None]:
def find_score(target_score: BaseScore, model: TopicModel) -> BaseTopicNetScore:
    """Return the model's custom score registered under the target score's name.

    Args:
        target_score: Score whose ``name`` is looked up.
        model: Model whose ``custom_scores`` mapping is searched.

    Returns:
        The score object stored in ``model.custom_scores`` under
        ``target_score.name``.

    Raises:
        AssertionError: If the number of matching entries is not exactly one.
    """
    matching_scores = [
        candidate
        for candidate_name, candidate in model.custom_scores.items()
        if candidate_name == target_score.name
    ]

    assert len(matching_scores) == 1

    return matching_scores[0]

In [None]:
def save_score(score: BaseTopicNetScore, score_name: str, model_folder_path: str) -> None:
    """Save a score into the model folder and check that it can be loaded back.

    The file is named ``<score_name>.<score class name>.p``.

    Args:
        score: Score object to save.
        score_name: Name used as the first component of the file name.
        model_folder_path: Folder the score file is written to.

    Raises:
        AssertionError: If the reloaded score does not hold exactly one value.
    """
    score_class_name = score.__class__.__name__
    file_name = '.'.join([score_name, score_class_name, 'p'])
    target_path = os.path.join(model_folder_path, file_name)

    score.save(target_path)

    # TODO: dirty — the class to load with is looked up by name in the scores module
    score_class = getattr(tn_scores, score_class_name)
    reloaded_score = score_class.load(target_path)

    assert len(reloaded_score.value) == 1  # OptimalNumberOfTopics-specific check

In [None]:
# How many folders to process per run. `None` means "all of them"; the value 1
# keeps the quick single-model run that was previously hard-coded as `[:1]`.
MAX_RESTART_FOLDERS = 1
MAX_MODELS_PER_RESTART = 1

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make a limited run pick the same folders every time.
restart_folder_names = sorted(os.listdir(DATASET_FOLDER_PATH))[:MAX_RESTART_FOLDERS]

# No explicit `total`: tqdm infers it from the length of the list that is
# actually iterated, so the progress bar can reach 100%. (Previously the total
# was the size of the full listing while only a slice of it was iterated.)
for restart_folder_name in tqdm.tqdm(restart_folder_names, file=sys.stdout):
    restart_folder_path = os.path.join(DATASET_FOLDER_PATH, restart_folder_name)
    model_folder_names = sorted(os.listdir(restart_folder_path))[:MAX_MODELS_PER_RESTART]

    for model_folder_name in model_folder_names:
        model_folder_path = os.path.join(restart_folder_path, model_folder_name)

        # Fresh score objects are created for every model.
        scores_to_compute = initialize_additional_scores()

        # Files saved for these scores earlier are removed before the model is loaded.
        # NOTE(review): presumably so that TopicModel.load does not pick up stale
        # versions of the scores — confirm.
        delete_score_files(
            model_folder_path=model_folder_path,
            scores=scores_to_compute,
        )

        model = TopicModel.load(model_folder_path)

        for score in scores_to_compute:
            # NOTE(review): presumably registers the score in model.custom_scores
            # under score.name — find_score relies on finding it there.
            score._attach(model)

            score_object = find_score(score, model)

            # Compute the value on the loaded model and store it in the score
            # object by hand, so that the saved score holds exactly one value.
            score_value = score_object.call(model)
            score_object.update(score_value)  # TODO: careful! this is a workaround (a hack)

            save_score(
                score=score_object,
                score_name=score.name,
                model_folder_path=model_folder_path,
            )

Checking that everything is OK (for one model): which scores are saved

In [None]:
! ls $DATASET_FOLDER_PATH/PN_decorrelation_0_0/232c2a85-4893-43b8-9cea-9ead792eff8d

AIC_sparsity_False._LikelihoodBasedScore.p
AIC_sparsity_True._LikelihoodBasedScore.p
arun._SpectralDivergenceScore.p
BIC_sparsity_False._LikelihoodBasedScore.p
BIC_sparsity_True._LikelihoodBasedScore.p
calhar._CalinskiHarabaszScore.p
diversity_cosine_False._DiversityScore.p
diversity_cosine_True._DiversityScore.p
diversity_euclidean_False._DiversityScore.p
diversity_euclidean_True._DiversityScore.p
diversity_hellinger_False._DiversityScore.p
diversity_hellinger_True._DiversityScore.p
diversity_jensenshannon_False._DiversityScore.p
diversity_jensenshannon_True._DiversityScore.p
intra._IntratextCoherenceScore.p
lift._MeanLiftScore.p
MDL_sparsity_False._LikelihoodBasedScore.p
MDL_sparsity_True._LikelihoodBasedScore.p
model
new_holdout_perp._HoldoutPerplexityScore.p
params.json
phi.csv
renyi_0.5._RenyiShannonEntropyScore.p
renyi_1._RenyiShannonEntropyScore.p
renyi_2._RenyiShannonEntropyScore.p
silh._SilhouetteScore.p
toptok1._TopTokensCoherenceScore.p
uni_theta_d

Custom scores of the last model we worked with:

In [None]:
list(model.custom_scores.keys())

['BIC_sparsity_False',
 'BIC_sparsity_True',
 'AIC_sparsity_True',
 'calhar',
 'intra',
 'diversity_jensenshannon_True',
 'diversity_cosine_True',
 'MDL_sparsity_False',
 'silh',
 'diversity_cosine_False',
 'renyi_1',
 'arun',
 'toptok1',
 'diversity_euclidean_True',
 'diversity_hellinger_True',
 'renyi_0.5',
 'renyi_2',
 'diversity_hellinger_False',
 'MDL_sparsity_True',
 'AIC_sparsity_False',
 'diversity_euclidean_False',
 'diversity_jensenshannon_False',
 'new_holdout_perp',
 'lift',
 'uni_theta_divergence']