# TopicBank: Model Validation Experiment

Dataset: [PostNauka](https://postnauka.ru/) articles.

In [None]:
# General imports

import dill
import itertools
import json
import numpy as np
import os
import pandas as pd

from scipy.stats import gaussian_kde
from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
# Specific

from lapsolver import solve_dense

In [None]:
# Make the `topnum` package importable: the parent directory is put first on sys.path
# (presumably the repository root when the notebook is run from its own folder — confirm)

import sys

sys.path.insert(0, '..')

In [None]:
# Optimal number of topics

from topicnet.cooking_machine import Dataset

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.scores import (
    IntratextCoherenceScore,
    SparsityPhiScore,
    SparsityThetaScore,
    SimpleTopTokensCoherenceScore,
    SophisticatedTopTokensCoherenceScore,
)
from topnum.scores._base_coherence_score import (
    SpecificityEstimationMethod,
    TextType,
    WordTopicRelatednessType
)
from topnum.scores.intratext_coherence_score import ComputationMethod
from topnum.search_methods import TopicBankMethod
from topnum.search_methods.topic_bank.one_model_train_funcs import (
    default_train_func,
    regularization_train_func,
    specific_initial_phi_train_func,
    background_topics_train_func,
)


## Data

The folder below must contain the dataset file in .csv format.

In [None]:
# Folder with the dataset files (relative path, resolved against the current working directory)
DATA_FOLDER_PATH = 'data'

In [None]:
# Show what is available in the data folder (order of entries is arbitrary)
os.listdir(DATA_FOLDER_PATH)

['bigartm.miptai.vasiliyalekseev.log.INFO.20200322-195518.20211',
 '_dataset_rxg0krms',
 'bigartm.miptai.vasiliyalekseev.log.INFO.20200322-200334.22948',
 'postnauka__dataset__natural_order_batches',
 'postnauka__vw__natural_order.txt',
 'bigartm.miptai.vasiliyalekseev.log.INFO.20200322-195026.19312',
 'bigartm.miptai.vasiliyalekseev.log.INFO.20200322-195237.19407',
 'bigartm.miptai.vasiliyalekseev.log.INFO.20200322-200237.21203',
 '_dataset_lthzj9qc',
 '_dataset_ux1_6tj1',
 '_dataset_hvteto6c',
 '_dataset_1fzjutvk',
 'bigartm.miptai.vasiliyalekseev.log.INFO.20200322-195348.19573',
 '_dataset_0o0fiiqo',
 'postnauka__vocab.txt',
 '_dataset_a00ok3up',
 '_dataset_lh9rv2te',
 '_dataset_0jpn3owo',
 'cooc',
 '_dataset_m9qhqs6p',
 '_dataset_5qy6w9hv',
 'bigartm.INFO',
 'twenty_newsgroups__vw__natural_order.txt',
 'postnauka__dataset__natural_order.csv']

In [None]:
# Name of the .csv file (inside DATA_FOLDER_PATH) the dataset is built from
dataset_file_name = 'postnauka__dataset__natural_order.csv'

In [None]:
# Full path to the dataset file: <data folder>/<file name>
dataset_file_path = os.path.join(DATA_FOLDER_PATH, dataset_file_name)

In [None]:
# Build the TopicNet dataset from the .csv file
dataset = Dataset(dataset_file_path)

In [None]:
# (number of documents, number of columns); uses the public accessor instead of the private `_data`
dataset.get_dataset().shape

(3446, 2)

In [None]:
# Preview the first documents; uses the public accessor instead of the private `_data`
dataset.get_dataset().head()

Unnamed: 0_level_0,raw_text,vw_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
29998.txt,материал отрицательный показатель преломление ...,29998.txt |@text материал отрицательный показа...
7770.txt,культурный код экономика экономист александр а...,7770.txt |@text культурный код экономика эконо...
32230.txt,faq наука третий класс факт эксперимент резуль...,32230.txt |@text faq наука третий класс факт э...
27293.txt,обрушение волна поверхность жидкость математик...,27293.txt |@text обрушение волна поверхность ж...
481.txt,существовать ли суперсимметрия мир элементарны...,481.txt |@text существовать ли суперсимметрия ...


## Bank Creation

In [None]:
# The score is computed on the first document of the dataset only
# (presumably a cheap placeholder score, as the name suggests — confirm)
first_document_ids = list(dataset.get_dataset().index)[:1]

dummy_topic_score = IntratextCoherenceScore(
    name='intratext_coherence_score',
    data=dataset,
    documents=first_document_ids,
)

In [None]:
# Training function passed to TopicBankMethod below: the default one
# (other options imported above: regularization, specific initial phi, background topics)

train_funcs = default_train_func

In [None]:
# TODO: use Holdout Perplexity as Stop score

# Topic bank search configuration: data and document-frequency filtering,
# scores, bank building limits, persistence options, and the training function.
optimizer = TopicBankMethod(
    data=dataset,
    min_df_rate=0.025,
    max_df_rate=0.8,

    main_topic_score=dummy_topic_score,
    other_topic_scores=[],
    other_scores=[],

    max_num_models=20,
    one_model_num_topics=100,
    num_fit_iterations=100,
    topic_score_threshold_percentile=90,

    save_bank=True,
    save_model_topics=True,

    train_funcs=train_funcs,
)

Running the search:

In [None]:
%%time

# Long-running cell: trains the models and builds the topic bank (wall time is reported by %%time)
optimizer.search_for_optimum(dataset)

100%|██████████| 10/10 [04:48<00:00, 28.85s/it]
CPU times: user 7min 18s, sys: 44.7 s, total: 8min 3s
Wall time: 4min 48s


In [None]:
# NOTE(review): hard-coded temporary directory — presumably where the search above saved the bank
# (save_bank=True). It will differ on another run or machine and must be updated by hand — confirm.
topic_bank_path = '/tmp/tmpyz_4fwmqTopicBank_'

In [None]:
# List the files saved in the topic bank folder (shell command; requires `ls`)
! ls $topic_bank_path

model_0__phi.bin	   model_4__phi.bin	      model_8__phi.bin
model_0__topic_scores.bin  model_4__topic_scores.bin  model_8__topic_scores.bin
model_1__phi.bin	   model_5__phi.bin	      model_9__phi.bin
model_1__topic_scores.bin  model_5__topic_scores.bin  model_9__topic_scores.bin
model_2__phi.bin	   model_6__phi.bin	      topics.bin
model_2__topic_scores.bin  model_6__topic_scores.bin  topic_scores.bin
model_3__phi.bin	   model_7__phi.bin
model_3__topic_scores.bin  model_7__topic_scores.bin


In [None]:
# Load the topics stored in the bank and assemble them into a DataFrame
# with one column per bank topic ("topic_0", "topic_1", ...).
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(review): dill deserialization can execute arbitrary code — load only trusted files.
with open(os.path.join(topic_bank_path, 'topics.bin'), 'rb') as topics_file:
    bank_topics = dill.loads(topics_file.read())

bank_phi = pd.DataFrame.from_dict(
    {
        f'topic_{i}': word_probs
        for i, word_probs in enumerate(bank_topics)
    }
)

In [None]:
# Preview the first rows of the bank topics
bank_phi.head()

Unnamed: 0,Unnamed: 1,topic_0,topic_1
@text,политический,0.001096,0.003082
@text,специалист,0.00043,0.000479
@text,летний,0.000114,3.9e-05
@text,каждый,0.001458,0.002008
@text,реформа,0.000343,0.000496


In [None]:
# Load the saved Phi of the first model (model_0).
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(review): dill deserialization can execute arbitrary code — load only trusted files.
with open(os.path.join(topic_bank_path, 'model_0__phi.bin'), 'rb') as phi_file:
    model_phi = dill.loads(phi_file.read())

In [None]:
# Preview the first rows of the model's topics
model_phi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_0,topic_1,topic_2,topic_3,topic_4
modality,token,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
@text,политический,0.001096,0.001584,0.002181,0.003269,0.000774
@text,специалист,0.00043,0.000626,0.000578,0.000762,7e-06
@text,летний,0.000114,5.7e-05,0.000137,8e-05,4.9e-05
@text,каждый,0.001458,0.003052,0.00214,0.003442,0.000251
@text,реформа,0.000343,0.000336,0.000399,4.2e-05,0.000126


In [None]:
# Pairwise Jaccard distances between topics:
# rows follow the columns of model_phi, columns follow the columns of bank_phi.
# Each bank topic is converted to a dict once, not once per model topic.
bank_topic_dicts = [
    bank_phi.loc[:, bank_topic].to_dict()
    for bank_topic in bank_phi.columns
]

costs = np.array(
    [
        [
            TopicBankMethod._jaccard_distance(model_topic_dict, bank_topic_dict)
            for bank_topic_dict in bank_topic_dicts
        ]
        for model_topic_dict in (
            model_phi.loc[:, model_topic].to_dict()
            for model_topic in model_phi.columns
        )
    ],
    dtype=np.float32,
)

In [None]:
# Distance matrix: one row per model topic, one column per bank topic
costs

array([[0.        , 0.61513114],
       [0.66375315, 0.6192726 ],
       [0.64335763, 0.5763088 ],
       [0.6515412 , 0.5998365 ],
       [0.66883266, 0.60312873]], dtype=float32)

In [None]:
# Match model topics to bank topics with lapsolver's dense solver;
# rids / cids are used below as paired row / column indices into `costs`
rids, cids = solve_dense(costs)

In [None]:
# Distances below this value count as "close"
DISTANCE_THRESHOLD = 0.5

# (row, column) pairs chosen by the assignment: model topic index, bank topic index
matched_pairs = list(zip(rids, cids))

# Matched model topics that are close to at least one bank topic.
# NOTE(review): only rows selected by the assignment are inspected, so model topics
# left unmatched are never counted as good — confirm this is intended.
num_good_topics = sum(
    1
    for row, _ in matched_pairs
    if np.min(costs[row, :]) < DISTANCE_THRESHOLD
)

# Matched pairs whose own distance is below the threshold
num_found_topics = sum(
    1
    for row, col in matched_pairs
    if costs[row, col] < DISTANCE_THRESHOLD
)

In [None]:
# Share of the model's topics counted as good (denominator: number of model topics)
print(f'Precision: {num_good_topics / model_phi.shape[1]:.3f}')

Precision: 0.200


In [None]:
# Found topics divided by the number of model topics
# NOTE(review): the denominator is the model size, not the bank size — confirm this is intended
print(f'Recall: {num_found_topics / model_phi.shape[1]:.3f}')

Recall: 0.200


In [None]:
# Found topics divided by the number of topics currently in the bank
print(f'Recall (relative): {num_found_topics / bank_phi.shape[1]:.3f}')

Recall (relative): 0.500
