# TopicBank: Bank Creation Experiment

Dataset: [PostNauka](https://postnauka.ru/) articles.

In [2]:
# General imports

import itertools
import json
import numpy as np
import os
import pandas as pd

from scipy.stats import gaussian_kde
from matplotlib import pyplot as plt

%matplotlib inline

In [3]:
import sys

In [4]:
# Making `topnum` module visible for Python

# TODO
# import sys

sys.path.insert(0, '..')
sys.path.insert(0, '../..')

In [5]:
# Optimal number of topics

from topicnet.cooking_machine import Dataset

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.scores import (
    IntratextCoherenceScore,
    SparsityPhiScore,
    SparsityThetaScore,
    SimpleTopTokensCoherenceScore,
    SophisticatedTopTokensCoherenceScore,
)
from topnum.scores._base_coherence_score import (
    SpecificityEstimationMethod,
    TextType,
    WordTopicRelatednessType
)
from topnum.scores.intratext_coherence_score import ComputationMethod
from topnum.search_methods import TopicBankMethod
from topnum.search_methods.topic_bank.one_model_train_funcs import (
    default_train_func,
    regularization_train_func,
    specific_initial_phi_train_func,
    background_topics_train_func,
)


## Data

In the folder below must reside the necessary data file in .csv format.

In [6]:
DATA_FOLDER_PATH = './data'  # TODO  '/data/datasets'

In [7]:
os.listdir(DATA_FOLDER_PATH)

['AG_News.csv',
 'PostNauka_natural_order.csv',
 'Brown.csv',
 'PScience.csv',
 'Post_Science',
 'Reuters.csv',
 '.ipynb_checkpoints',
 '20NG_natural_order.csv',
 'Watan2004.csv']

In [8]:
DATASET_FOLDER_NAME = 'Watan2004' # 'AG_News' # '20NG_natural_order' # 'Brown' # 'Reuters' # 'Post_Science'

In [9]:
os.listdir(os.path.join(DATA_FOLDER_PATH, DATASET_FOLDER_NAME))

FileNotFoundError: [Errno 2] No such file or directory: './data/Watan2004'

In [10]:
DATASET_FILE_NAME = 'Watan2004.csv' # 'AG_News.csv' # '20NG_natural_order.csv' # 'Brown.csv' # 'Reuters.csv' # 'PScience.csv'

In [11]:
DATASET_FILE_PATH = os.path.join(
    DATA_FOLDER_PATH,
    # DATASET_FOLDER_NAME,
    DATASET_FILE_NAME,
)

Checking if all OK with data, what modalities does the collection have.

In [12]:
! head -n 2 $DATASET_FILE_PATH

id,vw_text,raw_text
"doc_0_00000","doc_0_00000 |@word بكي ا ش ا اعلن صين رسمي كتمل طول قدم ملحم شعري علي مستوي عالم معروف باسم ملحم ملك قصار اثر عثور علي جزئ ناقص وهي عبار الف كلم منقوش علي حجر تماثيل كائن بمعبد جين لونغ سمك ذهبي بمحافظ دانب بمقاطع سيتشو جنوب غرب صين عتبر ملحم قصار طول ملحم شعبي ثقافي عرف عالم تقع 36 مجلد تضم قراب مليوني بيت شعر ضاهي قيم ادبي شهر ملاحم غربي حتي انه طلق علي الياذ شرقي علي غرار ملحم يوناني خالد الياذ اوديسا شاعر هوميروس كتب ملحم قبل الف عام تناول قصة حيا ملك اسطوري قصار لذي هزم قبائل غزو عدو حافظ علي هوي مقاطع تبت جنوب غرب صين طمس تشوي وقد ترجم لغت اصلي لتي كتب وهي منغولي الي عديد لغ عالمي انجليزي فرنسي الماني روسي هندي صيني ياباني ،وتدرس قسم ادب معاهد كلي جامع كثر اربع دول وقد صدر بشا قراب ثلاث الف بحث دراس عديد دوائر اكديمية عالمي كان منظم يونسكو ادرج ضمن مناسب عالمي لتي شار حتفل جميع دول عضء منظم خلال عام ماضي تخليد لتل ملحم فريد باعتبار اثر ثقافي وحيد متبقي حتي يوم رغم مرور كثر الف عام علي صدور","   بكين أ ش أ أعلنت الصين رسميا عن                   

In [13]:
DATASET_INTERNALS_FOLDER_PATH = os.path.join('.', DATASET_FOLDER_NAME + '__internals')

In [14]:
DATASET_INTERNALS_FOLDER_PATH

'./Watan2004__internals'

In [15]:
dataset = Dataset(
    DATASET_FILE_PATH,
    internals_folder_path=DATASET_INTERNALS_FOLDER_PATH
)

In [16]:
os.listdir(DATASET_INTERNALS_FOLDER_PATH)

['dict.dict',
 'new_ppmi_tf_',
 'vocab.txt',
 'cooc_values.json',
 'batches',
 'ppmi_tf_',
 'vw.txt',
 'result']

In [138]:
dataset.get_batch_vectorizer()

artm.BatchVectorizer(data_path="./Watan2004__internals/batches", num_batches=21)

In [139]:
os.listdir(DATASET_INTERNALS_FOLDER_PATH)

['batches', 'vw.txt']

In [140]:
dataset._data.shape

(20291, 3)

In [141]:
dataset._data.head()

Unnamed: 0_level_0,id,vw_text,raw_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
doc_0_00000,doc_0_00000,doc_0_00000 |@word بكي ا ش ا اعلن صين رسمي كتم...,بكين أ ش أ أعلنت الصين رسميا عن ...
doc_0_00001,doc_0_00001,doc_0_00001 |@word حاور دمشق حيد تاج ادب روائي...,حاوره في دمشق وحيد تاجا: الأدب الروائي.. ...
doc_0_00002,doc_0_00002,doc_0_00002 |@word كتب سالم رحبي رعي معالي دكت...,كتب سالم الرحبي: ترعى معالي الدكتورة ...
doc_0_00003,doc_0_00003,doc_0_00003 |@word صلال سعيد شاطر رعي سعاد شيخ...,صلالة من سعيد الشاطر: رعى سعادة ...
doc_0_00004,doc_0_00004,doc_0_00004 |@word باريس كون وقت لذي عالج ابن ...,باريس كونا: في الوقت الذي يعالج فيه ابنه ...


In [20]:
len(dataset.get_vw_document('32230.txt').loc['32230.txt', 'vw_text'].split())

710

In [21]:
dataset.get_vw_document('32230.txt').loc['32230.txt', 'vw_text']

'|@text faq наука третий класс факт эксперимент результат полностью предсказывать теория начало известно со школьный скамья задача любой естественный наука число физика открытие новый знание природа разработка принципиально новый технология число первый относиться работа ньютон максвелл эйнштейн фейнман пример второй являться изобретение транзистор лазер технология являться результат целенаправленный научный изыскание входить каждый дом перевертывать мир последний год мировой научный мода входить совершенно другой класс работа исследование известный явление открывать дорога новый неожиданный интерпретация ломать общепринятый стереотип иллюстрировать новый научный идея траектория фотон post idв конец авторитетный научнопопулярный английский издание physics world опубликовывать рейтинг самый выдающийся достижение физика первый место список стоять работа проводить канадский университет торонто руководство профессор эфраим стайнберг суть работа согласно принцип неопределенность гейзенберг 

## Coocs

The notebook [Making-Decorrelation-and-Topic-Selection-Friends.ipynb](https://github.com/machine-intelligence-laboratory/TopicNet/blob/master/topicnet/demos/Making-Decorrelation-and-Topic-Selection-Friends.ipynb) contains a bit more explanation and references concerning cooccurrences computation in ARTM library.

In [22]:
COOC_DATA_FOLDER_PATH = os.path.join(DATASET_INTERNALS_FOLDER_PATH, 'cooc')

In [23]:
if os.path.isdir(COOC_DATA_FOLDER_PATH):
    print(os.listdir(COOC_DATA_FOLDER_PATH))

In [18]:
cooc_values_file_path = os.path.join(
    DATASET_INTERNALS_FOLDER_PATH,
    'cooc_values.json',
)

In [19]:
if os.path.isfile(cooc_values_file_path):
    print(
        json.loads(open(cooc_values_file_path, 'r').read())[:20]
    )

[[['هرير', 'تمني'], 1.82768], [['تمني', 'هرير'], 1.82768], [['هرير', 'كبر'], 0.581746], [['كبر', 'هرير'], 0.581746], [['هرير', 'داود'], 2.47035], [['داود', 'هرير'], 2.47035], [['هرير', 'تلق'], 2.12148], [['تلق', 'هرير'], 2.12148], [['هرير', 'بدو'], 0.443265], [['بدو', 'هرير'], 0.443265], [['هرير', 'سعيد'], 0.706545], [['سعيد', 'هرير'], 0.706545], [['هرير', 'ملائك'], 2.00346], [['ملائك', 'هرير'], 2.00346], [['هرير', 'اني'], 1.07549], [['اني', 'هرير'], 1.07549], [['هرير', 'تفتيش'], 2.54284], [['تفتيش', 'هرير'], 2.54284], [['هرير', 'مر'], 1.10353], [['مر', 'هرير'], 1.10353]]


In [20]:
if not os.path.isfile(cooc_values_file_path):
    cooc_values = dict()
else:
    raw_cooc_values = json.loads(open(cooc_values_file_path, 'r').read())

    cooc_values = {
        tuple(d[0]): d[1] for d in raw_cooc_values
    }

In [21]:
len(list(cooc_values.items()))

3514220

In [22]:
print(list(cooc_values.items())[:10])

[(('هرير', 'تمني'), 1.82768), (('تمني', 'هرير'), 1.82768), (('هرير', 'كبر'), 0.581746), (('كبر', 'هرير'), 0.581746), (('هرير', 'داود'), 2.47035), (('داود', 'هرير'), 2.47035), (('هرير', 'تلق'), 2.12148), (('تلق', 'هرير'), 2.12148), (('هرير', 'بدو'), 0.443265), (('بدو', 'هرير'), 0.443265)]


In [23]:
COOC_VALUES = cooc_values

In [24]:
len(COOC_VALUES)

3514220

In [25]:
COOC_VALUES = {
    k: v for i, (k, v) in enumerate(COOC_VALUES.items()) if i % 2 == 0
}

In [26]:
len(COOC_VALUES)

1757110

In [24]:
del cooc_values

In [27]:
median_cooc = np.percentile(list(COOC_VALUES.values()), 60)

In [28]:
median_cooc

0.9181784

In [29]:
COOC_VALUES = {
    k: v for i, (k, v) in enumerate(COOC_VALUES.items()) if v >= median_cooc
}

In [30]:
len(COOC_VALUES)

702844

In [158]:
print(list(COOC_VALUES.items())[:10])

[(('states', 'section'), 0.960109), (('states', 'basis'), 0.329515), (('states', 'defend'), 1.82707), (('states', 'existence'), 0.0594168), (('states', 'force'), 1.10638), (('states', 'express'), 1.33526), (('states', 'west'), 0.530156), (('states', 'learn'), 0.0611878), (('states', 'additional'), 0.428594), (('states', 'proper'), 1.72512)]


In [180]:
from topicnet.cooking_machine.models import TopicModel
import artm

In [183]:
a = artm.ARTM(10)
a.initialize(dataset.get_dictionary())

In [184]:
tm = TopicModel(a)

In [185]:
tm._fit(dataset.get_batch_vectorizer(), 1)

In [186]:
tm.get_phi().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
modality,token,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
@word,كرديو,7.634554e-08,1.435572e-08,3.222526e-07,3.30583e-08,7.221951e-08,9.443286e-08,1.186938e-07,1.482633e-07,5.519292e-08,1.494534e-07
@word,سيانيد,1.079096e-08,1.201125e-07,5.929413e-09,2.212184e-07,1.473765e-07,1.852361e-08,7.600426e-08,1.38241e-07,1.181957e-07,3.563748e-07
@word,اتفاي,2.399903e-07,2.337123e-07,4.510908e-07,1.028343e-07,2.932988e-07,1.215687e-08,1.628408e-07,1.515525e-07,5.858392e-07,2.536662e-08
@word,اختيارالكلم,4.796336e-08,1.434721e-07,1.377251e-07,5.181442e-08,2.077898e-07,2.645868e-07,2.835146e-09,4.897009e-08,1.278964e-07,8.651309e-08
@word,لنساهل,3.547773e-08,4.450235e-08,1.22288e-07,5.289273e-08,1.177817e-07,3.532562e-07,2.064689e-09,1.368329e-08,2.581789e-07,1.469993e-07


In [187]:
del tm
del a

In [188]:
import tqdm

In [335]:
lengths = []

for d in tqdm.tqdm(dataset._data.index, total=len(dataset._data.index), file=sys.stdout):
    lengths.append(
        len(dataset.get_vw_document(d).loc[d, 'vw_text'].split())
    )

  0%|          | 486/127600 [00:13<56:51, 37.26it/s]  


KeyboardInterrupt: 

In [31]:
lengths = dataset._data['vw_text'].apply(lambda text: len(text.split()))

In [32]:
mean_length = sum(lengths) / dataset._data.shape[0]

In [33]:
median_length = np.median(lengths)

In [None]:
# PN

mean_length

In [101]:
median_length

461.5

In [154]:
461.5 * 50

23075.0

In [108]:
# Reuters

mean_length

96.72923618835743

In [109]:
median_length

70.0

In [155]:
300 * 70

21000

In [33]:
# Brown

mean_length

1077.038

In [34]:
median_length

1073.0

In [35]:
1073.0 * 20

21460.0

In [162]:
# 20 NG

mean_length

139.84532526796136

In [163]:
median_length

75.0

In [164]:
300 * 75

22500

In [30]:
# AG News

mean_length

29.167257053291536

In [31]:
median_length

29.0

In [34]:
800 * 29

23200

In [32]:
23000 / 29

793.1034482758621

In [32]:
# Watan

mean_length

422.5294958355921

In [33]:
median_length

292.0

In [34]:
23000 / median_length

78.76712328767124

In [38]:
p_25 = np.percentile(lengths, 25)
p_75 = np.percentile(lengths, 75)

In [197]:
p_75

524.0

In [198]:
p_25

174.0

In [199]:
median_length * 20

5840.0

In [39]:
all_documents = [
    d for i, d in enumerate(dataset._data.index)
    if (
        lengths[i] <= p_75
        and
        lengths[i] >= p_25
    )
]

In [40]:
del lengths

In [41]:
len(all_documents)

10202

In [210]:
dataset._data.index.nunique(), dataset._data.shape

(20291, (20291, 3))

In [88]:
seed = 42 # 0 # 42 # 11221963
random = np.random.RandomState(seed)

In [89]:
old_test_documents = test_documents

In [90]:
num_test_documents = 80 # 800 # 300 # 20 # 300 # 50
test_documents = random.choice(all_documents, size=num_test_documents, replace=False)

In [91]:
median_length * num_test_documents

23360.0

In [92]:
len(test_documents)

80

In [93]:
len(set(test_documents))

80

In [94]:
test_documents[:10]

array(['doc_4_00214', 'doc_5_02707', 'doc_0_02199', 'doc_2_01677',
       'doc_6_03308', 'doc_6_01353', 'doc_1_03270', 'doc_6_03218',
       'doc_1_04240', 'doc_1_03776'], dtype='<U11')

In [95]:
test_documents = list(test_documents)

In [96]:
len(set(old_test_documents).intersection(set(test_documents)))

1

In [97]:
set(old_test_documents).intersection(set(test_documents))

{'doc_2_01908'}

## Scores (for Topics and Models)

In [98]:
WINDOW = 20
NUM_TOP_WORDS = 20
MAX_NUM_OUT_WORDS = 5

VERBOSE = False

In [99]:
# Default scores in Topic Bank

main_topic_score = IntratextCoherenceScore(
    name='intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_none',
    data=dataset,
    documents=test_documents,
    text_type=TextType.VW_TEXT,
    computation_method=ComputationMethod.SEGMENT_WEIGHT,
    word_topic_relatedness=WordTopicRelatednessType.PWT,
    specificity_estimation=SpecificityEstimationMethod.NONE,
    max_num_out_of_topic_words=MAX_NUM_OUT_WORDS,
    window=WINDOW,
    verbose=VERBOSE,
)

other_topic_scores = [
    SophisticatedTopTokensCoherenceScore(
        name='top_tokens_coherence_score__tt_vw__wtrt_pwt__sem_none',
        data=dataset,
        documents=test_documents,
        text_type=TextType.VW_TEXT,
        word_topic_relatedness=WordTopicRelatednessType.PWT,
        specificity_estimation=SpecificityEstimationMethod.NONE,
        num_top_words=NUM_TOP_WORDS,
        window=WINDOW,
        verbose=VERBOSE,
    )
]

In [100]:
# Other coherence scores variations

text_type_ids = {
    TextType.VW_TEXT: 'vw',
}
computation_method_ids = {
    ComputationMethod.SEGMENT_WEIGHT: 'seg_weight',
    ComputationMethod.SEGMENT_LENGTH: 'seg_length',
    ComputationMethod.SUM_OVER_WINDOW: 'sow',
}
word_topic_relatedness_type_ids = {
    WordTopicRelatednessType.PWT: 'pwt',
    WordTopicRelatednessType.PTW: 'ptw',
}
specificity_estimation_method_ids = {
    SpecificityEstimationMethod.NONE: 'none',
    SpecificityEstimationMethod.AVERAGE: 'av',
    SpecificityEstimationMethod.MAXIMUM: 'max',
}


param_combinations_intratext = list(
    itertools.product(
        text_type_ids,
        computation_method_ids,
        word_topic_relatedness_type_ids,
        specificity_estimation_method_ids,
    )
)
param_combinations_intratext.remove(
    (
        TextType.VW_TEXT,
        ComputationMethod.SEGMENT_WEIGHT,
        WordTopicRelatednessType.PWT,
        SpecificityEstimationMethod.NONE
    )
)

param_combinations_top_tokens = list(
    itertools.product(
        text_type_ids,
        word_topic_relatedness_type_ids,
        specificity_estimation_method_ids,
    )
)
param_combinations_top_tokens.remove(
    (
        TextType.VW_TEXT,
        WordTopicRelatednessType.PWT,
        SpecificityEstimationMethod.NONE
    )
)


for param_combination in param_combinations_intratext:
    (text_type,
     computation_method,
     word_topic_relatedness,
     specificity_estimation) = param_combination

    name = (
        f'intratext_coherence_score'
        f'__tt_{text_type_ids[text_type]}'
        f'__cm_{computation_method_ids[computation_method]}'
        f'__wtrt_{word_topic_relatedness_type_ids[word_topic_relatedness]}'
        f'__sem_{specificity_estimation_method_ids[specificity_estimation]}'
    )

    other_topic_scores.append(
        IntratextCoherenceScore(
            name=name,
            data=dataset,
            documents=test_documents,
            text_type=text_type,
            computation_method=computation_method,
            word_topic_relatedness=word_topic_relatedness,
            specificity_estimation=specificity_estimation,
            max_num_out_of_topic_words=MAX_NUM_OUT_WORDS,
            window=WINDOW,
            verbose=VERBOSE,
        )
    )


for param_combination in param_combinations_top_tokens:
    (text_type,
     word_topic_relatedness,
     specificity_estimation) = param_combination

    name = (
        f'top_tokens_coherence_score'
        f'__tt_{text_type_ids[text_type]}'
        f'__wtrt_{word_topic_relatedness_type_ids[word_topic_relatedness]}'
        f'__sem_{specificity_estimation_method_ids[specificity_estimation]}'
    )

    other_topic_scores.append(
        SophisticatedTopTokensCoherenceScore(
            name=name,
            data=dataset,
            documents=test_documents,
            text_type=text_type,
            word_topic_relatedness=word_topic_relatedness,
            specificity_estimation=specificity_estimation,
            #word_cooccurrences=COOC_VALUES2,  # TODO!
            num_top_words=NUM_TOP_WORDS,
            window=WINDOW,
            verbose=VERBOSE,
        )
    )

In [101]:
len(other_topic_scores)

23

In [102]:
len(COOC_VALUES)

702844

In [103]:
# Another implementation of top-tokens-based coherence

# param_combinations_other_top_tokens = list(
#     itertools.product([True, False], ['median', 'mean'], [None, 1e-7])
# )

param_combinations_other_top_tokens = list(
    itertools.product([True], ['median'], [1e-7])
)

if len(COOC_VALUES) > 0:  # with pre-computed coocs
    for param_combination in param_combinations_other_top_tokens:
        (kernel,
         average,
         active_topic_threshold) = param_combination

        name = (
            f'top_tokens_coherence_other_implementation_score'
            f'__ker_{kernel}'
            f'__av_{average}'
            f'__att_{active_topic_threshold}'
        )

        other_topic_scores.append(
            SimpleTopTokensCoherenceScore(
                name=name,
                data=dataset,
                cooccurrence_values=COOC_VALUES,
                num_top_tokens=20,
                kernel=kernel,
                average=average,
                active_topic_threshold=active_topic_threshold,
            )
        )

In [104]:
len(other_topic_scores)

24

In [105]:
# Default scores in Topic Bank

other_scores = [
    SparsityPhiScore(
        name='sparsity_phi_score'
    ),
    SparsityThetaScore(
        name='sparsity_theta_score'
    ),
]

## Bank Creation

In [106]:
# Default train func

TRAIN_FUNCS = default_train_func

In [221]:
os.makedirs(os.path.join(DATASET_INTERNALS_FOLDER_PATH, 'result'))

In [107]:
os.path.isdir(os.path.join(DATASET_INTERNALS_FOLDER_PATH, 'result', f'bank__{seed}'))

False

In [108]:
os.path.join(DATASET_INTERNALS_FOLDER_PATH, 'result', f'bank__{seed}')

'./Watan2004__internals/result/bank__42'

In [109]:
os.makedirs(os.path.join(DATASET_INTERNALS_FOLDER_PATH, 'result', f'bank__{seed}'))

In [110]:
seed

42

In [115]:
# TODO: use Holdout Perplexity as Stop score

start_model_number = 1  # TODO

optimizer = TopicBankMethod(
    data = dataset,
    min_df_rate = 0.025,
    max_df_rate = 0.8,

    main_topic_score = main_topic_score,
    other_topic_scores = other_topic_scores,
    other_scores = other_scores,
    
    documents = test_documents,

    start_model_number = start_model_number,
    max_num_models = 20,
    one_model_num_topics = 100,
    num_fit_iterations = 100,
    topic_score_threshold_percentile = 90,

    save_bank = True,
    save_model_topics = True,
    
    save_file_path = os.path.join(
        DATASET_INTERNALS_FOLDER_PATH, 'result', f'search_result__{seed}.json'),
    bank_folder_path = os.path.join(
        DATASET_INTERNALS_FOLDER_PATH, 'result', f'bank__{seed}'),

    train_funcs = TRAIN_FUNCS,
    
    verbose = True,
)



In [116]:
optimizer._result.keys()

dict_keys(['optimum', 'optimum_std', 'bank_scores', 'bank_topic_scores', 'model_scores', 'model_topic_scores', 'num_bank_topics', 'num_model_topics'])

In [117]:
optimizer._result['num_bank_topics']

[10]

In [118]:
len(optimizer._result['num_bank_topics'])

1

In [76]:
optimizer._save_file_path

'./Watan2004__internals/result/search_result__11221963.json'

In [87]:
len(optimizer._result['num_bank_topics'])

20

In [127]:
optimizer._topic_bank._path

'./AG_News__internals/result/bank__0'

In [84]:
optimizer._topic_bank.view_topics().head()

Unnamed: 0,Unnamed: 1,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
@word,تمني,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.471352e-14,0.0,5.84598e-05
@word,جوز,0.0,0.0,0.0,0.0,0.0,0.0,0.0004923269,0.0,0.0,0.0
@word,ربط,7.607563e-07,0.0,0.0,7e-06,0.0,1.301171e-07,4.888294e-14,0.0,3.320361e-10,0.0
@word,خط,7.317736e-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.579498e-10,0.0
@word,كبر,3.088602e-11,6e-06,0.0,0.002187,7.476166e-13,3.381091e-08,2.038143e-10,0.0007529581,9.068673e-07,1.399392e-10


In [63]:
optimizer._topic_bank.view_topics().shape

(1913, 10)

In [85]:
optimizer._topic_bank.view_topics()['topic_2'].sort_values(ascending=False)[:10]

@word  و       0.550311
       لذي     0.023649
       شيخ     0.019949
       فقد     0.006017
       صدر     0.006008
       قائم    0.005285
       رحم     0.005047
       حكم     0.004551
       وفي     0.004416
       سعيد    0.004405
Name: topic_2, dtype: float64

Fulfilling the search:

In [66]:
all_scores = [main_topic_score] + other_topic_scores

In [73]:
len(all_scores)

25

In [74]:
for i, s in enumerate(all_scores):
    print(f'{i + 1:2} {s._name}')

 1 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_none
 2 top_tokens_coherence_score__tt_vw__wtrt_pwt__sem_none
 3 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_av
 4 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_max
 5 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_none
 6 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_av
 7 intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_max
 8 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_pwt__sem_none
 9 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_pwt__sem_av
10 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_pwt__sem_max
11 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_ptw__sem_none
12 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_ptw__sem_av
13 intratext_coherence_score__tt_vw__cm_seg_length__wtrt_ptw__sem_max
14 intratext_coherence_score__tt_vw__cm_sow__wtrt_pwt__sem_none
15 intratext_coherence_score__tt_vw__cm

In [71]:
len(all_scores[2]._word_cooccurrences)

465882

In [80]:
len(COOC_VALUES2)

466196

In [None]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:50<20:02, 50.11s/it][A
  8%|▊         | 2/25 [02:58<28:09, 73.47s/it][A
 12%|█▏        | 3/25 [03:43<23:52, 65.14s/it][A
 16%|█▌        | 4/25 [04:30<20:52, 59.63s/it][A
 20%|██        | 5/25 [05:18<18:40, 56.03s/it][A
 24%|██▍       | 6/25 [06:03<16:45, 52.90s/it][A
 28%|██▊       | 7/25 [06:49<15:13, 50.74s/it][A
 32%|███▏      | 8/25 [07:35<13:56, 49.19s/it][A
 36%|███▌      | 9/25 [08:20<12:49, 48.09s/it][A
 40%|████      | 10/25 [09:06<11:50, 47.36s/it][A
 44%|████▍     | 11/25 [09:51<10:54, 46.78s/it][A
 48%|████▊     | 12/25 [10:37<10:03, 46.39s/it][A
 52%|█████▏    | 13/25 [11:23<09:15, 46.25s/it][A
 56%|█████▌    | 14/25 [12:58<11:11, 61.09s/it][A
 60%|██████    | 15/25 [14:34<11:55, 71.54s/it][A
 64%|██████▍   | 16/25 [16:11<11:50, 79.00s/it][A
 68%|██████▊   | 17/25 [17:47<11:13, 84.15s/it][A
 72%|███████▏  | 18/25 [19:23<10:14, 87.77s/it][A
 76%|█████

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:39<00:00, 75.97s/it] [A
  5%|▌         | 1/19 [40:20<12:06:11, 2420.65s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:50<20:03, 50.14s/it][A
  8%|▊         | 2/25 [02:58<28:10, 73.51s/it][A
 12%|█▏        | 3/25 [03:43<23:53, 65.15s/it][A
 16%|█▌        | 4/25 [04:29<20:46, 59.36s/it][A
 20%|██        | 5/25 [05:15<18:24, 55.22s/it][A
 24%|██▍       | 6/25 [06:00<16:33, 52.31s/it][A
 28%|██▊       | 7/25 [06:46<15:05, 50.33s/it][A
 32%|███▏      | 8/25 [07:32<13:51, 48.90s/it][A
 36%|███▌      | 9/25 [08:17<12:45, 47.87s/it][A
 40%|████      | 10/25 [09:03<11:48, 47.23s/it][A
 44%|████▍     | 11/25 [09:48<10:54, 46.77s/it][A
 48%|████▊     | 12/25 [10:34<10:03, 46.42s/it][A
 52%|█████▏    | 13/25 [11:20<09:16, 46.36s/it][A
 56%|█████▌    | 14/25 [12:57<11:15, 61.38s/it][A
 60%|██████    | 15/25 [14:33<11:57, 71.80s/it][A
 64%|██████▍   | 16/25 [16:10<11:54, 79.38s/it][A
 68%|██████▊   | 17/25 [17:47<11:17, 84.66s/it

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:38<00:00, 75.95s/it] [A
 11%|█         | 2/19 [1:21:15<11:28:47, 2431.00s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:10, 45.45s/it][A
  8%|▊         | 2/25 [02:53<26:56, 70.28s/it][A
 12%|█▏        | 3/25 [03:39<23:04, 62.93s/it][A
 16%|█▌        | 4/25 [04:25<20:13, 57.79s/it][A
 20%|██        | 5/25 [05:10<18:02, 54.11s/it][A
 24%|██▍       | 6/25 [05:56<16:19, 51.57s/it][A
 28%|██▊       | 7/25 [06:42<14:56, 49.80s/it][A
 32%|███▏      | 8/25 [07:27<13:44, 48.49s/it][A
 36%|███▌      | 9/25 [08:12<12:41, 47.57s/it][A
 40%|████      | 10/25 [08:58<11:45, 47.04s/it][A
 44%|████▍     | 11/25 [09:44<10:53, 46.67s/it][A
 48%|████▊     | 12/25 [10:30<10:03, 46.39s/it][A
 52%|█████▏    | 13/25 [11:16<09:14, 46.22s/it][A
 56%|█████▌    | 14/25 [12:52<11:13, 61.25s/it][A
 60%|██████    | 15/25 [14:28<11:56, 71.63s/it][A
 64%|██████▍   | 16/25 [16:04<11:51, 79.04s/it][A
 68%|██████▊   | 17/25 [17:40<11:12, 84.11s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:47<00:00, 76.29s/it] [A
 16%|█▌        | 3/19 [2:02:19<10:50:50, 2440.67s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:47<18:49, 47.07s/it][A
  8%|▊         | 2/25 [02:55<27:21, 71.35s/it][A
 12%|█▏        | 3/25 [03:49<24:18, 66.30s/it][A
 16%|█▌        | 4/25 [04:35<21:02, 60.14s/it][A
 20%|██        | 5/25 [05:21<18:36, 55.83s/it][A
 24%|██▍       | 6/25 [06:08<16:51, 53.23s/it][A
 28%|██▊       | 7/25 [06:54<15:18, 51.02s/it][A
 32%|███▏      | 8/25 [07:39<14:00, 49.42s/it][A
 36%|███▌      | 9/25 [08:25<12:53, 48.34s/it][A
 40%|████      | 10/25 [09:11<11:53, 47.59s/it][A
 44%|████▍     | 11/25 [09:57<10:58, 47.04s/it][A
 48%|████▊     | 12/25 [10:42<10:06, 46.62s/it][A
 52%|█████▏    | 13/25 [11:28<09:16, 46.35s/it][A
 56%|█████▌    | 14/25 [13:04<11:13, 61.27s/it][A
 60%|██████    | 15/25 [14:42<12:02, 72.21s/it][A
 64%|██████▍   | 16/25 [16:18<11:54, 79.42s/it][A
 68%|██████▊   | 17/25 [17:55<11:16, 84.57s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:47<00:00, 76.30s/it] [A
 21%|██        | 4/19 [2:43:23<10:11:58, 2447.92s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:13, 45.55s/it][A
  8%|▊         | 2/25 [02:53<26:59, 70.40s/it][A
 12%|█▏        | 3/25 [03:45<23:42, 64.66s/it][A
 16%|█▌        | 4/25 [04:32<20:50, 59.54s/it][A
 20%|██        | 5/25 [05:18<18:28, 55.44s/it][A
 24%|██▍       | 6/25 [06:04<16:37, 52.50s/it][A
 28%|██▊       | 7/25 [06:50<15:09, 50.51s/it][A
 32%|███▏      | 8/25 [07:35<13:53, 49.02s/it][A
 36%|███▌      | 9/25 [08:21<12:47, 47.97s/it][A
 40%|████      | 10/25 [09:06<11:48, 47.24s/it][A
 44%|████▍     | 11/25 [09:52<10:54, 46.73s/it][A
 48%|████▊     | 12/25 [10:37<10:02, 46.36s/it][A
 52%|█████▏    | 13/25 [11:23<09:13, 46.14s/it][A
 56%|█████▌    | 14/25 [12:58<11:10, 60.97s/it][A
 60%|██████    | 15/25 [14:35<11:57, 71.72s/it][A
 64%|██████▍   | 16/25 [16:12<11:52, 79.17s/it][A
 68%|██████▊   | 17/25 [17:47<11:12, 84.08s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:41<00:00, 76.06s/it] [A
 26%|██▋       | 5/19 [3:24:26<9:32:11, 2452.22s/it] 
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:10, 45.43s/it][A
  8%|▊         | 2/25 [02:53<26:53, 70.17s/it][A
 12%|█▏        | 3/25 [03:38<23:01, 62.81s/it][A
 16%|█▌        | 4/25 [04:24<20:13, 57.76s/it][A
 20%|██        | 5/25 [05:10<18:01, 54.10s/it][A
 24%|██▍       | 6/25 [05:56<16:19, 51.55s/it][A
 28%|██▊       | 7/25 [06:42<14:58, 49.90s/it][A
 32%|███▏      | 8/25 [07:27<13:47, 48.66s/it][A
 36%|███▌      | 9/25 [08:13<12:44, 47.78s/it][A
 40%|████      | 10/25 [08:59<11:48, 47.22s/it][A
 44%|████▍     | 11/25 [09:45<10:54, 46.75s/it][A
 48%|████▊     | 12/25 [10:30<10:03, 46.43s/it][A
 52%|█████▏    | 13/25 [11:16<09:14, 46.22s/it][A
 56%|█████▌    | 14/25 [12:58<11:30, 62.77s/it][A
 60%|██████    | 15/25 [14:33<12:05, 72.53s/it][A
 64%|██████▍   | 16/25 [16:08<11:53, 79.25s/it][A
 68%|██████▊   | 17/25 [17:43<11:12, 84.00s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:33<00:00, 75.74s/it] [A
 32%|███▏      | 6/19 [4:05:17<8:51:16, 2452.05s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:13, 45.54s/it][A
  8%|▊         | 2/25 [02:54<27:01, 70.51s/it][A
 12%|█▏        | 3/25 [03:39<23:07, 63.06s/it][A
 16%|█▌        | 4/25 [04:25<20:15, 57.87s/it][A
 20%|██        | 5/25 [05:11<18:03, 54.19s/it][A
 24%|██▍       | 6/25 [05:57<16:21, 51.65s/it][A
 28%|██▊       | 7/25 [06:42<14:58, 49.92s/it][A
 32%|███▏      | 8/25 [07:28<13:47, 48.69s/it][A
 36%|███▌      | 9/25 [08:14<12:45, 47.82s/it][A
 40%|████      | 10/25 [09:00<11:48, 47.23s/it][A
 44%|████▍     | 11/25 [09:46<10:55, 46.79s/it][A
 48%|████▊     | 12/25 [10:31<10:04, 46.47s/it][A
 52%|█████▏    | 13/25 [11:17<09:14, 46.24s/it][A
 56%|█████▌    | 14/25 [12:57<11:24, 62.19s/it][A
 60%|██████    | 15/25 [14:32<12:01, 72.16s/it][A
 64%|██████▍   | 16/25 [16:07<11:51, 79.06s/it][A
 68%|██████▊   | 17/25 [17:42<11:10, 83.86s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:58<00:00, 76.72s/it] [A
 37%|███▋      | 7/19 [4:46:33<8:11:48, 2459.04s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:10, 45.42s/it][A
  8%|▊         | 2/25 [02:53<26:52, 70.11s/it][A
 12%|█▏        | 3/25 [03:38<23:00, 62.73s/it][A
 16%|█▌        | 4/25 [04:24<20:10, 57.63s/it][A
 20%|██        | 5/25 [05:10<18:00, 54.04s/it][A
 24%|██▍       | 6/25 [05:55<16:19, 51.53s/it][A
 28%|██▊       | 7/25 [06:41<14:56, 49.82s/it][A
 32%|███▏      | 8/25 [07:27<13:45, 48.57s/it][A
 36%|███▌      | 9/25 [08:12<12:43, 47.71s/it][A
 40%|████      | 10/25 [08:58<11:48, 47.21s/it][A
 44%|████▍     | 11/25 [09:44<10:55, 46.79s/it][A
 48%|████▊     | 12/25 [10:30<10:04, 46.53s/it][A
 52%|█████▏    | 13/25 [11:16<09:16, 46.36s/it][A
 56%|█████▌    | 14/25 [12:52<11:14, 61.30s/it][A
 60%|██████    | 15/25 [14:29<11:59, 71.94s/it][A
 64%|██████▍   | 16/25 [16:05<11:52, 79.17s/it][A
 68%|██████▊   | 17/25 [17:42<11:16, 84.53s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:39<00:00, 75.99s/it] [A
 42%|████▏     | 8/19 [5:27:29<7:30:38, 2458.09s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:11, 45.48s/it][A
  8%|▊         | 2/25 [02:53<26:53, 70.15s/it][A
 12%|█▏        | 3/25 [03:38<23:00, 62.77s/it][A
 16%|█▌        | 4/25 [04:24<20:09, 57.59s/it][A
 20%|██        | 5/25 [05:10<18:01, 54.05s/it][A
 24%|██▍       | 6/25 [05:55<16:19, 51.54s/it][A
 28%|██▊       | 7/25 [06:41<14:58, 49.89s/it][A
 32%|███▏      | 8/25 [07:27<13:46, 48.63s/it][A
 36%|███▌      | 9/25 [08:13<12:43, 47.73s/it][A
 40%|████      | 10/25 [08:59<11:48, 47.22s/it][A
 44%|████▍     | 11/25 [09:44<10:54, 46.77s/it][A
 48%|████▊     | 12/25 [10:30<10:03, 46.43s/it][A
 52%|█████▏    | 13/25 [11:16<09:14, 46.23s/it][A
 56%|█████▌    | 14/25 [12:52<11:15, 61.37s/it][A
 60%|██████    | 15/25 [14:28<11:56, 71.66s/it][A
 64%|██████▍   | 16/25 [16:05<11:52, 79.18s/it][A
 68%|██████▊   | 17/25 [17:41<11:14, 84.29s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:37<00:00, 75.91s/it] [A
 47%|████▋     | 9/19 [6:08:20<6:49:21, 2456.15s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:09, 45.39s/it][A
  8%|▊         | 2/25 [02:53<26:56, 70.30s/it][A
 12%|█▏        | 3/25 [03:39<23:05, 62.98s/it][A
 16%|█▌        | 4/25 [04:25<20:13, 57.79s/it][A
 20%|██        | 5/25 [05:11<18:02, 54.14s/it][A
 24%|██▍       | 6/25 [05:58<16:32, 52.23s/it][A
 28%|██▊       | 7/25 [06:49<15:34, 51.90s/it][A
 32%|███▏      | 8/25 [07:35<14:10, 50.03s/it][A
 36%|███▌      | 9/25 [08:21<12:58, 48.68s/it][A
 40%|████      | 10/25 [09:07<11:58, 47.88s/it][A
 44%|████▍     | 11/25 [09:53<11:02, 47.30s/it][A
 48%|████▊     | 12/25 [10:38<10:09, 46.86s/it][A
 52%|█████▏    | 13/25 [11:24<09:18, 46.51s/it][A
 56%|█████▌    | 14/25 [13:01<11:19, 61.73s/it][A
 60%|██████    | 15/25 [14:40<12:06, 72.67s/it][A
 64%|██████▍   | 16/25 [16:18<12:04, 80.54s/it][A
 68%|██████▊   | 17/25 [17:58<11:30, 86.28s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [32:00<00:00, 76.83s/it] [A
 53%|█████▎    | 10/19 [6:49:39<6:09:27, 2463.01s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:14, 45.59s/it][A
  8%|▊         | 2/25 [02:52<26:51, 70.08s/it][A
 12%|█▏        | 3/25 [03:38<22:59, 62.69s/it][A
 16%|█▌        | 4/25 [04:26<20:22, 58.24s/it][A
 20%|██        | 5/25 [05:14<18:24, 55.24s/it][A
 24%|██▍       | 6/25 [06:00<16:35, 52.42s/it][A
 28%|██▊       | 7/25 [06:46<15:08, 50.47s/it][A
 32%|███▏      | 8/25 [07:32<13:54, 49.12s/it][A
 36%|███▌      | 9/25 [08:17<12:49, 48.09s/it][A
 40%|████      | 10/25 [09:04<11:53, 47.53s/it][A
 44%|████▍     | 11/25 [09:49<10:58, 47.03s/it][A
 48%|████▊     | 12/25 [10:35<10:06, 46.66s/it][A
 52%|█████▏    | 13/25 [11:21<09:17, 46.49s/it][A
 56%|█████▌    | 14/25 [13:08<11:51, 64.66s/it][A
 60%|██████    | 15/25 [14:45<12:22, 74.24s/it][A
 64%|██████▍   | 16/25 [16:29<12:28, 83.12s/it][A
 68%|██████▊   | 17/25 [18:06<11:38, 87.33s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [32:08<00:00, 77.13s/it] [A
 58%|█████▊    | 11/19 [7:31:04<5:29:17, 2469.71s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:14, 45.60s/it][A
  8%|▊         | 2/25 [02:53<26:59, 70.39s/it][A
 12%|█▏        | 3/25 [03:39<23:05, 62.98s/it][A
 16%|█▌        | 4/25 [04:25<20:14, 57.82s/it][A
 20%|██        | 5/25 [05:11<18:03, 54.19s/it][A
 24%|██▍       | 6/25 [05:56<16:21, 51.67s/it][A
 28%|██▊       | 7/25 [06:42<14:58, 49.90s/it][A
 32%|███▏      | 8/25 [07:28<13:47, 48.67s/it][A
 36%|███▌      | 9/25 [08:13<12:43, 47.69s/it][A
 40%|████      | 10/25 [08:59<11:46, 47.09s/it][A
 44%|████▍     | 11/25 [09:45<10:53, 46.65s/it][A
 48%|████▊     | 12/25 [10:30<10:02, 46.38s/it][A
 52%|█████▏    | 13/25 [11:16<09:13, 46.15s/it][A
 56%|█████▌    | 14/25 [12:52<11:11, 61.03s/it][A
 60%|██████    | 15/25 [14:28<11:56, 71.61s/it][A
 64%|██████▍   | 16/25 [16:05<11:52, 79.19s/it][A
 68%|██████▊   | 17/25 [17:41<11:15, 84.38s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:35<00:00, 75.81s/it] [A
 63%|██████▎   | 12/19 [8:11:58<4:47:34, 2464.87s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:10, 45.44s/it][A
  8%|▊         | 2/25 [02:54<27:04, 70.62s/it][A
 12%|█▏        | 3/25 [03:40<23:07, 63.05s/it][A
 16%|█▌        | 4/25 [04:25<20:13, 57.81s/it][A
 20%|██        | 5/25 [05:10<18:00, 54.02s/it][A
 24%|██▍       | 6/25 [05:59<16:32, 52.25s/it][A
 28%|██▊       | 7/25 [06:44<15:05, 50.31s/it][A
 32%|███▏      | 8/25 [07:30<13:51, 48.89s/it][A
 36%|███▌      | 9/25 [08:25<13:30, 50.68s/it][A
 40%|████      | 10/25 [09:11<12:18, 49.26s/it][A
 44%|████▍     | 11/25 [09:56<11:14, 48.16s/it][A
 48%|████▊     | 12/25 [10:42<10:15, 47.37s/it][A
 52%|█████▏    | 13/25 [11:37<09:57, 49.78s/it][A
 56%|█████▌    | 14/25 [13:17<11:50, 64.62s/it][A
 60%|██████    | 15/25 [14:52<12:20, 74.03s/it][A
 64%|██████▍   | 16/25 [16:38<12:30, 83.43s/it][A
 68%|██████▊   | 17/25 [18:14<11:38, 87.29s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [32:11<00:00, 77.27s/it] [A
 68%|██████▊   | 13/19 [8:53:26<4:07:11, 2471.88s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:46<18:36, 46.54s/it][A
  8%|▊         | 2/25 [02:54<27:13, 71.00s/it][A
 12%|█▏        | 3/25 [03:41<23:19, 63.62s/it][A
 16%|█▌        | 4/25 [04:26<20:24, 58.31s/it][A
 20%|██        | 5/25 [05:12<18:10, 54.52s/it][A
 24%|██▍       | 6/25 [05:58<16:25, 51.87s/it][A

In [78]:
1

1

In [198]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:38<15:35, 38.99s/it][A
  8%|▊         | 2/25 [02:40<24:26, 63.76s/it][A
 12%|█▏        | 3/25 [03:18<20:30, 55.91s/it][A
 16%|█▌        | 4/25 [03:55<17:39, 50.45s/it][A
 20%|██        | 5/25 [04:33<15:31, 46.57s/it][A
 24%|██▍       | 6/25 [05:10<13:51, 43.79s/it][A
 28%|██▊       | 7/25 [05:48<12:34, 41.91s/it][A
 32%|███▏      | 8/25 [06:25<11:30, 40.60s/it][A
 36%|███▌      | 9/25 [07:03<10:34, 39.67s/it][A
 40%|████      | 10/25 [07:40<09:45, 39.06s/it][A
 44%|████▍     | 11/25 [08:18<09:01, 38.66s/it][A
 48%|████▊     | 12/25 [08:56<08:18, 38.38s/it][A
 52%|█████▏    | 13/25 [09:33<07:37, 38.15s/it][A
 56%|█████▌    | 14/25 [10:41<08:35, 46.87s/it][A
 60%|██████    | 15/25 [11:48<08:49, 52.94s/it][A
 64%|██████▍   | 16/25 [12:56<08:36, 57.39s/it][A
 68%|██████▊   | 17/25 [14:04<08:05, 60.67s/it][A
 72%|███████▏  | 18/25 [15:12<07:20, 62.99s/it][A
 76%|█████

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:25<00:00, 63.41s/it] [A
  5%|▌         | 1/20 [27:37<8:45:00, 1657.91s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:38<15:20, 38.37s/it][A
  8%|▊         | 2/25 [02:39<24:15, 63.29s/it][A
 12%|█▏        | 3/25 [03:17<20:25, 55.72s/it][A
 16%|█▌        | 4/25 [03:56<17:41, 50.53s/it][A
 20%|██        | 5/25 [04:34<15:35, 46.77s/it][A
 24%|██▍       | 6/25 [05:12<13:57, 44.08s/it][A
 28%|██▊       | 7/25 [05:49<12:40, 42.23s/it][A
 32%|███▏      | 8/25 [06:27<11:35, 40.91s/it][A
 36%|███▌      | 9/25 [07:06<10:44, 40.30s/it][A
 40%|████      | 10/25 [07:44<09:54, 39.60s/it][A
 44%|████▍     | 11/25 [08:22<09:06, 39.06s/it][A
 48%|████▊     | 12/25 [08:59<08:21, 38.58s/it][A
 52%|█████▏    | 13/25 [09:37<07:40, 38.36s/it][A
 56%|█████▌    | 14/25 [10:45<08:39, 47.23s/it][A
 60%|██████    | 15/25 [11:53<08:53, 53.36s/it][A
 64%|██████▍   | 16/25 [13:01<08:40, 57.85s/it][A
 68%|██████▊   | 17/25 [14:09<08:07, 60.90s/it]

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:27<00:00, 63.51s/it] [A
 10%|█         | 2/20 [55:33<8:19:00, 1663.35s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:08, 37.85s/it][A
  8%|▊         | 2/25 [02:37<23:56, 62.47s/it][A
 12%|█▏        | 3/25 [03:15<20:10, 55.04s/it][A
 16%|█▌        | 4/25 [03:53<17:26, 49.82s/it][A
 20%|██        | 5/25 [04:30<15:23, 46.17s/it][A
 24%|██▍       | 6/25 [05:08<13:48, 43.60s/it][A
 28%|██▊       | 7/25 [05:45<12:32, 41.80s/it][A
 32%|███▏      | 8/25 [06:23<11:29, 40.57s/it][A
 36%|███▌      | 9/25 [07:01<10:35, 39.70s/it][A
 40%|████      | 10/25 [07:38<09:46, 39.09s/it][A
 44%|████▍     | 11/25 [08:16<09:01, 38.67s/it][A
 48%|████▊     | 12/25 [08:54<08:18, 38.32s/it][A
 52%|█████▏    | 13/25 [09:31<07:36, 38.08s/it][A
 56%|█████▌    | 14/25 [10:40<08:40, 47.27s/it][A
 60%|██████    | 15/25 [11:48<08:55, 53.54s/it][A
 64%|██████▍   | 16/25 [12:57<08:44, 58.29s/it][A
 68%|██████▊   | 17/25 [14:08<08:16, 62.09s/it]

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:27<00:00, 63.49s/it] [A
 15%|█▌        | 3/20 [1:23:28<7:52:13, 1666.67s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:09, 37.88s/it][A
  8%|▊         | 2/25 [02:38<24:02, 62.71s/it][A
 12%|█▏        | 3/25 [03:16<20:16, 55.29s/it][A
 16%|█▌        | 4/25 [03:54<17:32, 50.14s/it][A
 20%|██        | 5/25 [04:32<15:30, 46.55s/it][A
 24%|██▍       | 6/25 [05:10<13:55, 43.97s/it][A
 28%|██▊       | 7/25 [05:48<12:39, 42.17s/it][A
 32%|███▏      | 8/25 [06:26<11:34, 40.87s/it][A
 36%|███▌      | 9/25 [07:04<10:41, 40.10s/it][A
 40%|████      | 10/25 [07:42<09:51, 39.44s/it][A
 44%|████▍     | 11/25 [08:20<09:05, 38.96s/it][A
 48%|████▊     | 12/25 [08:58<08:20, 38.51s/it][A
 52%|█████▏    | 13/25 [09:36<07:40, 38.37s/it][A
 56%|█████▌    | 14/25 [10:44<08:41, 47.38s/it][A
 60%|██████    | 15/25 [11:51<08:53, 53.32s/it][A
 64%|██████▍   | 16/25 [13:00<08:41, 57.99s/it][A
 68%|██████▊   | 17/25 [14:07<08:06, 60.75s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:24<00:00, 63.37s/it] [A
 20%|██        | 4/20 [1:51:20<7:24:53, 1668.36s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:38<15:12, 38.03s/it][A
  8%|▊         | 2/25 [02:38<24:05, 62.85s/it][A
 12%|█▏        | 3/25 [03:16<20:16, 55.30s/it][A
 16%|█▌        | 4/25 [03:54<17:30, 50.03s/it][A
 20%|██        | 5/25 [04:31<15:27, 46.36s/it][A
 24%|██▍       | 6/25 [05:09<13:51, 43.78s/it][A
 28%|██▊       | 7/25 [05:47<12:36, 42.02s/it][A
 32%|███▏      | 8/25 [06:25<11:33, 40.80s/it][A
 36%|███▌      | 9/25 [07:03<10:37, 39.85s/it][A
 40%|████      | 10/25 [07:40<09:48, 39.20s/it][A
 44%|████▍     | 11/25 [08:19<09:05, 38.97s/it][A
 48%|████▊     | 12/25 [08:57<08:23, 38.76s/it][A
 52%|█████▏    | 13/25 [09:35<07:40, 38.39s/it][A
 56%|█████▌    | 14/25 [10:42<08:37, 47.00s/it][A
 60%|██████    | 15/25 [11:50<08:52, 53.23s/it][A
 64%|██████▍   | 16/25 [12:58<08:40, 57.85s/it][A
 68%|██████▊   | 17/25 [14:06<08:05, 60.70s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:43<00:00, 64.12s/it] [A
 25%|██▌       | 5/20 [2:19:29<6:58:37, 1674.53s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:07, 45.32s/it][A
  8%|▊         | 2/25 [02:52<26:47, 69.90s/it][A
 12%|█▏        | 3/25 [03:37<22:54, 62.46s/it][A
 16%|█▌        | 4/25 [04:22<20:02, 57.26s/it][A
 20%|██        | 5/25 [05:07<17:52, 53.62s/it][A
 24%|██▍       | 6/25 [05:53<16:10, 51.09s/it][A
 28%|██▊       | 7/25 [06:38<14:46, 49.26s/it][A
 32%|███▏      | 8/25 [07:16<13:02, 46.02s/it][A
 36%|███▌      | 9/25 [07:54<11:35, 43.49s/it][A
 40%|████      | 10/25 [08:31<10:26, 41.74s/it][A
 44%|████▍     | 11/25 [09:09<09:28, 40.57s/it][A
 48%|████▊     | 12/25 [09:47<08:35, 39.67s/it][A
 52%|█████▏    | 13/25 [10:26<07:54, 39.55s/it][A
 56%|█████▌    | 14/25 [11:34<08:50, 48.22s/it][A
 60%|██████    | 15/25 [12:44<09:05, 54.57s/it][A
 64%|██████▍   | 16/25 [13:52<08:47, 58.63s/it][A
 68%|██████▊   | 17/25 [15:00<08:10, 61.31s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [27:16<00:00, 65.45s/it] [A
 30%|███       | 6/20 [2:48:13<6:34:11, 1689.41s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:07, 37.81s/it][A
  8%|▊         | 2/25 [02:38<24:01, 62.66s/it][A
 12%|█▏        | 3/25 [03:16<20:15, 55.27s/it][A
 16%|█▌        | 4/25 [03:54<17:29, 49.98s/it][A
 20%|██        | 5/25 [04:31<15:24, 46.23s/it][A
 24%|██▍       | 6/25 [05:09<13:49, 43.65s/it][A
 28%|██▊       | 7/25 [05:48<12:40, 42.23s/it][A
 32%|███▏      | 8/25 [06:27<11:41, 41.27s/it][A
 36%|███▌      | 9/25 [07:06<10:50, 40.66s/it][A
 40%|████      | 10/25 [07:45<10:02, 40.16s/it][A
 44%|████▍     | 11/25 [08:24<09:17, 39.84s/it][A
 48%|████▊     | 12/25 [09:03<08:34, 39.59s/it][A
 52%|█████▏    | 13/25 [09:42<07:53, 39.42s/it][A
 56%|█████▌    | 14/25 [10:51<08:50, 48.20s/it][A
 60%|██████    | 15/25 [12:01<09:09, 54.96s/it][A
 64%|██████▍   | 16/25 [13:10<08:50, 59.00s/it][A
 68%|██████▊   | 17/25 [14:18<08:14, 61.83s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:39<00:00, 63.98s/it] [A
 35%|███▌      | 7/20 [3:16:20<6:05:53, 1688.74s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:06, 37.78s/it][A
  8%|▊         | 2/25 [02:39<24:06, 62.88s/it][A
 12%|█▏        | 3/25 [03:17<20:19, 55.44s/it][A
 16%|█▌        | 4/25 [03:55<17:33, 50.18s/it][A
 20%|██        | 5/25 [04:33<15:30, 46.52s/it][A
 24%|██▍       | 6/25 [05:11<13:55, 43.96s/it][A
 28%|██▊       | 7/25 [05:48<12:37, 42.11s/it][A
 32%|███▏      | 8/25 [06:26<11:33, 40.80s/it][A
 36%|███▌      | 9/25 [07:04<10:37, 39.85s/it][A
 40%|████      | 10/25 [07:41<09:47, 39.17s/it][A
 44%|████▍     | 11/25 [08:20<09:04, 38.89s/it][A
 48%|████▊     | 12/25 [08:57<08:20, 38.49s/it][A
 52%|█████▏    | 13/25 [09:35<07:38, 38.24s/it][A
 56%|█████▌    | 14/25 [10:43<08:39, 47.19s/it][A
 60%|██████    | 15/25 [11:50<08:52, 53.23s/it][A
 64%|██████▍   | 16/25 [12:57<08:36, 57.42s/it][A
 68%|██████▊   | 17/25 [14:05<08:04, 60.55s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:25<00:00, 63.41s/it] [A
 40%|████      | 8/20 [3:44:13<5:36:46, 1683.84s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:02, 37.62s/it][A
  8%|▊         | 2/25 [02:37<23:52, 62.28s/it][A
 12%|█▏        | 3/25 [03:15<20:08, 54.94s/it][A
 16%|█▌        | 4/25 [03:53<17:25, 49.80s/it][A
 20%|██        | 5/25 [04:30<15:22, 46.11s/it][A
 24%|██▍       | 6/25 [05:08<13:46, 43.52s/it][A
 28%|██▊       | 7/25 [05:45<12:30, 41.71s/it][A
 32%|███▏      | 8/25 [06:22<11:27, 40.44s/it][A
 36%|███▌      | 9/25 [07:00<10:33, 39.57s/it][A
 40%|████      | 10/25 [07:38<09:44, 38.98s/it][A
 44%|████▍     | 11/25 [08:15<08:59, 38.53s/it][A
 48%|████▊     | 12/25 [08:52<08:16, 38.17s/it][A
 52%|█████▏    | 13/25 [09:30<07:35, 38.00s/it][A
 56%|█████▌    | 14/25 [10:38<08:37, 47.06s/it][A
 60%|██████    | 15/25 [11:47<08:55, 53.55s/it][A
 64%|██████▍   | 16/25 [12:55<08:41, 57.97s/it][A
 68%|██████▊   | 17/25 [14:02<08:05, 60.74s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:22<00:00, 63.30s/it] [A
 45%|████▌     | 9/20 [4:12:02<5:07:55, 1679.58s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:06, 37.78s/it][A
  8%|▊         | 2/25 [02:37<23:56, 62.48s/it][A
 12%|█▏        | 3/25 [03:15<20:10, 55.03s/it][A
 16%|█▌        | 4/25 [03:53<17:26, 49.84s/it][A
 20%|██        | 5/25 [04:31<15:25, 46.29s/it][A
 24%|██▍       | 6/25 [05:08<13:50, 43.71s/it][A
 28%|██▊       | 7/25 [05:46<12:34, 41.90s/it][A
 32%|███▏      | 8/25 [06:24<11:30, 40.63s/it][A
 36%|███▌      | 9/25 [07:02<10:35, 39.75s/it][A
 40%|████      | 10/25 [07:39<09:47, 39.14s/it][A
 44%|████▍     | 11/25 [08:17<09:02, 38.72s/it][A
 48%|████▊     | 12/25 [08:54<08:18, 38.34s/it][A
 52%|█████▏    | 13/25 [09:32<07:37, 38.13s/it][A
 56%|█████▌    | 14/25 [10:40<08:37, 47.08s/it][A
 60%|██████    | 15/25 [11:49<08:56, 53.61s/it][A
 64%|██████▍   | 16/25 [12:58<08:43, 58.17s/it][A
 68%|██████▊   | 17/25 [14:07<08:11, 61.47s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:38<00:00, 63.93s/it] [A
 50%|█████     | 10/20 [4:40:07<4:40:10, 1681.05s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:05, 37.71s/it][A
  8%|▊         | 2/25 [02:40<24:11, 63.09s/it][A
 12%|█▏        | 3/25 [03:17<20:21, 55.51s/it][A
 16%|█▌        | 4/25 [03:56<17:39, 50.47s/it][A
 20%|██        | 5/25 [04:35<15:37, 46.87s/it][A
 24%|██▍       | 6/25 [05:13<14:00, 44.21s/it][A
 28%|██▊       | 7/25 [05:51<12:46, 42.56s/it][A
 32%|███▏      | 8/25 [06:29<11:37, 41.01s/it][A
 36%|███▌      | 9/25 [07:06<10:39, 39.99s/it][A
 40%|████      | 10/25 [07:44<09:49, 39.32s/it][A
 44%|████▍     | 11/25 [08:22<09:03, 38.85s/it][A
 48%|████▊     | 12/25 [08:59<08:20, 38.47s/it][A
 52%|█████▏    | 13/25 [09:37<07:38, 38.24s/it][A
 56%|█████▌    | 14/25 [10:45<08:39, 47.23s/it][A
 60%|██████    | 15/25 [11:53<08:54, 53.43s/it][A
 64%|██████▍   | 16/25 [13:02<08:41, 57.92s/it][A
 68%|██████▊   | 17/25 [14:09<08:05, 60.64s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:29<00:00, 63.56s/it] [A
 55%|█████▌    | 11/20 [5:07:45<4:11:06, 1674.04s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:01, 45.05s/it][A
  8%|▊         | 2/25 [02:46<26:05, 68.05s/it][A
 12%|█▏        | 3/25 [03:24<21:37, 58.99s/it][A
 16%|█▌        | 4/25 [04:02<18:25, 52.65s/it][A
 20%|██        | 5/25 [04:40<16:03, 48.17s/it][A
 24%|██▍       | 6/25 [05:17<14:14, 44.97s/it][A
 28%|██▊       | 7/25 [05:55<12:50, 42.79s/it][A
 32%|███▏      | 8/25 [06:33<11:42, 41.32s/it][A
 36%|███▌      | 9/25 [07:11<10:43, 40.25s/it][A
 40%|████      | 10/25 [07:49<09:53, 39.59s/it][A
 44%|████▍     | 11/25 [08:26<09:07, 39.08s/it][A
 48%|████▊     | 12/25 [09:04<08:21, 38.61s/it][A
 52%|█████▏    | 13/25 [09:42<07:39, 38.30s/it][A
 56%|█████▌    | 14/25 [10:49<08:37, 47.05s/it][A
 60%|██████    | 15/25 [11:57<08:51, 53.19s/it][A
 64%|██████▍   | 16/25 [13:06<08:41, 57.99s/it][A
 68%|██████▊   | 17/25 [14:13<08:07, 60.89s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:31<00:00, 63.65s/it] [A
 60%|██████    | 12/20 [5:35:25<3:42:38, 1669.81s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:38<15:25, 38.56s/it][A
  8%|▊         | 2/25 [02:38<24:10, 63.06s/it][A
 12%|█▏        | 3/25 [03:23<21:08, 57.68s/it][A
 16%|█▌        | 4/25 [04:08<18:48, 53.76s/it][A
 20%|██        | 5/25 [04:53<17:01, 51.06s/it][A
 24%|██▍       | 6/25 [05:30<14:52, 46.98s/it][A
 28%|██▊       | 7/25 [06:08<13:15, 44.17s/it][A
 32%|███▏      | 8/25 [06:46<11:57, 42.23s/it][A
 36%|███▌      | 9/25 [07:23<10:53, 40.82s/it][A
 40%|████      | 10/25 [08:01<09:59, 39.94s/it][A
 44%|████▍     | 11/25 [08:39<09:11, 39.42s/it][A
 48%|████▊     | 12/25 [09:17<08:26, 38.97s/it][A
 52%|█████▏    | 13/25 [09:55<07:44, 38.69s/it][A
 56%|█████▌    | 14/25 [11:04<08:45, 47.75s/it][A
 60%|██████    | 15/25 [12:11<08:55, 53.53s/it][A
 64%|██████▍   | 16/25 [13:19<08:40, 57.79s/it][A
 68%|██████▊   | 17/25 [14:27<08:06, 60.81s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:49<00:00, 64.38s/it] [A
 65%|██████▌   | 13/20 [6:03:22<3:15:05, 1672.16s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:01, 37.57s/it][A
  8%|▊         | 2/25 [02:38<23:59, 62.61s/it][A
 12%|█▏        | 3/25 [03:16<20:16, 55.31s/it][A
 16%|█▌        | 4/25 [03:54<17:30, 50.03s/it][A
 20%|██        | 5/25 [04:32<15:27, 46.38s/it][A
 24%|██▍       | 6/25 [05:10<13:51, 43.75s/it][A
 28%|██▊       | 7/25 [05:47<12:34, 41.94s/it][A
 32%|███▏      | 8/25 [06:25<11:30, 40.62s/it][A
 36%|███▌      | 9/25 [07:02<10:35, 39.70s/it][A
 40%|████      | 10/25 [07:40<09:46, 39.10s/it][A
 44%|████▍     | 11/25 [08:18<09:01, 38.67s/it][A
 48%|████▊     | 12/25 [08:55<08:18, 38.35s/it][A
 52%|█████▏    | 13/25 [09:33<07:37, 38.12s/it][A
 56%|█████▌    | 14/25 [10:40<08:36, 46.93s/it][A
 60%|██████    | 15/25 [11:47<08:49, 52.95s/it][A
 64%|██████▍   | 16/25 [12:55<08:37, 57.45s/it][A
 68%|██████▊   | 17/25 [14:03<08:03, 60.43s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:26<00:00, 63.47s/it] [A
 70%|███████   | 14/20 [6:30:57<2:46:41, 1666.98s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:01, 37.55s/it][A
  8%|▊         | 2/25 [02:38<23:56, 62.44s/it][A
 12%|█▏        | 3/25 [03:15<20:08, 54.93s/it][A
 16%|█▌        | 4/25 [03:53<17:23, 49.71s/it][A
 20%|██        | 5/25 [04:30<15:21, 46.05s/it][A
 24%|██▍       | 6/25 [05:07<13:45, 43.45s/it][A
 28%|██▊       | 7/25 [05:45<12:29, 41.63s/it][A
 32%|███▏      | 8/25 [06:22<11:26, 40.36s/it][A
 36%|███▌      | 9/25 [06:59<10:31, 39.44s/it][A
 40%|████      | 10/25 [07:37<09:42, 38.82s/it][A
 44%|████▍     | 11/25 [08:14<08:58, 38.45s/it][A
 48%|████▊     | 12/25 [08:52<08:17, 38.24s/it][A
 52%|█████▏    | 13/25 [09:30<07:35, 37.99s/it][A
 56%|█████▌    | 14/25 [10:36<08:32, 46.62s/it][A
 60%|██████    | 15/25 [11:44<08:48, 52.83s/it][A
 64%|██████▍   | 16/25 [12:51<08:33, 57.07s/it][A
 68%|██████▊   | 17/25 [13:57<07:59, 59.96s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:13<00:00, 62.92s/it] [A
 75%|███████▌  | 15/20 [6:58:18<2:18:15, 1659.11s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:06, 37.77s/it][A
  8%|▊         | 2/25 [02:37<23:56, 62.46s/it][A
 12%|█▏        | 3/25 [03:15<20:10, 55.01s/it][A
 16%|█▌        | 4/25 [03:53<17:26, 49.83s/it][A
 20%|██        | 5/25 [04:30<15:23, 46.19s/it][A
 24%|██▍       | 6/25 [05:08<13:47, 43.56s/it][A
 28%|██▊       | 7/25 [05:45<12:32, 41.79s/it][A
 32%|███▏      | 8/25 [06:23<11:28, 40.51s/it][A
 36%|███▌      | 9/25 [07:01<10:33, 39.60s/it][A
 40%|████      | 10/25 [07:38<09:44, 38.99s/it][A
 44%|████▍     | 11/25 [08:16<09:00, 38.59s/it][A
 48%|████▊     | 12/25 [08:53<08:17, 38.24s/it][A
 52%|█████▏    | 13/25 [09:31<07:37, 38.09s/it][A
 56%|█████▌    | 14/25 [10:38<08:34, 46.77s/it][A
 60%|██████    | 15/25 [11:45<08:47, 52.76s/it][A
 64%|██████▍   | 16/25 [12:52<08:34, 57.21s/it][A
 68%|██████▊   | 17/25 [13:59<08:01, 60.15s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:13<00:00, 62.92s/it] [A
 80%|████████  | 16/20 [7:25:39<1:50:14, 1653.65s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:05, 37.73s/it][A
  8%|▊         | 2/25 [02:38<23:59, 62.60s/it][A
 12%|█▏        | 3/25 [03:15<20:12, 55.10s/it][A
 16%|█▌        | 4/25 [03:53<17:27, 49.88s/it][A
 20%|██        | 5/25 [04:31<15:24, 46.23s/it][A
 24%|██▍       | 6/25 [05:08<13:48, 43.60s/it][A
 28%|██▊       | 7/25 [05:46<12:32, 41.78s/it][A
 32%|███▏      | 8/25 [06:23<11:28, 40.50s/it][A
 36%|███▌      | 9/25 [07:01<10:32, 39.54s/it][A
 40%|████      | 10/25 [07:38<09:43, 38.88s/it][A
 44%|████▍     | 11/25 [08:15<08:58, 38.45s/it][A
 48%|████▊     | 12/25 [08:53<08:16, 38.17s/it][A
 52%|█████▏    | 13/25 [09:31<07:36, 38.02s/it][A
 56%|█████▌    | 14/25 [10:38<08:34, 46.80s/it][A
 60%|██████    | 15/25 [11:46<08:50, 53.07s/it][A
 64%|██████▍   | 16/25 [12:54<08:37, 57.53s/it][A
 68%|██████▊   | 17/25 [14:02<08:05, 60.72s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:17<00:00, 63.11s/it] [A
 85%|████████▌ | 17/20 [7:53:05<1:22:34, 1651.42s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:03, 37.63s/it][A
  8%|▊         | 2/25 [02:37<23:55, 62.39s/it][A
 12%|█▏        | 3/25 [03:15<20:08, 54.95s/it][A
 16%|█▌        | 4/25 [03:52<17:24, 49.73s/it][A
 20%|██        | 5/25 [04:30<15:22, 46.11s/it][A
 24%|██▍       | 6/25 [05:08<13:47, 43.54s/it][A
 28%|██▊       | 7/25 [05:45<12:32, 41.80s/it][A
 32%|███▏      | 8/25 [06:30<12:05, 42.70s/it][A
 36%|███▌      | 9/25 [07:09<11:02, 41.42s/it][A
 40%|████      | 10/25 [07:46<10:04, 40.30s/it][A
 44%|████▍     | 11/25 [08:24<09:12, 39.49s/it][A
 48%|████▊     | 12/25 [09:01<08:25, 38.86s/it][A
 52%|█████▏    | 13/25 [09:39<07:41, 38.47s/it][A
 56%|█████▌    | 14/25 [10:46<08:36, 46.97s/it][A
 60%|██████    | 15/25 [11:54<08:54, 53.41s/it][A
 64%|██████▍   | 16/25 [13:01<08:37, 57.46s/it][A
 68%|██████▊   | 17/25 [14:08<08:02, 60.28s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:22<00:00, 63.31s/it] [A
 90%|█████████ | 18/20 [8:20:36<55:02, 1651.24s/it]  
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:03, 37.64s/it][A
  8%|▊         | 2/25 [02:38<24:01, 62.65s/it][A
 12%|█▏        | 3/25 [03:16<20:13, 55.16s/it][A
 16%|█▌        | 4/25 [03:53<17:27, 49.88s/it][A
 20%|██        | 5/25 [04:31<15:24, 46.25s/it][A
 24%|██▍       | 6/25 [05:09<13:48, 43.62s/it][A
 28%|██▊       | 7/25 [05:46<12:33, 41.84s/it][A
 32%|███▏      | 8/25 [06:24<11:29, 40.53s/it][A
 36%|███▌      | 9/25 [07:01<10:34, 39.63s/it][A
 40%|████      | 10/25 [07:39<09:45, 39.04s/it][A
 44%|████▍     | 11/25 [08:17<09:02, 38.73s/it][A
 48%|████▊     | 12/25 [08:54<08:18, 38.34s/it][A
 52%|█████▏    | 13/25 [09:32<07:38, 38.22s/it][A
 56%|█████▌    | 14/25 [10:40<08:37, 47.09s/it][A
 60%|██████    | 15/25 [11:49<08:55, 53.51s/it][A
 64%|██████▍   | 16/25 [12:57<08:40, 57.82s/it][A
 68%|██████▊   | 17/25 [14:04<08:05, 60.69s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:23<00:00, 63.35s/it] [A
 95%|█████████▌| 19/20 [8:48:07<27:31, 1651.37s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:37<15:04, 37.68s/it][A
  8%|▊         | 2/25 [02:38<23:58, 62.55s/it][A
 12%|█▏        | 3/25 [03:16<20:14, 55.20s/it][A
 16%|█▌        | 4/25 [03:53<17:27, 49.89s/it][A
 20%|██        | 5/25 [04:31<15:24, 46.20s/it][A
 24%|██▍       | 6/25 [05:09<13:48, 43.63s/it][A
 28%|██▊       | 7/25 [05:46<12:32, 41.83s/it][A
 32%|███▏      | 8/25 [06:24<11:30, 40.59s/it][A
 36%|███▌      | 9/25 [07:01<10:34, 39.66s/it][A
 40%|████      | 10/25 [07:39<09:45, 39.06s/it][A
 44%|████▍     | 11/25 [08:17<09:00, 38.60s/it][A
 48%|████▊     | 12/25 [08:54<08:17, 38.30s/it][A
 52%|█████▏    | 13/25 [09:32<07:37, 38.10s/it][A
 56%|█████▌    | 14/25 [10:41<08:42, 47.54s/it][A
 60%|██████    | 15/25 [11:49<08:56, 53.62s/it][A
 64%|██████▍   | 16/25 [12:57<08:41, 57.98s/it][A
 68%|██████▊   | 17/25 [14:05<08:07, 60.96s/it

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [26:23<00:00, 63.32s/it] [A
100%|██████████| 20/20 [9:15:39<00:00, 1666.95s/it]
CPU times: user 11h 1min 59s, sys: 1h 12min 32s, total: 12h 14min 32s
Wall time: 9h 15min 39s


In [None]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:48<19:23, 48.47s/it][A
  8%|▊         | 2/25 [02:59<28:02, 73.16s/it][A
 12%|█▏        | 3/25 [03:49<24:15, 66.17s/it][A
 16%|█▌        | 4/25 [04:39<21:29, 61.43s/it][A
 20%|██        | 5/25 [05:27<19:10, 57.54s/it][A
 24%|██▍       | 6/25 [06:16<17:19, 54.71s/it][A
 28%|██▊       | 7/25 [07:05<15:54, 53.00s/it][A
 32%|███▏      | 8/25 [07:55<14:47, 52.18s/it][A
 36%|███▌      | 9/25 [08:44<13:39, 51.24s/it][A
 40%|████      | 10/25 [09:32<12:35, 50.40s/it][A
 44%|████▍     | 11/25 [10:20<11:35, 49.68s/it][A
 48%|████▊     | 12/25 [11:08<10:39, 49.22s/it][A
 52%|█████▏    | 13/25 [11:58<09:50, 49.23s/it][A
 56%|█████▌    | 14/25 [13:45<12:12, 66.56s/it][A
 60%|██████    | 15/25 [15:35<13:15, 79.59s/it][A
 64%|██████▍   | 16/25 [17:22<13:11, 87.97s/it][A
 68%|██████▊   | 17/25 [19:10<12:31, 94.00s/it][A
 72%|███████▏  | 18/25 [20:57<11:25, 97.94s/it][A
 76%|█████

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [33:34<00:00, 80.58s/it] [A
  5%|▌         | 1/20 [36:09<11:26:58, 2169.38s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:48<19:17, 48.22s/it][A
  8%|▊         | 2/25 [03:00<28:05, 73.30s/it][A
 12%|█▏        | 3/25 [03:48<24:08, 65.83s/it][A
 16%|█▌        | 4/25 [04:36<21:11, 60.54s/it][A
 20%|██        | 5/25 [05:24<18:56, 56.81s/it][A
 24%|██▍       | 6/25 [06:12<17:10, 54.22s/it][A
 28%|██▊       | 7/25 [07:01<15:44, 52.46s/it][A
 32%|███▏      | 8/25 [07:49<14:30, 51.18s/it][A
 36%|███▌      | 9/25 [08:37<13:23, 50.22s/it][A
 40%|████      | 10/25 [09:25<12:23, 49.59s/it][A
 44%|████▍     | 11/25 [10:13<11:27, 49.14s/it][A
 48%|████▊     | 12/25 [11:01<10:34, 48.80s/it][A
 52%|█████▏    | 13/25 [11:50<09:46, 48.89s/it][A
 56%|█████▌    | 14/25 [13:38<12:11, 66.54s/it][A
 60%|██████    | 15/25 [15:26<13:09, 78.99s/it][A
 64%|██████▍   | 16/25 [17:15<13:10, 87.85s/it][A
 68%|██████▊   | 17/25 [19:02<12:29, 93.71s/it

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [33:22<00:00, 80.11s/it] [A
 10%|█         | 2/20 [1:12:40<10:52:44, 2175.78s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:48<19:13, 48.04s/it][A
  8%|▊         | 2/25 [02:58<27:54, 72.80s/it][A
 12%|█▏        | 3/25 [03:48<24:07, 65.79s/it][A
 16%|█▌        | 4/25 [04:36<21:10, 60.52s/it][A
 20%|██        | 5/25 [05:24<18:56, 56.83s/it][A
 24%|██▍       | 6/25 [06:12<17:10, 54.23s/it][A
 28%|██▊       | 7/25 [07:00<15:42, 52.38s/it][A
 32%|███▏      | 8/25 [07:48<14:28, 51.10s/it][A
 36%|███▌      | 9/25 [08:36<13:23, 50.20s/it][A
 40%|████      | 10/25 [09:25<12:24, 49.64s/it][A
 44%|████▍     | 11/25 [10:15<11:39, 49.96s/it][A
 48%|████▊     | 12/25 [11:04<10:42, 49.41s/it][A
 52%|█████▏    | 13/25 [11:52<09:48, 49.02s/it][A
 56%|█████▌    | 14/25 [13:39<12:11, 66.48s/it][A
 60%|██████    | 15/25 [15:27<13:09, 78.99s/it][A
 64%|██████▍   | 16/25 [17:14<13:07, 87.47s/it][A
 68%|██████▊   | 17/25 [19:01<12:25, 93.19s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [33:26<00:00, 80.25s/it] [A
 15%|█▌        | 3/20 [1:49:22<10:18:42, 2183.67s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:50<20:06, 50.28s/it][A
  8%|▊         | 2/25 [03:00<28:30, 74.38s/it][A
 12%|█▏        | 3/25 [03:49<24:28, 66.77s/it][A
 16%|█▌        | 4/25 [04:38<21:28, 61.36s/it][A
 20%|██        | 5/25 [05:36<20:03, 60.18s/it][A
 24%|██▍       | 6/25 [06:24<17:58, 56.76s/it][A
 28%|██▊       | 7/25 [07:13<16:18, 54.34s/it][A
 32%|███▏      | 8/25 [08:01<14:52, 52.50s/it][A
 36%|███▌      | 9/25 [08:50<13:42, 51.39s/it][A
 40%|████      | 10/25 [09:40<12:44, 51.00s/it][A
 44%|████▍     | 11/25 [10:56<13:39, 58.56s/it][A
 48%|████▊     | 12/25 [11:55<12:41, 58.60s/it][A
 52%|█████▏    | 13/25 [13:05<12:24, 62.05s/it][A
 56%|█████▌    | 14/25 [15:12<14:57, 81.62s/it][A
 60%|██████    | 15/25 [17:10<15:23, 92.34s/it][A
 64%|██████▍   | 16/25 [18:57<14:32, 96.93s/it][A
 68%|██████▊   | 17/25 [20:46<13:23, 100.44s

In [65]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:42<17:05, 42.72s/it][A
  8%|▊         | 2/25 [02:36<24:32, 64.01s/it][A
 12%|█▏        | 3/25 [03:22<21:28, 58.56s/it][A
 16%|█▌        | 4/25 [04:05<18:52, 53.94s/it][A
 20%|██        | 5/25 [04:48<16:53, 50.66s/it][A
 24%|██▍       | 6/25 [05:31<15:17, 48.29s/it][A
 28%|██▊       | 7/25 [06:16<14:11, 47.29s/it][A
 32%|███▏      | 8/25 [07:07<13:43, 48.43s/it][A
 36%|███▌      | 9/25 [07:58<13:08, 49.26s/it][A
 40%|████      | 10/25 [08:43<12:01, 48.13s/it][A
 44%|████▍     | 11/25 [09:26<10:50, 46.45s/it][A
 48%|████▊     | 12/25 [10:09<09:50, 45.42s/it][A
 52%|█████▏    | 13/25 [10:54<09:01, 45.17s/it][A
 56%|█████▌    | 14/25 [12:33<11:16, 61.52s/it][A
 60%|██████    | 15/25 [14:09<11:57, 71.79s/it][A
 64%|██████▍   | 16/25 [15:44<11:49, 78.84s/it][A
 68%|██████▊   | 17/25 [17:30<11:34, 86.87s/it][A
 72%|███████▏  | 18/25 [19:05<10:25, 89.31s/it][A
 76%|█████

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:10<00:00, 72.41s/it] [A
  5%|▌         | 1/20 [32:50<10:24:00, 1970.57s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:42<17:02, 42.61s/it][A
  8%|▊         | 2/25 [02:35<24:25, 63.70s/it][A
 12%|█▏        | 3/25 [03:21<21:25, 58.41s/it][A
 16%|█▌        | 4/25 [04:06<19:01, 54.38s/it][A
 20%|██        | 5/25 [04:49<16:57, 50.87s/it][A
 24%|██▍       | 6/25 [05:32<15:23, 48.60s/it][A
 28%|██▊       | 7/25 [06:18<14:22, 47.90s/it][A
 32%|███▏      | 8/25 [07:06<13:31, 47.74s/it][A
 36%|███▌      | 9/25 [07:49<12:22, 46.39s/it][A
 40%|████      | 10/25 [08:32<11:18, 45.26s/it][A
 44%|████▍     | 11/25 [09:15<10:23, 44.57s/it][A
 48%|████▊     | 12/25 [09:57<09:31, 44.00s/it][A
 52%|█████▏    | 13/25 [10:40<08:42, 43.55s/it][A
 56%|█████▌    | 14/25 [12:16<10:53, 59.37s/it][A
 60%|██████    | 15/25 [13:51<11:40, 70.05s/it][A
 64%|██████▍   | 16/25 [15:28<11:43, 78.20s/it][A
 68%|██████▊   | 17/25 [17:03<11:06, 83.32s/it

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [29:44<00:00, 71.36s/it] [A
 10%|█         | 2/20 [1:05:49<9:51:54, 1973.03s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:42<17:05, 42.73s/it][A
  8%|▊         | 2/25 [02:37<24:37, 64.23s/it][A
 12%|█▏        | 3/25 [03:20<21:12, 57.85s/it][A
 16%|█▌        | 4/25 [04:04<18:49, 53.79s/it][A
 20%|██        | 5/25 [04:46<16:47, 50.38s/it][A
 24%|██▍       | 6/25 [05:29<15:15, 48.16s/it][A
 28%|██▊       | 7/25 [06:14<14:08, 47.13s/it][A
 32%|███▏      | 8/25 [06:57<13:01, 45.98s/it][A
 36%|███▌      | 9/25 [07:41<12:02, 45.15s/it][A
 40%|████      | 10/25 [08:24<11:11, 44.77s/it][A
 44%|████▍     | 11/25 [09:15<10:52, 46.59s/it][A
 48%|████▊     | 12/25 [09:59<09:54, 45.73s/it][A
 52%|█████▏    | 13/25 [10:42<08:59, 44.97s/it][A
 56%|█████▌    | 14/25 [12:17<10:57, 59.79s/it][A
 60%|██████    | 15/25 [13:56<11:56, 71.64s/it][A
 64%|██████▍   | 16/25 [15:31<11:48, 78.76s/it][A
 68%|██████▊   | 17/25 [17:09<11:14, 84.37s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:03<00:00, 74.52s/it] [A
 15%|█▌        | 3/20 [1:40:17<9:27:07, 2001.59s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:55<22:18, 55.78s/it][A
  8%|▊         | 2/25 [02:56<28:48, 75.17s/it][A
 12%|█▏        | 3/25 [03:44<24:36, 67.09s/it][A
 16%|█▌        | 4/25 [04:27<21:00, 60.00s/it][A
 20%|██        | 5/25 [05:10<18:14, 54.73s/it][A
 24%|██▍       | 6/25 [05:53<16:12, 51.17s/it][A
 28%|██▊       | 7/25 [06:39<14:55, 49.75s/it][A
 32%|███▏      | 8/25 [07:31<14:14, 50.29s/it][A
 36%|███▌      | 9/25 [08:14<12:51, 48.25s/it][A
 40%|████      | 10/25 [08:58<11:45, 47.06s/it][A
 44%|████▍     | 11/25 [09:46<11:01, 47.22s/it][A
 48%|████▊     | 12/25 [10:29<09:58, 46.01s/it][A
 52%|█████▏    | 13/25 [11:17<09:19, 46.66s/it][A
 56%|█████▌    | 14/25 [12:53<11:14, 61.31s/it][A
 60%|██████    | 15/25 [14:27<11:52, 71.22s/it][A
 64%|██████▍   | 16/25 [16:00<11:38, 77.63s/it][A
 68%|██████▊   | 17/25 [17:39<11:13, 84.15s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:24<00:00, 72.98s/it] [A
 20%|██        | 4/20 [2:14:53<8:59:42, 2023.88s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:44<17:55, 44.82s/it][A
  8%|▊         | 2/25 [02:50<26:28, 69.06s/it][A
 12%|█▏        | 3/25 [03:44<23:43, 64.68s/it][A
 16%|█▌        | 4/25 [04:45<22:14, 63.53s/it][A
 20%|██        | 5/25 [05:51<21:24, 64.24s/it][A
 24%|██▍       | 6/25 [06:38<18:44, 59.17s/it][A
 28%|██▊       | 7/25 [07:23<16:28, 54.89s/it][A
 32%|███▏      | 8/25 [08:08<14:42, 51.92s/it][A
 36%|███▌      | 9/25 [08:58<13:38, 51.13s/it][A
 40%|████      | 10/25 [09:56<13:18, 53.22s/it][A
 44%|████▍     | 11/25 [10:59<13:07, 56.26s/it][A
 48%|████▊     | 12/25 [11:48<11:44, 54.16s/it][A
 52%|█████▏    | 13/25 [12:31<10:08, 50.72s/it][A
 56%|█████▌    | 14/25 [14:53<14:19, 78.13s/it][A
 60%|██████    | 15/25 [17:06<15:45, 94.57s/it][A
 64%|██████▍   | 16/25 [19:31<16:27, 109.70s/it][A
 68%|██████▊   | 17/25 [22:02<16:16, 122.06s

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [35:27<00:00, 85.09s/it] [A
 25%|██▌       | 5/20 [2:53:37<8:48:26, 2113.79s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:44<17:45, 44.40s/it][A
  8%|▊         | 2/25 [02:42<25:26, 66.36s/it][A
 12%|█▏        | 3/25 [03:28<22:07, 60.35s/it][A
 16%|█▌        | 4/25 [04:13<19:32, 55.84s/it][A
 20%|██        | 5/25 [04:58<17:28, 52.41s/it][A
 24%|██▍       | 6/25 [05:43<15:55, 50.27s/it][A
 28%|██▊       | 7/25 [06:29<14:41, 48.96s/it][A
 32%|███▏      | 8/25 [07:12<13:22, 47.21s/it][A
 36%|███▌      | 9/25 [07:54<12:12, 45.76s/it][A
 40%|████      | 10/25 [08:38<11:18, 45.26s/it][A
 44%|████▍     | 11/25 [09:21<10:22, 44.48s/it][A
 48%|████▊     | 12/25 [10:09<09:52, 45.54s/it][A
 52%|█████▏    | 13/25 [10:52<08:58, 44.85s/it][A
 56%|█████▌    | 14/25 [12:34<11:19, 61.77s/it][A
 60%|██████    | 15/25 [14:17<12:23, 74.31s/it][A
 64%|██████▍   | 16/25 [15:55<12:13, 81.49s/it][A
 68%|██████▊   | 17/25 [17:32<11:29, 86.16s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:23<00:00, 72.94s/it] [A
 30%|███       | 6/20 [3:27:21<8:06:57, 2086.97s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:23, 43.48s/it][A
  8%|▊         | 2/25 [02:39<24:57, 65.12s/it][A
 12%|█▏        | 3/25 [03:23<21:34, 58.82s/it][A
 16%|█▌        | 4/25 [04:07<19:02, 54.40s/it][A
 20%|██        | 5/25 [04:58<17:47, 53.38s/it][A
 24%|██▍       | 6/25 [05:44<16:13, 51.22s/it][A
 28%|██▊       | 7/25 [06:38<15:37, 52.10s/it][A
 32%|███▏      | 8/25 [07:25<14:16, 50.39s/it][A
 36%|███▌      | 9/25 [08:08<12:50, 48.19s/it][A
 40%|████      | 10/25 [08:54<11:55, 47.73s/it][A
 44%|████▍     | 11/25 [09:39<10:56, 46.86s/it][A
 48%|████▊     | 12/25 [10:23<09:58, 46.01s/it][A
 52%|█████▏    | 13/25 [11:11<09:18, 46.51s/it][A
 56%|█████▌    | 14/25 [12:48<11:18, 61.65s/it][A
 60%|██████    | 15/25 [14:28<12:11, 73.20s/it][A
 64%|██████▍   | 16/25 [16:07<12:09, 81.04s/it][A
 68%|██████▊   | 17/25 [17:50<11:39, 87.43s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:43<00:00, 73.74s/it] [A
 35%|███▌      | 7/20 [4:01:18<7:28:54, 2071.90s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:12, 45.53s/it][A
  8%|▊         | 2/25 [02:38<25:15, 65.87s/it][A
 12%|█▏        | 3/25 [03:24<21:57, 59.90s/it][A
 16%|█▌        | 4/25 [04:10<19:26, 55.53s/it][A
 20%|██        | 5/25 [04:53<17:17, 51.86s/it][A
 24%|██▍       | 6/25 [05:38<15:47, 49.87s/it][A
 28%|██▊       | 7/25 [06:24<14:33, 48.54s/it][A
 32%|███▏      | 8/25 [07:15<14:00, 49.43s/it][A
 36%|███▌      | 9/25 [08:05<13:10, 49.41s/it][A
 40%|████      | 10/25 [08:53<12:19, 49.27s/it][A
 44%|████▍     | 11/25 [09:38<11:10, 47.89s/it][A
 48%|████▊     | 12/25 [10:21<10:03, 46.42s/it][A
 52%|█████▏    | 13/25 [11:08<09:20, 46.70s/it][A
 56%|█████▌    | 14/25 [12:52<11:40, 63.72s/it][A
 60%|██████    | 15/25 [14:32<12:26, 74.64s/it][A
 64%|██████▍   | 16/25 [16:12<12:20, 82.27s/it][A
 68%|██████▊   | 17/25 [17:51<11:37, 87.19s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:39<00:00, 73.58s/it] [A
 40%|████      | 8/20 [4:35:11<6:52:04, 2060.40s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:30, 43.79s/it][A
  8%|▊         | 2/25 [02:40<25:07, 65.53s/it][A
 12%|█▏        | 3/25 [03:30<22:19, 60.90s/it][A
 16%|█▌        | 4/25 [04:25<20:41, 59.12s/it][A
 20%|██        | 5/25 [05:09<18:14, 54.71s/it][A
 24%|██▍       | 6/25 [06:01<17:05, 54.00s/it][A
 28%|██▊       | 7/25 [06:45<15:13, 50.76s/it][A
 32%|███▏      | 8/25 [07:28<13:46, 48.63s/it][A
 36%|███▌      | 9/25 [08:17<12:59, 48.71s/it][A
 40%|████      | 10/25 [09:02<11:53, 47.60s/it][A
 44%|████▍     | 11/25 [09:50<11:05, 47.53s/it][A
 48%|████▊     | 12/25 [10:38<10:20, 47.70s/it][A
 52%|█████▏    | 13/25 [11:21<09:16, 46.35s/it][A
 56%|█████▌    | 14/25 [13:05<11:40, 63.66s/it][A
 60%|██████    | 15/25 [14:45<12:25, 74.52s/it][A
 64%|██████▍   | 16/25 [16:26<12:23, 82.64s/it][A
 68%|██████▊   | 17/25 [18:03<11:35, 86.93s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:53<00:00, 74.16s/it] [A
 45%|████▌     | 9/20 [5:09:17<6:16:55, 2055.93s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:44<17:50, 44.62s/it][A
  8%|▊         | 2/25 [02:43<25:36, 66.82s/it][A
 12%|█▏        | 3/25 [03:27<22:02, 60.10s/it][A
 16%|█▌        | 4/25 [04:11<19:18, 55.17s/it][A
 20%|██        | 5/25 [04:54<17:09, 51.47s/it][A
 24%|██▍       | 6/25 [05:36<15:27, 48.80s/it][A
 28%|██▊       | 7/25 [06:19<14:06, 47.03s/it][A
 32%|███▏      | 8/25 [07:07<13:22, 47.21s/it][A
 36%|███▌      | 9/25 [07:50<12:13, 45.87s/it][A
 40%|████      | 10/25 [08:34<11:19, 45.33s/it][A
 44%|████▍     | 11/25 [09:17<10:24, 44.64s/it][A
 48%|████▊     | 12/25 [10:02<09:42, 44.84s/it][A
 52%|█████▏    | 13/25 [10:47<09:00, 45.06s/it][A
 56%|█████▌    | 14/25 [12:28<11:18, 61.72s/it][A
 60%|██████    | 15/25 [14:08<12:11, 73.15s/it][A
 64%|██████▍   | 16/25 [15:45<12:03, 80.36s/it][A
 68%|██████▊   | 17/25 [17:26<11:31, 86.42s/i

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:23<00:00, 72.95s/it] [A
 50%|█████     | 10/20 [5:42:58<5:40:56, 2045.63s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:42<17:10, 42.95s/it][A
  8%|▊         | 2/25 [02:38<24:51, 64.87s/it][A
 12%|█▏        | 3/25 [03:23<21:29, 58.62s/it][A
 16%|█▌        | 4/25 [04:10<19:20, 55.25s/it][A
 20%|██        | 5/25 [05:01<17:57, 53.87s/it][A
 24%|██▍       | 6/25 [05:45<16:08, 50.95s/it][A
 28%|██▊       | 7/25 [06:36<15:18, 51.04s/it][A
 32%|███▏      | 8/25 [07:19<13:49, 48.78s/it][A
 36%|███▌      | 9/25 [08:04<12:38, 47.38s/it][A
 40%|████      | 10/25 [08:48<11:37, 46.49s/it][A
 44%|████▍     | 11/25 [09:31<10:37, 45.51s/it][A
 48%|████▊     | 12/25 [10:14<09:41, 44.72s/it][A
 52%|█████▏    | 13/25 [10:58<08:52, 44.38s/it][A
 56%|█████▌    | 14/25 [12:37<11:10, 60.91s/it][A
 60%|██████    | 15/25 [14:16<12:03, 72.36s/it][A
 64%|██████▍   | 16/25 [15:59<12:14, 81.60s/it][A
 68%|██████▊   | 17/25 [17:49<12:00, 90.08s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:49<00:00, 73.97s/it] [A
 55%|█████▌    | 11/20 [6:16:59<5:06:37, 2044.17s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [01:07<27:03, 67.63s/it][A
  8%|▊         | 2/25 [03:05<31:39, 82.60s/it][A
 12%|█▏        | 3/25 [03:48<26:00, 70.95s/it][A
 16%|█▌        | 4/25 [04:37<22:30, 64.31s/it][A
 20%|██        | 5/25 [05:21<19:20, 58.02s/it][A
 24%|██▍       | 6/25 [06:09<17:29, 55.23s/it][A
 28%|██▊       | 7/25 [06:53<15:29, 51.66s/it][A
 32%|███▏      | 8/25 [07:42<14:27, 51.05s/it][A
 36%|███▌      | 9/25 [08:26<13:02, 48.94s/it][A
 40%|████      | 10/25 [09:12<11:58, 47.89s/it][A
 44%|████▍     | 11/25 [09:56<10:55, 46.82s/it][A
 48%|████▊     | 12/25 [10:40<09:57, 45.99s/it][A
 52%|█████▏    | 13/25 [11:24<09:04, 45.42s/it][A
 56%|█████▌    | 14/25 [13:01<11:09, 60.84s/it][A
 60%|██████    | 15/25 [14:47<12:23, 74.33s/it][A
 64%|██████▍   | 16/25 [16:26<12:16, 81.82s/it][A
 68%|██████▊   | 17/25 [18:21<12:14, 91.78s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:17<00:00, 75.10s/it] [A
 60%|██████    | 12/20 [6:51:28<4:33:33, 2051.68s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:45<18:14, 45.61s/it][A
  8%|▊         | 2/25 [02:40<25:23, 66.26s/it][A
 12%|█▏        | 3/25 [03:27<22:11, 60.54s/it][A
 16%|█▌        | 4/25 [04:19<20:17, 57.99s/it][A
 20%|██        | 5/25 [05:06<18:16, 54.85s/it][A
 24%|██▍       | 6/25 [05:51<16:23, 51.77s/it][A
 28%|██▊       | 7/25 [06:36<14:57, 49.88s/it][A
 32%|███▏      | 8/25 [07:23<13:51, 48.94s/it][A
 36%|███▌      | 9/25 [08:09<12:49, 48.11s/it][A
 40%|████      | 10/25 [08:53<11:39, 46.64s/it][A
 44%|████▍     | 11/25 [09:37<10:43, 45.96s/it][A
 48%|████▊     | 12/25 [10:22<09:55, 45.84s/it][A
 52%|█████▏    | 13/25 [11:08<09:08, 45.70s/it][A
 56%|█████▌    | 14/25 [12:51<11:32, 62.97s/it][A
 60%|██████    | 15/25 [14:32<12:23, 74.32s/it][A
 64%|██████▍   | 16/25 [16:14<12:23, 82.64s/it][A
 68%|██████▊   | 17/25 [17:59<11:54, 89.28s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:22<00:00, 75.30s/it] [A
 65%|██████▌   | 13/20 [7:26:06<4:00:15, 2059.34s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:41<16:44, 41.87s/it][A
  8%|▊         | 2/25 [02:34<24:12, 63.14s/it][A
 12%|█▏        | 3/25 [03:24<21:38, 59.03s/it][A
 16%|█▌        | 4/25 [04:06<18:57, 54.18s/it][A
 20%|██        | 5/25 [04:50<16:57, 50.87s/it][A
 24%|██▍       | 6/25 [05:35<15:36, 49.29s/it][A
 28%|██▊       | 7/25 [06:19<14:19, 47.73s/it][A
 32%|███▏      | 8/25 [07:02<13:05, 46.19s/it][A
 36%|███▌      | 9/25 [07:44<11:59, 44.96s/it][A
 40%|████      | 10/25 [08:27<11:05, 44.34s/it][A
 44%|████▍     | 11/25 [09:10<10:14, 43.88s/it][A
 48%|████▊     | 12/25 [09:52<09:24, 43.44s/it][A
 52%|█████▏    | 13/25 [10:37<08:47, 43.94s/it][A
 56%|█████▌    | 14/25 [12:27<11:41, 63.80s/it][A
 60%|██████    | 15/25 [14:13<12:44, 76.46s/it][A
 64%|██████▍   | 16/25 [16:06<13:05, 87.29s/it][A
 68%|██████▊   | 17/25 [17:55<12:31, 93.94s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:49<00:00, 73.98s/it] [A
 70%|███████   | 14/20 [8:00:09<3:25:26, 2054.47s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:28, 43.69s/it][A
  8%|▊         | 2/25 [02:58<27:16, 71.14s/it][A
 12%|█▏        | 3/25 [03:44<23:19, 63.61s/it][A
 16%|█▌        | 4/25 [04:27<20:05, 57.43s/it][A
 20%|██        | 5/25 [05:14<18:01, 54.07s/it][A
 24%|██▍       | 6/25 [05:58<16:12, 51.17s/it][A
 28%|██▊       | 7/25 [06:45<14:56, 49.80s/it][A
 32%|███▏      | 8/25 [07:29<13:38, 48.12s/it][A
 36%|███▌      | 9/25 [08:12<12:26, 46.65s/it][A
 40%|████      | 10/25 [08:56<11:25, 45.69s/it][A
 44%|████▍     | 11/25 [09:40<10:32, 45.18s/it][A
 48%|████▊     | 12/25 [10:23<09:40, 44.68s/it][A
 52%|█████▏    | 13/25 [11:08<08:55, 44.66s/it][A
 56%|█████▌    | 14/25 [12:53<11:32, 62.97s/it][A
 60%|██████    | 15/25 [14:30<12:10, 73.07s/it][A
 64%|██████▍   | 16/25 [16:08<12:05, 80.66s/it][A
 68%|██████▊   | 17/25 [17:46<11:26, 85.87s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:31<00:00, 73.27s/it] [A
 75%|███████▌  | 15/20 [8:33:55<2:50:30, 2046.10s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:13, 43.06s/it][A
  8%|▊         | 2/25 [02:36<24:36, 64.20s/it][A
 12%|█▏        | 3/25 [03:24<21:48, 59.46s/it][A
 16%|█▌        | 4/25 [04:15<19:54, 56.88s/it][A
 20%|██        | 5/25 [05:00<17:43, 53.16s/it][A
 24%|██▍       | 6/25 [05:42<15:50, 50.01s/it][A
 28%|██▊       | 7/25 [06:26<14:23, 47.95s/it][A
 32%|███▏      | 8/25 [07:18<13:59, 49.41s/it][A
 36%|███▌      | 9/25 [08:03<12:46, 47.90s/it][A
 40%|████      | 10/25 [08:51<12:00, 48.01s/it][A
 44%|████▍     | 11/25 [09:42<11:22, 48.78s/it][A
 48%|████▊     | 12/25 [10:31<10:35, 48.91s/it][A
 52%|█████▏    | 13/25 [11:21<09:51, 49.27s/it][A
 56%|█████▌    | 14/25 [13:06<12:05, 65.98s/it][A
 60%|██████    | 15/25 [14:48<12:48, 76.86s/it][A
 64%|██████▍   | 16/25 [16:34<12:51, 85.67s/it][A
 68%|██████▊   | 17/25 [18:20<12:11, 91.50s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:17<00:00, 75.09s/it] [A
 80%|████████  | 16/20 [9:08:24<2:16:52, 2053.06s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:18, 43.28s/it][A
  8%|▊         | 2/25 [02:36<24:35, 64.17s/it][A
 12%|█▏        | 3/25 [03:20<21:17, 58.07s/it][A
 16%|█▌        | 4/25 [04:03<18:46, 53.63s/it][A
 20%|██        | 5/25 [04:46<16:49, 50.50s/it][A
 24%|██▍       | 6/25 [05:36<15:58, 50.43s/it][A
 28%|██▊       | 7/25 [06:25<15:01, 50.06s/it][A
 32%|███▏      | 8/25 [07:11<13:47, 48.65s/it][A
 36%|███▌      | 9/25 [07:55<12:39, 47.46s/it][A
 40%|████      | 10/25 [08:39<11:36, 46.40s/it][A
 44%|████▍     | 11/25 [09:31<11:09, 47.81s/it][A
 48%|████▊     | 12/25 [10:14<10:06, 46.64s/it][A
 52%|█████▏    | 13/25 [10:58<09:10, 45.85s/it][A
 56%|█████▌    | 14/25 [12:44<11:42, 63.89s/it][A
 60%|██████    | 15/25 [14:31<12:48, 76.84s/it][A
 64%|██████▍   | 16/25 [16:09<12:27, 83.04s/it][A
 68%|██████▊   | 17/25 [17:46<11:38, 87.34s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:41<00:00, 73.67s/it] [A
 85%|████████▌ | 17/20 [9:42:45<1:42:45, 2055.24s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:32, 43.86s/it][A
  8%|▊         | 2/25 [02:39<25:02, 65.32s/it][A
 12%|█▏        | 3/25 [03:24<21:45, 59.34s/it][A
 16%|█▌        | 4/25 [04:15<19:55, 56.91s/it][A
 20%|██        | 5/25 [05:03<18:01, 54.08s/it][A
 24%|██▍       | 6/25 [05:47<16:11, 51.15s/it][A
 28%|██▊       | 7/25 [06:31<14:43, 49.09s/it][A
 32%|███▏      | 8/25 [07:28<14:33, 51.35s/it][A
 36%|███▌      | 9/25 [08:13<13:10, 49.43s/it][A
 40%|████      | 10/25 [08:57<11:59, 47.93s/it][A
 44%|████▍     | 11/25 [09:43<10:58, 47.07s/it][A
 48%|████▊     | 12/25 [10:26<09:57, 45.99s/it][A
 52%|█████▏    | 13/25 [11:15<09:24, 47.01s/it][A
 56%|█████▌    | 14/25 [12:54<11:27, 62.52s/it][A
 60%|██████    | 15/25 [14:32<12:11, 73.17s/it][A
 64%|██████▍   | 16/25 [16:10<12:04, 80.49s/it][A
 68%|██████▊   | 17/25 [17:50<11:31, 86.44s/

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [30:52<00:00, 74.11s/it] [A
 90%|█████████ | 18/20 [10:18:06<1:09:10, 2075.17s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:34, 43.96s/it][A
  8%|▊         | 2/25 [02:39<25:04, 65.42s/it][A
 12%|█▏        | 3/25 [03:27<22:02, 60.11s/it][A
 16%|█▌        | 4/25 [04:19<20:11, 57.69s/it][A
 20%|██        | 5/25 [05:03<17:55, 53.79s/it][A
 24%|██▍       | 6/25 [05:51<16:25, 51.87s/it][A
 28%|██▊       | 7/25 [06:39<15:14, 50.80s/it][A
 32%|███▏      | 8/25 [07:32<14:32, 51.33s/it][A
 36%|███▌      | 9/25 [08:17<13:11, 49.49s/it][A
 40%|████      | 10/25 [09:05<12:14, 48.96s/it][A
 44%|████▍     | 11/25 [09:49<11:05, 47.53s/it][A
 48%|████▊     | 12/25 [10:32<10:01, 46.30s/it][A
 52%|█████▏    | 13/25 [11:15<09:03, 45.33s/it][A
 56%|█████▌    | 14/25 [13:08<11:59, 65.42s/it][A
 60%|██████    | 15/25 [14:52<12:52, 77.24s/it][A
 64%|██████▍   | 16/25 [16:40<12:56, 86.25s/it][A
 68%|██████▊   | 17/25 [18:30<12:27, 93.47s

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:59<00:00, 76.79s/it] [A
 95%|█████████▌| 19/20 [10:53:17<34:45, 2085.86s/it]  
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:43<17:12, 43.00s/it][A
  8%|▊         | 2/25 [02:37<24:40, 64.37s/it][A
 12%|█▏        | 3/25 [03:22<21:30, 58.65s/it][A
 16%|█▌        | 4/25 [04:07<19:07, 54.63s/it][A
 20%|██        | 5/25 [04:51<17:05, 51.26s/it][A
 24%|██▍       | 6/25 [05:35<15:35, 49.24s/it][A
 28%|██▊       | 7/25 [06:20<14:19, 47.76s/it][A
 32%|███▏      | 8/25 [07:04<13:15, 46.79s/it][A
 36%|███▌      | 9/25 [07:50<12:23, 46.48s/it][A
 40%|████      | 10/25 [08:33<11:24, 45.60s/it][A
 44%|████▍     | 11/25 [09:25<11:03, 47.39s/it][A
 48%|████▊     | 12/25 [10:09<10:04, 46.49s/it][A
 52%|█████▏    | 13/25 [10:53<09:09, 45.78s/it][A
 56%|█████▌    | 14/25 [12:36<11:32, 62.96s/it][A
 60%|██████    | 15/25 [14:15<12:17, 73.75s/it][A
 64%|██████▍   | 16/25 [15:58<12:22, 82.53s/it][A
 68%|██████▊   | 17/25 [17:40<11:45, 88.17s

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [31:32<00:00, 75.72s/it] [A
100%|██████████| 20/20 [11:28:02<00:00, 2064.15s/it]
CPU times: user 14h 35min 25s, sys: 1h 46min 6s, total: 16h 21min 31s
Wall time: 11h 28min 3s


In [68]:
bank_topics = optimizer._topic_bank.view_topics()

In [85]:
bank_topics.shape

(2514, 15)

In [69]:
bank_topics.head()

Unnamed: 0,Unnamed: 1,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
@default_class,както,0.0,0.0,5.497794e-11,0.0,1.985204e-08,2.223492e-14,9e-06,0.0,0.0,0.0,0.0,0.0,0.0,2.414336e-06,4.583967e-11
@default_class,гравитационный,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022702,0.0,0.0,0.0,0.0,0.0
@default_class,жена,0.0,0.00139,0.0,0.0,0.0,0.0001217113,0.0,3.405896e-15,0.0,0.0,0.0,0.0,0.0,1.091175e-13,8.552548e-05
@default_class,продолжительность,0.0,0.0,0.0,3.47069e-15,0.0,0.0,0.0,0.0004282006,0.0,0.0,3.47461e-08,0.0,0.0,0.0,0.0
@default_class,одновременно,0.0,0.00075,5.159791e-06,7.805681e-09,3.03166e-06,5.124066e-09,0.000146,8.667931e-13,0.000935,0.000564,1.033011e-07,0.0,0.0,2.076379e-05,1.407077e-09


In [83]:
bank_topics['topic_14'].sort_values(ascending=False)[:20]

@default_class  век             0.229964
                xx              0.053898
                xix             0.051478
                начало          0.043227
                первый          0.038318
                конец           0.035413
                второй          0.019743
                середина        0.018641
                время           0.017661
                половина        0.017191
                xviii           0.013792
                классический    0.013726
                xvii            0.012661
                возникать       0.011614
                эпоха           0.010723
                хх              0.010398
                новый           0.010231
                столетие        0.008743
                образ           0.008709
                период          0.008326
Name: topic_14, dtype: float64

In [67]:
optimizer._topic_bank.view_topic_scores()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
kernel_size,229.0,408.0,102.0,157.0,235.0,302.0,345.0,347.0,283.0,391.0,338.0,59.0,121.0,407.0,214.0
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_none,0.129687,0.123335,0.468014,0.249535,0.211076,0.246103,0.084003,0.103003,0.149715,0.085349,0.102518,0.177264,0.116741,0.102491,0.099711
top_tokens_coherence_score__tt_vw__wtrt_pwt__sem_none,1.133654,0.66788,1.629793,0.835258,0.835292,0.806799,0.434416,0.921406,0.700194,0.409935,0.473712,0.614014,0.58399,0.500487,0.975982
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_av,0.127684,0.121866,0.462997,0.246569,0.207621,0.243621,0.082943,0.101184,0.147921,0.084183,0.101309,0.173868,0.113667,0.101379,0.0973
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_pwt__sem_max,0.120821,0.114016,0.462851,0.245817,0.116343,0.24557,0.079723,0.083172,0.143387,0.07581,0.096531,0.103924,0.100431,0.100097,0.080213
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_none,0.46368,0.783485,0.763705,0.481053,0.594415,0.914586,0.645213,0.466021,0.734765,0.76588,0.903625,0.845633,0.492089,0.642143,0.650835
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_av,0.45275,0.771956,0.753009,0.470381,0.58359,0.903922,0.633082,0.455416,0.723801,0.753463,0.890064,0.83087,0.479616,0.631021,0.636409
intratext_coherence_score__tt_vw__cm_seg_weight__wtrt_ptw__sem_max,0.385764,0.677018,0.71897,0.41817,0.345956,0.879365,0.549258,0.345888,0.661878,0.648797,0.772138,0.63636,0.417972,0.574683,0.525002
intratext_coherence_score__tt_vw__cm_seg_length__wtrt_pwt__sem_none,1.092971,1.152834,1.069599,1.067143,1.082481,1.066313,1.213043,1.060516,1.096398,1.241758,1.356061,1.476266,1.247272,1.112245,1.442545
intratext_coherence_score__tt_vw__cm_seg_length__wtrt_pwt__sem_av,1.092971,1.152834,1.069599,1.067143,1.082481,1.066313,1.213043,1.060516,1.096398,1.241758,1.356061,1.476266,1.247272,1.112245,1.442545


In [73]:
list(COOC_VALUES2.keys())[:1]

[(('@default_class', 'както'), ('@default_class', 'понимать'))]

In [75]:
COOC_VALUES2[(('@default_class', 'понимать'), ('@default_class', 'както'))]

KeyError: (('@default_class', 'понимать'), ('@default_class', 'както'))

In [82]:
list(COOC_VALUES2.keys())[:1]

[(('@default_class', 'както'), ('@default_class', 'понимать'))]

In [81]:
COOC_VALUES2[(('@default_class', 'нервный'), ('@default_class', 'растение'))]

KeyError: (('@default_class', 'нервный'), ('@default_class', 'растение'))

In [None]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:02,  7.59s/it][A
  8%|▊         | 2/25 [00:28<04:26, 11.59s/it][A
 12%|█▏        | 3/25 [00:36<03:49, 10.43s/it][A
 16%|█▌        | 4/25 [00:44<03:23,  9.67s/it][A
 20%|██        | 5/25 [00:53<03:11,  9.56s/it][A
 24%|██▍       | 6/25 [01:02<02:59,  9.42s/it][A
 28%|██▊       | 7/25 [01:10<02:40,  8.94s/it][A
 32%|███▏      | 8/25 [01:17<02:25,  8.55s/it][A
 36%|███▌      | 9/25 [01:25<02:13,  8.32s/it][A
 40%|████      | 10/25 [01:33<02:03,  8.21s/it][A
 44%|████▍     | 11/25 [01:41<01:52,  8.06s/it][A
 48%|████▊     | 12/25 [01:49<01:43,  7.96s/it][A
 52%|█████▏    | 13/25 [01:57<01:35,  7.93s/it][A
 56%|█████▌    | 14/25 [02:14<01:59, 10.87s/it][A
 60%|██████    | 15/25 [02:32<02:09, 12.99s/it][A
 64%|██████▍   | 16/25 [02:50<02:09, 14.40s/it][A
 68%|██████▊   | 17/25 [03:07<02:01, 15.20s/it][A
 72%|███████▏  | 18/25 [03:24<01:50, 15.83s/it][A
 76%|█████

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:24<00:00, 12.96s/it][A
  5%|▌         | 1/20 [05:36<1:46:27, 336.19s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:02,  7.61s/it][A
  8%|▊         | 2/25 [00:28<04:26, 11.57s/it][A
 12%|█▏        | 3/25 [00:35<03:48, 10.36s/it][A
 16%|█▌        | 4/25 [00:43<03:21,  9.58s/it][A
 20%|██        | 5/25 [00:51<03:00,  9.01s/it][A
 24%|██▍       | 6/25 [00:59<02:43,  8.59s/it][A
 28%|██▊       | 7/25 [01:06<02:30,  8.34s/it][A
 32%|███▏      | 8/25 [01:14<02:17,  8.12s/it][A
 36%|███▌      | 9/25 [01:21<02:07,  7.96s/it][A
 40%|████      | 10/25 [01:29<01:58,  7.90s/it][A
 44%|████▍     | 11/25 [01:37<01:49,  7.80s/it][A
 48%|████▊     | 12/25 [01:44<01:40,  7.75s/it][A
 52%|█████▏    | 13/25 [01:52<01:33,  7.75s/it][A
 56%|█████▌    | 14/25 [02:10<01:58, 10.79s/it][A
 60%|██████    | 15/25 [02:27<02:06, 12.62s/it][A
 64%|██████▍   | 16/25 [02:44<02:05, 13.99s/it][A
 68%|██████▊   | 17/25 [03:01<01:59, 14.97s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:19<00:00, 12.76s/it][A
 10%|█         | 2/20 [11:26<1:42:09, 340.53s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:04,  7.67s/it][A
  8%|▊         | 2/25 [00:28<04:27, 11.61s/it][A
 12%|█▏        | 3/25 [00:36<03:48, 10.40s/it][A
 16%|█▌        | 4/25 [00:43<03:21,  9.61s/it][A
 20%|██        | 5/25 [00:51<03:00,  9.05s/it][A
 24%|██▍       | 6/25 [00:59<02:43,  8.63s/it][A
 28%|██▊       | 7/25 [01:07<02:33,  8.51s/it][A
 32%|███▏      | 8/25 [01:15<02:20,  8.26s/it][A
 36%|███▌      | 9/25 [01:22<02:09,  8.10s/it][A
 40%|████      | 10/25 [01:30<01:59,  7.99s/it][A
 44%|████▍     | 11/25 [01:38<01:50,  7.90s/it][A
 48%|████▊     | 12/25 [01:45<01:41,  7.81s/it][A
 52%|█████▏    | 13/25 [01:53<01:33,  7.79s/it][A
 56%|█████▌    | 14/25 [02:10<01:57, 10.65s/it][A
 60%|██████    | 15/25 [02:28<02:05, 12.57s/it][A
 64%|██████▍   | 16/25 [02:45<02:06, 14.03s/it][A
 68%|██████▊   | 17/25 [03:02<01:59, 15.00s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:19<00:00, 12.78s/it][A
 15%|█▌        | 3/20 [17:17<1:37:18, 343.45s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:02,  7.59s/it][A
  8%|▊         | 2/25 [00:28<04:25, 11.54s/it][A
 12%|█▏        | 3/25 [00:35<03:47, 10.35s/it][A
 16%|█▌        | 4/25 [00:44<03:26,  9.84s/it][A
 20%|██        | 5/25 [00:54<03:14,  9.75s/it][A
 24%|██▍       | 6/25 [01:01<02:53,  9.11s/it][A
 28%|██▊       | 7/25 [01:09<02:37,  8.75s/it][A
 32%|███▏      | 8/25 [01:17<02:23,  8.41s/it][A
 36%|███▌      | 9/25 [01:24<02:10,  8.16s/it][A
 40%|████      | 10/25 [01:32<02:00,  8.03s/it][A
 44%|████▍     | 11/25 [01:40<01:50,  7.90s/it][A
 48%|████▊     | 12/25 [01:47<01:42,  7.85s/it][A
 52%|█████▏    | 13/25 [01:55<01:34,  7.88s/it][A
 56%|█████▌    | 14/25 [02:13<01:57, 10.70s/it][A
 60%|██████    | 15/25 [02:30<02:07, 12.78s/it][A
 64%|██████▍   | 16/25 [02:48<02:08, 14.32s/it][A
 68%|██████▊   | 17/25 [03:06<02:02, 15.36s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:26<00:00, 13.05s/it][A
 20%|██        | 4/20 [23:15<1:32:46, 347.92s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:09<03:50,  9.59s/it][A
  8%|▊         | 2/25 [00:30<05:00, 13.06s/it][A
 12%|█▏        | 3/25 [00:38<04:10, 11.40s/it][A
 16%|█▌        | 4/25 [00:46<03:36, 10.31s/it][A
 20%|██        | 5/25 [00:53<03:09,  9.48s/it][A
 24%|██▍       | 6/25 [01:01<02:49,  8.94s/it][A
 28%|██▊       | 7/25 [01:09<02:34,  8.59s/it][A
 32%|███▏      | 8/25 [01:17<02:25,  8.58s/it][A
 36%|███▌      | 9/25 [01:26<02:16,  8.53s/it][A
 40%|████      | 10/25 [01:33<02:05,  8.37s/it][A
 44%|████▍     | 11/25 [01:41<01:54,  8.20s/it][A
 48%|████▊     | 12/25 [01:49<01:44,  8.07s/it][A
 52%|█████▏    | 13/25 [01:58<01:40,  8.38s/it][A
 56%|█████▌    | 14/25 [02:16<02:03, 11.19s/it][A
 60%|██████    | 15/25 [02:34<02:11, 13.14s/it][A
 64%|██████▍   | 16/25 [02:51<02:10, 14.55s/it][A
 68%|██████▊   | 17/25 [03:09<02:03, 15.49s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:26<00:00, 13.07s/it][A
 25%|██▌       | 5/20 [29:13<1:27:42, 350.82s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:05,  7.72s/it][A
  8%|▊         | 2/25 [00:28<04:29, 11.70s/it][A
 12%|█▏        | 3/25 [00:36<03:50, 10.46s/it][A
 16%|█▌        | 4/25 [00:44<03:25,  9.77s/it][A
 20%|██        | 5/25 [00:51<03:01,  9.10s/it][A
 24%|██▍       | 6/25 [00:59<02:44,  8.65s/it][A
 28%|██▊       | 7/25 [01:07<02:31,  8.40s/it][A
 32%|███▏      | 8/25 [01:15<02:19,  8.23s/it][A
 36%|███▌      | 9/25 [01:23<02:13,  8.32s/it][A
 40%|████      | 10/25 [01:31<02:02,  8.14s/it][A
 44%|████▍     | 11/25 [01:40<01:56,  8.34s/it][A
 48%|████▊     | 12/25 [01:49<01:50,  8.53s/it][A
 52%|█████▏    | 13/25 [01:57<01:39,  8.31s/it][A
 56%|█████▌    | 14/25 [02:14<02:01, 11.03s/it][A
 60%|██████    | 15/25 [02:33<02:15, 13.57s/it][A
 64%|██████▍   | 16/25 [02:52<02:15, 15.07s/it][A
 68%|██████▊   | 17/25 [03:09<02:05, 15.69s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:28<00:00, 13.14s/it][A
 30%|███       | 6/20 [35:12<1:22:25, 353.28s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:01,  7.58s/it][A
  8%|▊         | 2/25 [00:28<04:28, 11.65s/it][A
 12%|█▏        | 3/25 [00:37<03:59, 10.89s/it][A
 16%|█▌        | 4/25 [00:47<03:38, 10.41s/it][A
 20%|██        | 5/25 [00:56<03:20, 10.02s/it][A
 24%|██▍       | 6/25 [01:05<03:05,  9.75s/it][A
 28%|██▊       | 7/25 [01:14<02:52,  9.61s/it][A
 32%|███▏      | 8/25 [01:23<02:40,  9.47s/it][A
 36%|███▌      | 9/25 [01:31<02:22,  8.90s/it][A
 40%|████      | 10/25 [01:39<02:08,  8.56s/it][A
 44%|████▍     | 11/25 [01:46<01:55,  8.27s/it][A
 48%|████▊     | 12/25 [01:54<01:44,  8.07s/it][A
 52%|█████▏    | 13/25 [02:02<01:35,  7.98s/it][A
 56%|█████▌    | 14/25 [02:19<01:58, 10.78s/it][A
 60%|██████    | 15/25 [02:36<02:07, 12.80s/it][A
 64%|██████▍   | 16/25 [02:55<02:10, 14.53s/it][A
 68%|██████▊   | 17/25 [03:14<02:06, 15.77s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:30<00:00, 13.21s/it][A
 35%|███▌      | 7/20 [41:12<1:17:01, 355.52s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:03,  7.64s/it][A
  8%|▊         | 2/25 [00:29<04:30, 11.76s/it][A
 12%|█▏        | 3/25 [00:38<04:01, 10.97s/it][A
 16%|█▌        | 4/25 [00:47<03:39, 10.46s/it][A
 20%|██        | 5/25 [00:55<03:13,  9.69s/it][A
 24%|██▍       | 6/25 [01:02<02:51,  9.05s/it][A
 28%|██▊       | 7/25 [01:12<02:44,  9.11s/it][A
 32%|███▏      | 8/25 [01:20<02:31,  8.89s/it][A
 36%|███▌      | 9/25 [01:28<02:17,  8.58s/it][A
 40%|████      | 10/25 [01:36<02:05,  8.36s/it][A
 44%|████▍     | 11/25 [01:43<01:54,  8.17s/it][A
 48%|████▊     | 12/25 [01:51<01:44,  8.02s/it][A
 52%|█████▏    | 13/25 [01:59<01:36,  8.01s/it][A
 56%|█████▌    | 14/25 [02:16<01:58, 10.79s/it][A
 60%|██████    | 15/25 [02:34<02:08, 12.88s/it][A
 64%|██████▍   | 16/25 [02:52<02:09, 14.40s/it][A
 68%|██████▊   | 17/25 [03:10<02:03, 15.39s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:28<00:00, 13.13s/it][A
 40%|████      | 8/20 [47:11<1:11:17, 356.44s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:01,  7.56s/it][A
  8%|▊         | 2/25 [00:28<04:25, 11.53s/it][A
 12%|█▏        | 3/25 [00:35<03:47, 10.34s/it][A
 16%|█▌        | 4/25 [00:43<03:20,  9.56s/it][A
 20%|██        | 5/25 [00:51<02:59,  8.95s/it][A
 24%|██▍       | 6/25 [00:59<02:43,  8.62s/it][A
 28%|██▊       | 7/25 [01:07<02:31,  8.44s/it][A
 32%|███▏      | 8/25 [01:14<02:19,  8.20s/it][A
 36%|███▌      | 9/25 [01:22<02:08,  8.02s/it][A
 40%|████      | 10/25 [01:30<01:59,  7.94s/it][A
 44%|████▍     | 11/25 [01:37<01:49,  7.83s/it][A
 48%|████▊     | 12/25 [01:45<01:40,  7.76s/it][A
 52%|█████▏    | 13/25 [01:52<01:33,  7.75s/it][A
 56%|█████▌    | 14/25 [02:09<01:55, 10.52s/it][A
 60%|██████    | 15/25 [02:27<02:04, 12.49s/it][A
 64%|██████▍   | 16/25 [02:45<02:07, 14.17s/it][A
 68%|██████▊   | 17/25 [03:02<02:01, 15.16s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:19<00:00, 12.77s/it][A
 45%|████▌     | 9/20 [53:01<1:04:59, 354.46s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:07,  7.82s/it][A
  8%|▊         | 2/25 [00:29<04:32, 11.86s/it][A
 12%|█▏        | 3/25 [00:36<03:54, 10.66s/it][A
 16%|█▌        | 4/25 [00:44<03:27,  9.87s/it][A
 20%|██        | 5/25 [00:52<03:04,  9.21s/it][A
 24%|██▍       | 6/25 [01:00<02:46,  8.77s/it][A
 28%|██▊       | 7/25 [01:08<02:32,  8.45s/it][A
 32%|███▏      | 8/25 [01:15<02:18,  8.17s/it][A
 36%|███▌      | 9/25 [01:23<02:08,  8.02s/it][A
 40%|████      | 10/25 [01:31<01:59,  7.98s/it][A
 44%|████▍     | 11/25 [01:38<01:50,  7.88s/it][A
 48%|████▊     | 12/25 [01:46<01:41,  7.79s/it][A
 52%|█████▏    | 13/25 [01:54<01:33,  7.78s/it][A
 56%|█████▌    | 14/25 [02:11<01:56, 10.57s/it][A
 60%|██████    | 15/25 [02:28<02:05, 12.53s/it][A
 64%|██████▍   | 16/25 [02:45<02:05, 13.96s/it][A
 68%|██████▊   | 17/25 [03:02<01:59, 14.91s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:19<00:00, 12.77s/it][A
 50%|█████     | 10/20 [58:51<58:52, 353.25s/it] 
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:01,  7.58s/it][A
  8%|▊         | 2/25 [00:28<04:26, 11.58s/it][A
 12%|█▏        | 3/25 [00:36<03:48, 10.40s/it][A
 16%|█▌        | 4/25 [00:43<03:21,  9.61s/it][A
 20%|██        | 5/25 [00:51<03:00,  9.01s/it][A
 24%|██▍       | 6/25 [00:59<02:43,  8.59s/it][A
 28%|██▊       | 7/25 [01:07<02:34,  8.60s/it][A
 32%|███▏      | 8/25 [01:16<02:28,  8.75s/it][A
 36%|███▌      | 9/25 [01:26<02:22,  8.88s/it][A
 40%|████      | 10/25 [01:35<02:14,  9.00s/it][A
 44%|████▍     | 11/25 [01:42<02:00,  8.57s/it][A
 48%|████▊     | 12/25 [01:50<01:48,  8.37s/it][A
 52%|█████▏    | 13/25 [02:00<01:43,  8.65s/it][A
 56%|█████▌    | 14/25 [02:18<02:08, 11.65s/it][A
 60%|██████    | 15/25 [02:37<02:16, 13.68s/it][A
 64%|██████▍   | 16/25 [02:56<02:19, 15.45s/it][A
 68%|██████▊   | 17/25 [03:14<02:08, 16.07s/it][

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:31<00:00, 13.28s/it][A
 55%|█████▌    | 11/20 [1:04:54<53:24, 356.08s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:01,  7.54s/it][A
  8%|▊         | 2/25 [00:28<04:26, 11.61s/it][A
 12%|█▏        | 3/25 [00:36<03:48, 10.39s/it][A
 16%|█▌        | 4/25 [00:43<03:21,  9.59s/it][A
 20%|██        | 5/25 [00:51<02:59,  8.98s/it][A
 24%|██▍       | 6/25 [00:58<02:42,  8.54s/it][A
 28%|██▊       | 7/25 [01:06<02:29,  8.28s/it][A
 32%|███▏      | 8/25 [01:14<02:17,  8.08s/it][A
 36%|███▌      | 9/25 [01:21<02:06,  7.92s/it][A
 40%|████      | 10/25 [01:29<01:57,  7.86s/it][A
 44%|████▍     | 11/25 [01:37<01:50,  7.92s/it][A
 48%|████▊     | 12/25 [01:45<01:43,  7.96s/it][A
 52%|█████▏    | 13/25 [01:54<01:37,  8.13s/it][A
 56%|█████▌    | 14/25 [02:11<02:00, 10.91s/it][A
 60%|██████    | 15/25 [02:29<02:08, 12.87s/it][A
 64%|██████▍   | 16/25 [02:46<02:08, 14.33s/it][A
 68%|██████▊   | 17/25 [03:03<02:01, 15.14s/it]

  'The parameter `documents` is not used by SimpleTopTokensCoherenceScore'



100%|██████████| 25/25 [05:22<00:00, 12.91s/it][A
 60%|██████    | 12/20 [1:10:47<47:20, 355.08s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:07<03:04,  7.70s/it][A
  8%|▊         | 2/25 [00:29<04:31, 11.79s/it][A
 12%|█▏        | 3/25 [00:36<03:51, 10.50s/it][A
 16%|█▌        | 4/25 [00:44<03:22,  9.65s/it][A
 20%|██        | 5/25 [00:52<03:02,  9.13s/it][A
 24%|██▍       | 6/25 [01:00<02:46,  8.76s/it][A
 28%|██▊       | 7/25 [01:07<02:31,  8.43s/it][A
 32%|███▏      | 8/25 [01:15<02:20,  8.24s/it][A
 36%|███▌      | 9/25 [01:23<02:09,  8.10s/it][A
 40%|████      | 10/25 [01:31<02:01,  8.07s/it][A
 44%|████▍     | 11/25 [01:39<01:51,  7.98s/it][A
 48%|████▊     | 12/25 [01:46<01:42,  7.87s/it][A
 52%|█████▏    | 13/25 [01:54<01:33,  7.83s/it][A
 56%|█████▌    | 14/25 [02:11<01:56, 10.60s/it][A
 60%|██████    | 15/25 [02:28<02:05, 12.56s/it][A
 64%|██████▍   | 16/25 [02:45<02:06, 14.00s/it][A
 68%|██████▊   | 17/25 [03:04<02:01, 15.22s/it]

In [59]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [02:18<55:24, 138.52s/it][A
  8%|▊         | 2/25 [08:28<1:19:45, 208.09s/it][A
 12%|█▏        | 3/25 [10:46<1:08:31, 186.87s/it][A
 16%|█▌        | 4/25 [13:03<1:00:11, 171.96s/it][A
 20%|██        | 5/25 [15:21<53:58, 161.93s/it]  [A
 24%|██▍       | 6/25 [17:38<48:54, 154.45s/it][A
 28%|██▊       | 7/25 [19:55<44:44, 149.13s/it][A
 32%|███▏      | 8/25 [22:13<41:15, 145.60s/it][A
 36%|███▌      | 9/25 [24:34<38:29, 144.35s/it][A
 40%|████      | 10/25 [27:20<37:42, 150.81s/it][A
 44%|████▍     | 11/25 [29:39<34:23, 147.37s/it][A
 48%|████▊     | 12/25 [31:56<31:15, 144.25s/it][A
 52%|█████▏    | 13/25 [34:14<28:27, 142.25s/it][A
 56%|█████▌    | 14/25 [39:27<35:30, 193.64s/it][A
 60%|██████    | 15/25 [44:44<38:26, 230.69s/it][A
 64%|██████▍   | 16/25 [49:59<38:22, 255.84s/it][A
 68%|██████▊   | 17/25 [55:13<36:26, 273.28s/it][A
 72%|███████▏  | 18/25 [1:00:28<33:1

NotImplementedError: 

In [None]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [02:22<56:54, 142.27s/it][A
  8%|▊         | 2/25 [08:47<1:22:29, 215.21s/it][A
 12%|█▏        | 3/25 [11:07<1:10:38, 192.66s/it][A
 16%|█▌        | 4/25 [13:27<1:01:55, 176.91s/it][A
 20%|██        | 5/25 [15:48<55:22, 166.11s/it]  [A
 24%|██▍       | 6/25 [18:09<50:12, 158.54s/it][A
 28%|██▊       | 7/25 [20:30<46:00, 153.38s/it][A
 32%|███▏      | 8/25 [22:51<42:20, 149.46s/it][A
 36%|███▌      | 9/25 [25:12<39:11, 146.96s/it][A
 40%|████      | 10/25 [27:32<36:13, 144.89s/it][A
 44%|████▍     | 11/25 [29:53<33:30, 143.58s/it][A
 48%|████▊     | 12/25 [32:14<30:58, 142.96s/it][A
 52%|█████▏    | 13/25 [34:35<28:26, 142.21s/it][A
 56%|█████▌    | 14/25 [39:55<35:52, 195.69s/it][A
 60%|██████    | 15/25 [45:15<38:51, 233.13s/it][A
 64%|██████▍   | 16/25 [50:36<38:55, 259.46s/it][A
 68%|██████▊   | 17/25 [56:05<37:21, 280.25s/it][A
 72%|███████▏  | 18/25 [1:01:25<34:0

In [185]:
%%time

optimizer.search_for_optimum(dataset)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/32 [00:00<?, ?it/s][A
  3%|▎         | 1/32 [01:58<1:01:22, 118.79s/it][A
  6%|▋         | 2/32 [07:21<1:29:55, 179.86s/it][A
  9%|▉         | 3/32 [09:22<1:18:30, 162.44s/it][A
 12%|█▎        | 4/32 [11:21<1:09:39, 149.25s/it][A
 16%|█▌        | 5/32 [13:20<1:03:05, 140.20s/it][A
 19%|█▉        | 6/32 [15:19<57:57, 133.74s/it]  [A
 22%|██▏       | 7/32 [17:17<53:50, 129.22s/it][A
 25%|██▌       | 8/32 [19:16<50:22, 125.94s/it][A
 28%|██▊       | 9/32 [21:14<47:26, 123.76s/it][A
 31%|███▏      | 10/32 [23:13<44:49, 122.24s/it][A
 34%|███▍      | 11/32 [25:11<42:20, 120.96s/it][A
 38%|███▊      | 12/32 [27:09<39:58, 119.93s/it][A
 41%|████      | 13/32 [29:07<37:49, 119.45s/it][A
 44%|████▍     | 14/32 [33:29<48:43, 162.41s/it][A
 47%|████▋     | 15/32 [37:50<54:23, 191.95s/it][A
 50%|█████     | 16/32 [42:18<57:12, 214.56s/it][A
 53%|█████▎    | 17/32 [46:42<57:22, 229.50s/it][A
 56%|█████▋    | 18/32 [51:06<55

KeyboardInterrupt: 

In [None]:
! ls $optimizer._topic_bank._path

model_0__phi.bin	   model_4__phi.bin	      model_8__phi.bin
model_0__topic_scores.bin  model_4__topic_scores.bin  model_8__topic_scores.bin
model_1__phi.bin	   model_5__phi.bin	      model_9__phi.bin
model_1__topic_scores.bin  model_5__topic_scores.bin  model_9__topic_scores.bin
model_2__phi.bin	   model_6__phi.bin	      topics.bin
model_2__topic_scores.bin  model_6__topic_scores.bin  topic_scores.bin
model_3__phi.bin	   model_7__phi.bin
model_3__topic_scores.bin  model_7__topic_scores.bin


## Appendix. Making vocab.txt for Computing Coocs

In [182]:
! rm $DATASET_INTERNALS_FOLDER_PATH/dict.dict

In [153]:
dictionary = dataset.get_dictionary()

In [154]:
dictionary

artm.Dictionary(name=19a09c5f-e84a-43e7-bbe1-de8dde814c40, num_entries=110587)

In [155]:
dataset._data.shape

(20291, 3)

In [156]:
# PN: dictionary.filter(min_df_rate=0.04, max_df_rate=0.5)
# Reuters: dictionary.filter(min_df_rate=0.01, max_df_rate=0.5)
# Brown: dictionary.filter(min_df_rate=0.02, max_df_rate=0.5)
# 20 NG: dictionary.filter(min_df_rate=0.01, max_df_rate=0.5)

# AG dictionary.filter(min_df_rate=0.002, max_df_rate=0.5)

# Watan2004
dictionary.filter(min_df_rate=0.01, max_df_rate=0.5)

print(dictionary)

artm.Dictionary(name=19a09c5f-e84a-43e7-bbe1-de8dde814c40, num_entries=4062)


In [157]:
dictionary_file_path = os.path.join(
    DATASET_INTERNALS_FOLDER_PATH,
    'dict.dict',
)

dictionary.save_text(dictionary_file_path)

In [158]:
dictionary

artm.Dictionary(name=19a09c5f-e84a-43e7-bbe1-de8dde814c40, num_entries=4062)

In [159]:
lines = open(dictionary_file_path, 'r').readlines()

In [160]:
len(lines)

4064

In [161]:
lines[:3]

['name: 19a09c5f-e84a-43e7-bbe1-de8dde814c40 num_items: 20291\n',
 'token, class_id, token_value, token_tf, token_df\n',
 'هرير, @word, 6.949519593035802e-05, 593.0, 417.0\n']

In [162]:
lines[-3:]

['معالي, @word, 0.0005515082739293575, 4706.0, 1598.0\n',
 'طريق, @word, 0.0010923519730567932, 9321.0, 4924.0\n',
 'سكن, @word, 6.129171379143372e-05, 523.0, 420.0\n']

In [163]:
vocab_text = ''

for line in lines[2:]:
    token, modality, _, _, _ = line.strip().split(', ')
    vocab_text += f'{token} {modality}\n'

In [164]:
vocab_file_path = os.path.join(
    DATASET_INTERNALS_FOLDER_PATH,
    'vocab.txt',
)

In [165]:
with open(vocab_file_path, 'w') as f:
    f.write(vocab_text)

In [None]:
! ~/Workspace/bigartm/build/bin/bigartm \
    -c vw.txt \
    -v vocab.txt \
    --cooc-window 10 \
    --cooc-min-tf 2 \
    --write-cooc-tf cooc_tf_ \
    --cooc-min-df 2 \
    --write-cooc-df cooc_df_ \
    --write-ppmi-tf ppmi_tf_ \
    --write-ppmi-df ppmi_df_

In [166]:
def transform_coocs_file(source_file_path, target_file_path, vocab_file_path):
    """
    source_file is assumed to be either ppmi_tf_ or ppmi_df_ 
    """
    num_times_to_log = 10

    vocab = open(vocab_file_path, 'r').readlines()
    vocab = [l.strip().split()[0] for l in vocab]
    
    cooc_values = dict()
    word_word_value_triples = set()
    
    lines = open(source_file_path, 'r').readlines()
    
    for i, l in enumerate(lines):
        if i % (len(lines) // num_times_to_log) == 0:
            print(f'{i:6d} lines out of {len(lines)}')
        
        l = l.strip()
        words = l.split()
        words = words[1:]  # exclude modality
        anchor_word = words[0]
        
        other_word_values = words[1:]
        
        for word_and_value in other_word_values:
            other_word, value = word_and_value.split(':')
            value = float(value)
            
            cooc_values[(anchor_word, other_word)] = value
            cooc_values[(other_word, anchor_word)] = value  # if assume cooc values to be symmetric
            
            word_word_value_triples.add(
                tuple([
                    tuple(sorted([
                        vocab.index(anchor_word),
                        vocab.index(other_word)
                    ])),
                    value
                ])
            )
    
    new_text = ''
    
    for (w1, w2), v in word_word_value_triples:
        new_text += f'{w1} {w2} {v}\n'
    
    with open(target_file_path, 'w') as f:
        f.write(''.join(new_text))
    
    return cooc_values

In [167]:
def prefix_path(p):
    return os.path.join(DATASET_INTERNALS_FOLDER_PATH, p)

In [168]:
COOC_VALUES = transform_coocs_file(
    prefix_path('ppmi_tf_'),
    prefix_path('new_ppmi_tf_'),
    prefix_path('vocab.txt')
)

     0 lines out of 4058
   405 lines out of 4058
   810 lines out of 4058
  1215 lines out of 4058
  1620 lines out of 4058
  2025 lines out of 4058
  2430 lines out of 4058
  2835 lines out of 4058
  3240 lines out of 4058
  3645 lines out of 4058
  4050 lines out of 4058


In [169]:
with open(prefix_path('cooc_values.json'), 'w') as f:
    f.write(json.dumps(list(COOC_VALUES.items())))

In [320]:
saved_raw_cooc_values = json.loads(open(prefix_path('cooc_values.json'), 'r').read())

saved_cooc_values = {
    tuple(d[0]): d[1] for d in saved_raw_cooc_values
}

In [321]:
COOC_VALUES = saved_cooc_values

In [170]:
list(COOC_VALUES.items())[:20]

[(('هرير', 'تمني'), 1.82768),
 (('تمني', 'هرير'), 1.82768),
 (('هرير', 'كبر'), 0.581746),
 (('كبر', 'هرير'), 0.581746),
 (('هرير', 'داود'), 2.47035),
 (('داود', 'هرير'), 2.47035),
 (('هرير', 'تلق'), 2.12148),
 (('تلق', 'هرير'), 2.12148),
 (('هرير', 'بدو'), 0.443265),
 (('بدو', 'هرير'), 0.443265),
 (('هرير', 'سعيد'), 0.706545),
 (('سعيد', 'هرير'), 0.706545),
 (('هرير', 'ملائك'), 2.00346),
 (('ملائك', 'هرير'), 2.00346),
 (('هرير', 'اني'), 1.07549),
 (('اني', 'هرير'), 1.07549),
 (('هرير', 'تفتيش'), 2.54284),
 (('تفتيش', 'هرير'), 2.54284),
 (('هرير', 'مر'), 1.10353),
 (('مر', 'هرير'), 1.10353)]

In [171]:
len(COOC_VALUES)

3514220

In [148]:
len(COOC_VALUES)

551060