# NumTopics: PScience

In [1]:
import time

from topicnet.cooking_machine.models import TopicModel
from topicnet.cooking_machine.model_constructor import init_simple_default_model
from topicnet.cooking_machine.dataset import Dataset

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import sys

sys.path.insert(0, '..')

from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection
from topnum.search_methods.optimize_scores_method import OptimizeScoresMethod


## Data

In [4]:
PATH = './wiki_ref220_vw.txt'
BATCHES = './WR2_batches'


In [5]:
PATH = '/data_mil/datasets/Post_Science/PScience.csv'
BATCHES = '/home/vbulatov/Projects/OptimalNumberOfTopics/demos/PS_batches'

main_modality_name = "@word"
modalities = {"@word": 1}

dataset = Dataset(PATH, batch_vectorizer_path=BATCHES)


  'Parameter name `batch_vectorizer_path` is obsolete,'


In [6]:
VW_PATH =  '/home/vbulatov/Projects/OptimalNumberOfTopics/demos/PS_vw.txt'
dataset.write_vw(VW_PATH)

In [7]:
text_collection = VowpalWabbitTextCollection(
    file_path=VW_PATH,
    main_modality=main_modality_name,
    modalities=modalities
)

## Experiments

In [8]:

min_num_topics = 5
max_num_topics = 30

num_topics_interval = 1
num_fit_iterations = 30
num_restarts = 3

In [9]:
from topnum.scores import (
    SpectralDivergenceScore, CalinskiHarabaszScore, DiversityScore, EntropyScore,
    HoldoutPerplexityScore, IntratextCoherenceScore, 
    LikelihoodBasedScore, PerplexityScore, SilhouetteScore,
    SparsityPhiScore, SparsityThetaScore,
    
    SimpleTopTokensCoherenceScore, SophisticatedTopTokensCoherenceScore
)

In [10]:


def build_every_score(dataset):
    scores = [
        SpectralDivergenceScore("arun", dataset, [main_modality_name]),
        PerplexityScore("perp"),
        SparsityPhiScore("sparsity_phi"), SparsityThetaScore("sparsity_theta")
    ]
    
    # TODO: actually perform a train-test split here
    # HoldoutPerplexityScore("holdout_perp", dataset),  
    
    # TODO: and this
    '''
        coherences = [
        IntratextCoherenceScore("intracoh", dataset),
        SimpleTopTokensCoherenceScore(),
        SophisticatedTopTokensCoherenceScore
    ]

    '''
    
    likelihoods = [
        LikelihoodBasedScore(
            f"{mode}_sparsity_{flag}", validation_dataset=dataset, modality=main_modality_name,
            mode=mode, consider_sparsity=flag
        )
        for mode in ["AIC", "BIC", "MDL"] for flag in [True, False]
    ]
    
    renyi_variations = [
        EntropyScore(f"renyi_{threshold_factor}", threshold_factor=threshold_factor)
        for threshold_factor in [0.5, 1, 2]
    ]
    clustering = [
        CalinskiHarabaszScore("calhar", dataset), SilhouetteScore("silh", dataset)
    ]
    diversity = [
        DiversityScore(f"diversity_{metric}", metric=metric)
        for metric in ["euclidean", 'jensenshannon', 'cosine', 'hellinger']
    ]
    
    return scores + diversity + clustering + renyi_variations + likelihoods

In [11]:
from topnum.model_constructor import KNOWN_MODELS
import warnings

KNOWN_MODELS

['LDA', 'PLSA', 'sparse', 'decorrelation', 'ARTM']

In [12]:
warnings.filterwarnings("ignore", category=UserWarning)

for model_family in KNOWN_MODELS:

    optimizer = OptimizeScoresMethod(
        scores=build_every_score(dataset),
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        num_topics_interval=num_topics_interval,
        num_fit_iterations=num_fit_iterations,
        num_restarts=num_restarts,
        model_family=model_family,
        experiment_name="num_topics_" + model_family,
        experiment_directory="PScience_experiments"
    )
    t_start = time.time()

    optimizer.search_for_optimum(text_collection)
    t_end = time.time()
    print(model_family, (t_end - t_start) / 60)


100%|██████████| 3/3 [41:40<00:00, 833.40s/it]


AttributeError: '_SpectralDivergenceScore' object has no attribute '_dataset_file_path'

In [None]:
%load_ext autoreload
%autoreload 2

from topnum.search_methods.optimize_scores_method import load_models_from_disk



In [None]:
result, detailed_resut = load_models_from_disk('PScience_experiments', 'num_topics_search_every_score_PLSA')

In [None]:
plt.plot(detailed_resut['calhar'].T)
plt.show()

In [None]:
plt.plot(detailed_resut['arun'].T)
plt.show()

In [None]:
plt.plot(detailed_resut['renyi_1'].T)
plt.show()

In [None]:
plt.plot(detailed_resut['diversity_cosine'].T)
plt.show()

In [None]:
plt.plot(detailed_resut['diversity_euclidean'].T)
plt.show()