In [1]:
# Show INFO-level log records in the notebook output. force=True makes
# basicConfig replace any handlers already attached to the root logger.
import logging
logging.basicConfig(force=True, level=logging.INFO)

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Preprocessor

In [2]:
from lqrz.bayesian_mixture_model_pos_tagger.get_data.preprocessor import Preprocessor

In [3]:
# Wildcard path to the raw .txt input files, relative to the notebook's
# working directory.
# NOTE(review): presumably Preprocessor expands the `*` itself — confirm.
file_path: str = '../../data/raw/*.txt'
preprocessor = Preprocessor()
# Run preprocessing on `file_path`; the same `preprocessor` object is used
# afterwards to write its outputs to disk via save_outputs().
preprocessor.preprocess(file_path=file_path)

In [4]:
# Output directory for the preprocessor (relative to the notebook's working
# directory). The trainer section builds its input file paths from this same
# variable, so keep the two in sync.
path_output_preprocessor: str = 'outputs/preprocessor'
preprocessor.save_outputs(output_path=path_output_preprocessor)

# Trainer

In [5]:
from lqrz.bayesian_mixture_model_pos_tagger.train.gibbs_sampler import GibbsSampler
import joblib

## Instantiate

In [6]:
# Number of classes handed to the sampler.
n_classes = 10

# Left/right word-type count files, located in the preprocessor output directory.
# NOTE(review): presumably written by preprocessor.save_outputs() — confirm the file names.
path_wordtype_counts_right = f'{path_output_preprocessor}/x_wordtype_counts_right.joblib'
path_wordtype_counts_left = f'{path_output_preprocessor}/x_wordtype_counts_left.joblib'

# Build the sampler from the two count files and the class count.
sampler = GibbsSampler.instantiate(
    n_classes=n_classes,
    path_wordtype_counts_left=path_wordtype_counts_left,
    path_wordtype_counts_right=path_wordtype_counts_right,
)

# Bare expression: displays the sampler's repr as the cell output.
sampler

<lqrz.bayesian_mixture_model_pos_tagger.train.gibbs_sampler.GibbsSampler at 0x103c6f340>

## Run

In [7]:
# Prior hyperparameters passed straight through to sampler.run().
# NOTE(review): presumably Dirichlet concentration parameters — confirm against GibbsSampler.run.
alpha = .1
beta_left = beta_right = .5

# Iteration schedule. These are very small values (3 iterations, burn-in 1,
# thinning 1); raise them for a real run.
n_iterations, n_burn_in, n_thinning = 3, 1, 1

# NOTE(review): no random seed is set anywhere in the visible cells; if
# GibbsSampler draws from a global RNG the results will differ between runs —
# confirm and seed explicitly if so.
# Binding the result to `_` keeps the return value out of the cell output.
_ = sampler.run(
    n_iterations=n_iterations,
    n_burn_in=n_burn_in,
    n_thinning=n_thinning,
    alpha=alpha,
    beta_left=beta_left,
    beta_right=beta_right,
)

INFO:root:Iteration: 0 log_prob: -3813529.59
INFO:root:Iteration: 1 log_prob: -3759045.23
INFO:root:Iteration: 2 log_prob: -3740906.53


In [11]:
# Per-word-type posterior entropy, displayed as the cell output; the result is
# expected to be a 1-D array of shape (n_wordtypes,).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so it was run out of order — re-run the notebook top to bottom
# to confirm the displayed values.
sampler.compute_word_type_posterior_entropy() # (n_wordtypes,)

array([2.48679190e-10, 2.48679190e-10, 2.48679190e-10, ...,
       6.93147181e-01, 6.93147181e-01, 6.93147181e-01])

In [8]:
# Output directory for the trained sampler's results (relative to the
# notebook's working directory).
path_output_train: str = 'outputs/train'
# Binding the result to `_` keeps the return value out of the cell output
# (every bare expression is displayed because ast_node_interactivity = "all").
_ = sampler.save_outputs(output_path=path_output_train)