# Model Training - 20 Newsgroups dataset

## Importing libraries

In [1]:
import pickle
from joblib import dump
from gensim import corpora, models

## Load training data

In [2]:
# NOTE(review): this section is titled "Load training data" and the author map
# loaded in the next cell is the "-train" one, but this file name says
# "test_data" — confirm this is the intended split.
preprocessed_text_path = "preprocessed_text/test_data7532"

# Read the pickled, already-preprocessed documents back into memory.
# Unpickling can execute arbitrary code, so only load files you produced yourself.
with open(f"{preprocessed_text_path}.pkl", 'rb') as docs_file:
    preprocessed_docs = pickle.load(docs_file)

In [3]:
# Location (without extension) of the pickled author mapping for the training split.
author2doc_path = "preprocessed_text/author2doc-train"

# Restore the mapping; it is later passed to the model as `authors`.
# Unpickling can execute arbitrary code, so only load files you produced yourself.
with open(f"{author2doc_path}.pkl", 'rb') as mapping_file:
    author2doc = pickle.load(mapping_file)

## Create Dictionary and Corpus
Create a dictionary and a Bag of Words (BoW) representation of the corpus, then re-weight the BoW corpus with TF-IDF.

In [4]:
# Build the token <-> id vocabulary from the preprocessed documents.
dictionary = corpora.Dictionary(documents=preprocessed_docs)

In [5]:
# Prune the vocabulary in place using gensim's `filter_extremes` thresholds:
# `no_below` is a minimum document count, `no_above` a maximum fraction of documents.
min_doc_count = 15
max_doc_fraction = 0.5
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

In [6]:
# Convert every preprocessed document to its bag-of-words form using the pruned dictionary.
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))

In [7]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

In [8]:
# Apply the fitted TF-IDF model to the BoW corpus; this re-weighted corpus is
# what gets passed to the model as `corpus` further down.
# NOTE(review): presumably gensim returns a lazily evaluated wrapper here rather
# than a list — confirm, since it would be re-computed on every pass over it.
corpus_tfidf = tfidf[bow_corpus]

## Train Model

In [9]:
from pathlib import Path
import os
import sys

In [10]:
# Put the parent of the current working directory on the import path so the
# `import model` in the next cell can resolve (presumably `model` lives there).
# Adjust the number of `.parent` hops to match how deeply this notebook is nested.
root_path = str(Path.cwd().resolve().parent)

# Guard the append so re-running this cell does not pile up duplicate entries.
if root_path not in sys.path:
    sys.path.append(root_path)

In [11]:
import model

In [12]:
# Initial values for alpha, beta, a and b; edit these to override the model defaults.
alpha_init, beta_init = 0.1, 0.1
a_init, b_init = 0.1, 0.1

# Number of topics the model is asked to learn.
num_topics = 20

In [13]:
# Construct the ABLIMA model from the corpus, vocabulary, author mapping and
# the initial values chosen in the previous cell.
# NOTE(review): the corpus handed over is the TF-IDF-weighted one, not the raw
# BoW counts — confirm ABLIMA is meant to consume weighted input.
init_values = dict(
    alpha_init=alpha_init,
    beta_init=beta_init,
    a_init=a_init,
    b_init=b_init,
)
trained_model = model.ABLIMA(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
    authors=author2doc,
    **init_values,
)

In [14]:
# Train the model with Gibbs sampling.
# The captured output below shows roughly 145 s per iteration, so the full
# run is on the order of 8 hours.
num_iterations = 200
trained_model.gibbs_sampling(iterations=num_iterations)

iteration: 1
Time : 226.7683 seconds 

iteration: 2
Time : 148.6863 seconds 

iteration: 3
Time : 144.0710 seconds 

iteration: 4
Time : 147.3567 seconds 

iteration: 5
Time : 149.2424 seconds 

iteration: 6
Time : 146.4978 seconds 

iteration: 7
Time : 145.8733 seconds 

iteration: 8
Time : 144.5982 seconds 

iteration: 9
Time : 145.6026 seconds 

iteration: 10
Time : 144.6232 seconds 

iteration: 11
Time : 145.3878 seconds 

iteration: 12
Time : 143.9943 seconds 

iteration: 13
Time : 143.5315 seconds 

iteration: 14
Time : 143.3523 seconds 

iteration: 15
Time : 144.3344 seconds 

iteration: 16
Time : 144.5447 seconds 

iteration: 17
Time : 144.5858 seconds 

iteration: 18
Time : 143.8310 seconds 

iteration: 19
Time : 143.5389 seconds 

iteration: 20
Time : 143.9356 seconds 

iteration: 21
Time : 143.6863 seconds 

iteration: 22
Time : 143.8888 seconds 

iteration: 23
Time : 144.9950 seconds 

iteration: 24
Time : 144.7465 seconds 

iteration: 25
Time : 143.8804 seconds 

iteration

## Store model

In [15]:
# Base name (no extension) shared by the pickle and joblib files written in the cells below.
# The "200" mirrors the `iterations` value passed to `gibbs_sampling` above; keep the two in sync.
model_file_name = "ablima_model_200_iteration"

In [16]:
# Persist the trained model with pickle.
# Create the output directory first: without it, a missing folder would raise
# at save time and discard the result of the long training run.
os.makedirs("trained_model", exist_ok=True)
with open(f"trained_model/{model_file_name}.pkl", 'wb') as f:
    pickle.dump(trained_model, f)

In [17]:
# Also save a joblib copy next to the pickle; the call stays the last
# expression so the cell still displays the list of written file names.
joblib_path = f"trained_model/{model_file_name}.joblib"
dump(trained_model, joblib_path)

['trained_model/ablima_model_200_iteration.joblib']