In [1]:
from data import * 
from lda_trial import *

from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
# Build the two IMDB dataset objects used by the cells below.
# NOTE(review): `data_limit=20_000` presumably caps how many training examples
# are loaded -- confirm against `IMDBDataset` in data.py.
trainset = IMDBDataset('train', data_limit=20_000)
# The 'valid' split is created without a `data_limit` argument.
validset = IMDBDataset('valid')

In [None]:
# Upper bound on the number of training documents handed to the topic model.
n = 20000

# Candidate topic counts to evaluate (currently a single value).
for k in [2]:
    print(f'{k}.')
    # Fit a k-topic LDA model on the training set. In the `tp_one_trial`
    # signature defined in this notebook, the two positional values after `n`
    # are `min_cf` and `rm_top`.
    trained_model, final_metric = tp_one_trial(trainset, 'lda', k, n,
                                               3, 5,  # min_cf, rm_top
                                               max_iter=20, stop_increase=5, metric='ll')

    # Fit a linear SVM on what `load_LDA_data_batch` returns for the training set.
    lda_x, lda_y = load_LDA_data_batch(trained_model, trainset)
    model = LinearSVC()
    model.fit(lda_x, lda_y)

    # Accuracy on the data the classifier was fitted on.
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    prediction = model.predict(lda_x)
    ground_truth = lda_y
    print(f'Train: {100*accuracy_score(ground_truth, prediction):6.5f}%')

    # Accuracy on the held-out split (`validset`), reusing the fitted topic
    # model and classifier.
    lda_x, lda_y = load_LDA_data_batch(trained_model, validset)
    prediction = model.predict(lda_x)
    ground_truth = lda_y
    print(f'Test: {100*accuracy_score(ground_truth, prediction):6.5f}%')
    print('-' * 100)

In [6]:
import argparse, os, time, pickle

import tomotopy as tp
import numpy as np
import matplotlib.pyplot as plt

from data import *
from util import *


def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
             max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1, metric='ll'):
    """Train one tomotopy topic model on a prefix of `dataset`, with early stopping.

    Args:
        dataset: Sized, indexable collection; `dataset[i]` must unpack into a
            `(doc, label)` pair. `doc` is passed to `model.add_doc`.
        model_type: One of 'lda', 'ctm', 'slda', 'hdp'.
        topic_size: Passed as `k` to the model constructor (`initial_k` for 'hdp').
        sample_size: Upper bound on the number of documents used; the first
            `min(sample_size, len(dataset))` items of `dataset` are added.
        min_cf: Forwarded to the model constructor.
        rm_top: Forwarded to the model constructor.
        max_iter: Base iteration budget. The effective budget is
            `max_iter * sample_size * topic_size // 2000`, so it grows with the
            sample size and the number of topics.
        min_iter: Iterations run before monitoring starts. Defaults to one
            fifth of the effective budget.
        checkpoint: Number of monitored iterations averaged into one
            evaluation window. Defaults to one fifth of the effective budget,
            but never less than 1.
        stop_increase: Training stops once this many windows have had a mean
            metric below the best window mean seen so far.
        metric: 'll' (log-likelihood per word) or 'pp' (perplexity, negated
            internally so that larger is always better).

    Returns:
        Tuple `(model, final_metric)`, where `final_metric` is
        `model.perplexity` when `metric == 'pp'` and `model.ll_per_word`
        otherwise.

    Raises:
        AssertionError: If `model_type` or `metric` is not a supported value.
    """
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'

    # Constructor arguments shared by every model type.
    shared_kwargs = dict(tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, **shared_kwargs)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, **shared_kwargs)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars="b", **shared_kwargs)
    else:  # 'hdp' -- guaranteed by the assertion above
        model = tp.HDPModel(initial_k=topic_size, **shared_kwargs)

    sample_size = min(sample_size, len(dataset))

    # Ensure the number of iterations increases with the size of the sample.
    max_iter = max_iter * sample_size * topic_size // 2000
    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    for doc_idx in range(sample_size):
        doc, label = dataset[doc_idx]
        if model_type == "slda":
            # The supervised model also receives the label as a response value.
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        # A window of 0 iterations would make `i % checkpoint` divide by zero
        # whenever the scaled budget is below 5, so use at least 1.
        checkpoint = max(1, max_iter // 5)

    # Warm-up iterations that are not monitored.
    model.train(min_iter)

    best_metric = float('-inf')  # `np.infty` no longer exists in NumPy >= 2.0
    stop_increase_cnt = 0
    window_sum = 0.
    iters_done = 0  # monitored iterations actually executed
    for i in range(1, max_iter + 1):
        model.train(1)
        iters_done = i
        # The monitored metric is always "larger is better".
        if metric == 'll':
            window_sum += model.ll_per_word
        if metric == 'pp':
            window_sum += - model.perplexity  # smaller perplexity is better.

        if i % checkpoint == 0:
            window_mean = window_sum / checkpoint
            # The printed value is the window mean of the monitored metric.
            print(f'Current loss: {window_mean:.5f}')
            if window_mean >= best_metric:
                best_metric = window_mean
            else:
                stop_increase_cnt += 1
            window_sum = 0.

        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word

    # Report warm-up plus monitored iterations. A dedicated counter is used so
    # the figure is still right when the monitored loop never runs.
    print(f'Trial iterations: {iters_done + min_iter}.')
    return model, final_metric

In [14]:
# HDP trial on at most 5000 training documents, starting from 2 topics.
n, k = 5000, 2
trained_model, final_metric = tp_one_trial(
    trainset, 'hdp', k, n,
    min_cf=3, rm_top=5,
    max_iter=20, stop_increase=5, metric='ll',
)
# Show the trained model's `live_k` attribute.
print(trained_model.live_k)

Current loss: -7.57064
Current loss: -7.55045
Current loss: -7.53612
Current loss: -7.52850
Current loss: -7.52232
Trial iterations: 120.
10


In [12]:
# HDP trial on at most 5000 training documents, starting from 3 topics.
n, k = 5000, 3
trained_model, final_metric = tp_one_trial(
    trainset, 'hdp', k, n,
    min_cf=3, rm_top=5,
    max_iter=20, stop_increase=5, metric='ll',
)
# Show the trained model's `live_k` attribute.
print(trained_model.live_k)

Current loss: -7.64659
Current loss: -7.62968
Current loss: -7.61834
Current loss: -7.61085
Current loss: -7.60513
Trial iterations: 180.
18


In [13]:
# HDP trial on at most 5000 training documents, starting from 5 topics.
n, k = 5000, 5
trained_model, final_metric = tp_one_trial(
    trainset, 'hdp', k, n,
    min_cf=3, rm_top=5,
    max_iter=20, stop_increase=5, metric='ll',
)
# Show the trained model's `live_k` attribute.
print(trained_model.live_k)

Current loss: -7.58315
Current loss: -7.54795
Current loss: -7.53424
Current loss: -7.52699
Current loss: -7.52207
Trial iterations: 300.
16


In [None]:
# HDP trial on at most 5000 training documents, starting from 10 topics.
n, k = 5000, 10
trained_model, final_metric = tp_one_trial(
    trainset, 'hdp', k, n,
    min_cf=3, rm_top=5,
    max_iter=20, stop_increase=5, metric='ll',
)
# Show the trained model's `live_k` attribute.
print(trained_model.live_k)

Current loss: -7.55773
Current loss: -7.54378
Current loss: -7.53730
