In [1]:
import os
import os.path
import pickle
import time
import shelve

import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec import LDA2Vec

In [2]:
gpu_id = int(os.getenv('CUDA_GPU', 0))
cuda.get_device(gpu_id).use()
print("Using GPU:" + str(gpu_id))

Using GPU:0


In [3]:
#data_dir = os.getenv('data_dir', '../data/')
fn_vocab = 'vocab.pkl'
fn_corpus = 'corpus.pkl'
fn_flatnd = 'flattened.npy'
fn_docids = 'doc_ids.npy'
fn_vectors = 'vectors.npy'
vocab = pickle.load(open(fn_vocab, 'rb'))
corpus = pickle.load(open(fn_corpus, 'rb'))
flattened = np.load(fn_flatnd)
doc_ids = np.load(fn_docids)
vectors = np.load(fn_vectors)

In [4]:
# Model Parameters
# Number of documents
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# 'Strength' of the dircihlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = int(os.getenv('n_topics', 20))
batchsize = 4096
# Power for neg sampling
power = float(os.getenv('power', 0.75))
# Intialize with pretrained word vectors
pretrained = bool(int(os.getenv('pretrained', True)))
# Sampling temperature
temperature = float(os.getenv('temperature', 1.0))
# Number of dimensions in a single word vector
n_units = int(os.getenv('n_units', 300))
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

In [26]:
for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        print(key, val)

_ 11009
_25 11009
__doc__ Automatically created module for IPython interactive environment
__loader__ None
__name__ __main__
__package__ None
__spec__ None
_dh ['/home/xenomorph/projects/onlps/lda2vec/notebooks']
_i len(model.mixture.weights.W)
_i13 serializers.load_npz('my.model', model)
_i14 serializers.load_npz('lda2vec.hdf5', model)
_i15 serializers.load_npz('lda2vec', model)
_i16 serializers.load_hdf5("lda2vec.hdf5")
_i17 serializers.load_hdf5("lda2vec.hdf5"), model)
_i18 serializers.load_hdf5("lda2vec.hdf5", model)
_i19 model
_i20 model
_i21 import pickle
_i22 with open("lda2vec.pkl", "w"):
    pickle.dump(model)
_i23 with open("lda2vec.pkl", "w") as f:
    pickle.dump(model, f)
_i24 with open("lda2vec.pkl", "wb") as f:
    pickle.dump(model, f)
_i25 len(model.mixture.weights.W)
_ii with open("lda2vec.pkl", "wb") as f:
    pickle.dump(model, f)
_iii with open("lda2vec.pkl", "w") as f:
    pickle.dump(model, f)
batchsize 4096
clambda 200.0
d [6535 6535 6535 ... 6535 6535 6535]
doc

# training the model

In [7]:
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature)

In [8]:
if os.path.exists('lda2vec.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
    
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]

In [9]:
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

In [10]:
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')

In [11]:
for epoch in range(1):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for j in range(n_topics):
            print(j, coherence[(j, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    print(epoch)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        model.cleargrads()
        #optimizer.use_cleargrads(use=False)
        l = model.fit_partial(d.copy(), f.copy())
        print("after partial fitting:", l)
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print(msg.format(**logs))
        j += 1
    serializers.save_hdf5("lda2vec.hdf5", model)

Top words in topic 0 galileo easier codes differences professor worse better van calculations complicated
Top words in topic 1 gene dept subscribe nut puck altitude atlanta protein seed login
Top words in topic 2 md exec languages consumer d.c. sf sensitive centris dir commands
Top words in topic 3 wiretap politically 2d armenians politics di al conflicts differences political
Top words in topic 4 puck shareware tyre maxtor header lens der responsibility visit ball
Top words in topic 5 ss isaiah rf apologize mental v. skills arithmetic wolverine situation
Top words in topic 6 criticism koresh intelligence demands replies skepticism theists spiritual teacher atheist
Top words in topic 7 l. transmitted transmit widget expos pilot directory rider jim boost
Top words in topic 8 hung england florida punishment lying california practice 96 baptism arizona
Top words in topic 9 denning glory smokeless netters patches jim ban michael cubs alt
Top words in topic 10 login documentary jury alomar 