In [4]:
import pandas as pd
import numpy as np
import re
from scipy.special import psi  # gamma function utils

In [5]:
# Sample data for analysis
d1 = "Java is a language for programming that develops a software for several platforms. A compiled code or bytecode on Java application can run on most of the operating systems including Linux, Mac operating system, and Linux. Most of the syntax of Java is derived from the C++ and C languages."
d2 = "Python supports multiple programming paradigms and comes up with a large standard library, paradigms included are object-oriented, imperative, functional and procedural."
d3 = "Go is typed statically compiled language. It was created by Robert Griesemer, Ken Thompson, and Rob Pike in 2009. This language offers garbage collection, concurrency of CSP-style, memory safety, and structural typing."
d4 = "A young girl when she first visited magical Underland, Alice Kingsleigh (Mia Wasikowska) is now a teenager with no memory of the place -- except in her dreams."
d5 = "Her life takes a turn for the unexpected when, at a garden party for her fiance and herself, she spots a certain white rabbit and tumbles down a hole after him. Reunited with her friends the Mad Hatter (Johnny Depp), the Cheshire Cat and others, Alice learns it is her destiny to end the Red Queen's (Helena Bonham Carter) reign of terror."


In [6]:
## Utils and Helper Class

def tf(docs):
    """
    This function is used to calculate the document-term matrix and id2word mapping
    """
    # Clean up the text
    docsc_clean = {}
    total_term = []
    for key, val in enumerate(docs):
        val_clean = re.findall(r'[a-z]+', val.lower())
        docsc_clean[f'd{key}'] = val_clean
        total_term += val_clean

    total_term_unique = sorted(set(total_term))
    id2word = {idx: word for  idx, word in enumerate(total_term_unique)}

    # Count the number of occurrences of term i in document j
    for key, val in docsc_clean.items():
        word_dir = dict.fromkeys(total_term_unique, 0)
        for word in val:
            word_dir[word] += 1
        docsc_clean[key] = word_dir

    tf_df = pd.DataFrame.from_dict(docsc_clean, orient='index')

    return tf_df, id2word

def dirichlet_expectation(sstats):
    if len(sstats.shape) == 1:
        return psi(sstats) - psi(np.sum(sstats))
    else:
        return psi(sstats) - psi(np.sum(sstats, 1))[:, np.newaxis]

class LdaState:
    def __init__(self, eta, shape, dtype=np.float32):
        """
        Parameters
        ----------
        eta : numpy.ndarray
            The prior probabilities assigned to each term.
        shape : tuple of (int, int)
            Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
        dtype : type
            Overrides the numpy array default types.

        """
        self.eta = eta.astype(dtype, copy=False)
        self.sstats = np.zeros(shape, dtype=dtype)
        self.numdocs = 0
        self.dtype = dtype

    def get_lambda(self):
        """Get the parameters of the posterior over the topics, also referred to as "the topics".

        Returns
        -------
        numpy.ndarray
            Parameters of the posterior probability over topics.

        """
        return self.eta + self.sstats

    def get_Elogbeta(self):
        """Get the log (posterior) probabilities for each topic.

        Returns
        -------
        numpy.ndarray
            Posterior probabilities for each topic.
        """
        return dirichlet_expectation(self.get_lambda())

    def blend(self, rhot, other, targetsize=None):
        """Merge the current state with another one using a weighted average for the sufficient statistics.

        The number of documents is stretched in both state objects, so that they are of comparable magnitude.
        This procedure corresponds to the stochastic gradient update from
        `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
        <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_, see equations (5) and (9).

        Parameters
        ----------
        rhot : float
            Weight of the `other` state in the computed average. A value of 0.0 means that `other`
            is completely ignored. A value of 1.0 means `self` is completely ignored.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        """
        assert other is not None
        if targetsize is None:
            targetsize = self.numdocs

        # stretch the current model's expected n*phi counts to target size
        if self.numdocs == 0 or targetsize == self.numdocs:
            scale = 1.0
        else:
            scale = 1.0 * targetsize / self.numdocs
        self.sstats *= (1.0 - rhot) * scale

        # stretch the incoming n*phi counts to target size
        if other.numdocs == 0 or targetsize == other.numdocs:
            scale = 1.0
        else:
            scale = 1.0 * targetsize / other.numdocs
        self.sstats += rhot * scale * other.sstats
        self.numdocs = targetsize



In [7]:
def my_lda_func(corpus, num_topics, id2word, random_state=10,  passes=1, num_words=10,
                iterations=50, gamma_threshold=0.001, dtype=np.float32):
    num_terms = len(id2word)

    alpha = np.array( [1.0 / num_topics for i in range(num_topics)], dtype=dtype)

    eta = np.array( [1.0 / num_topics for i in range(num_terms)], dtype=dtype)

    rand  = np.random.RandomState(random_state)

    model_states = LdaState(eta, (num_topics, num_terms), dtype=dtype)
    model_states.sstats = rand.gamma(100., 1. / 100., (num_topics, num_terms))

    expElogbeta = np.exp(dirichlet_expectation(model_states.sstats))


    # Update
    lencorpus = len(corpus)
    model_states.numdocs += lencorpus
    num_updates = 0

    for pass_ in range(passes):
        other = LdaState(eta, (num_topics, num_terms), dtype=dtype)

        # Do E-step
        if lencorpus > 1:
            print(f'performing inference on a chunk of {lencorpus} documents')
        else:
            raise

        # Initialize the variational distribution q(theta|gamma) for the chunk
        gamma = rand.gamma(100., 1. / 100., (lencorpus, num_topics)).astype(dtype, copy=False)
        tmpElogtheta = dirichlet_expectation(gamma)
        tmpexpElogtheta = np.exp(tmpElogtheta)
        sstats = np.zeros_like(expElogbeta, dtype=dtype)
        converged = 0

        # Now, for each document d update that document's gamma and phi
        epsilon = 1e-7

        for d, doc in enumerate(corpus):
            ids = [idx for idx, _ in doc]
            cts = np.fromiter((cnt for _, cnt in doc), dtype=dtype, count=len(doc))
            gammad = gamma[d, :]
            Elogthetad = tmpElogtheta[d, :]
            expElogthetad = tmpexpElogtheta[d, :]
            expElogbetad = expElogbeta[:, ids]

            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
            # phinorm is the normalizer.
            phinorm = np.dot(expElogthetad, expElogbetad) + epsilon

            for _ in range(iterations):
                lastgamma = gammad
                # We represent phi implicitly to save memory and time.
                # Substituting the value of the optimal phi back into
                # the update for gamma gives this update. Cf. Lee&Seung 2001.
                gammad = alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
                Elogthetad = dirichlet_expectation(gammad)
                expElogthetad = np.exp(Elogthetad)
                phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
                # If gamma hasn't changed much, we're done.
                meanchange = np.mean(np.abs(gammad - lastgamma))
                if meanchange < gamma_threshold:
                    converged += 1
                    break
            gamma[d, :] = gammad
            sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

        if len(corpus) > 1:
            print(f"{converged}/{len(corpus)} documents converged within {iterations} iterations")

        sstats *= expElogbeta

        other.sstats += sstats
        other.numdocs += gamma.shape[0]

        # Do M-step
        print('Update topics')
        previous_Elogbeta = model_states.get_Elogbeta()
        rho = 1
        model_states.blend(rho, other)

        current_Elogbeta = model_states.get_Elogbeta()
        #Propagate the states topic probabilities to the inner object's attribute.
        expElogbeta = np.exp(current_Elogbeta)

        diff = np.mean(np.abs(previous_Elogbeta.ravel() - current_Elogbeta.ravel()))
        print(f"topic diff {diff}")
        num_updates += other.numdocs

    shown = []
    topic = model_states.get_lambda()

    for i in range(num_topics):
        topic_ = topic[i]
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = topic_.argsort()[-num_words:][::-1]

        topic_ = [(id2word[id], topic_[id]) for id in bestn]
        topic_ = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic_)
        shown.append((i, topic_))

    return shown

In [8]:
tf_df, id2word = tf([d1, d2, d3, d4, d5])

lil = []
for row in tf_df.T.values:
    lil_sub = []
    for idx, item in enumerate(row):
        if item:
            lil_sub.append((idx, item))
    lil.append(lil_sub)
    
my_lda_func(corpus=lil, num_topics=2, id2word=id2word, num_words=5)

performing inference on a chunk of 134 documents
129/134 documents converged within 50 iterations
Update topics
topic diff 1.8223607426649084


[(0,
  '0.232*"a" + 0.138*"application" + 0.103*"and" + 0.099*"alice" + 0.088*"after"'),
 (1,
  '0.274*"application" + 0.103*"alice" + 0.061*"and" + 0.049*"a" + 0.045*"after"')]

In [9]:
## Try Gensim

In [10]:
from gensim.models import LdaModel

In [11]:
lda = LdaModel(corpus=lil, num_topics=2, id2word=id2word, random_state=10)
print(lda.print_topics( num_words=5))

[(0, '0.232*"a" + 0.138*"application" + 0.103*"and" + 0.099*"alice" + 0.088*"after"'), (1, '0.274*"application" + 0.103*"alice" + 0.060*"and" + 0.049*"a" + 0.045*"after"')]
