In [51]:
import pandas as pd
import numpy as np
import re
from scipy.special import psi  # gamma function utils
from pprint import pprint
import gensim.corpora as corpora
from gensim.corpora import Dictionary
import nltk
from nltk.corpus import reuters
nltk.download('reuters')

[nltk_data] Downloading package reuters to /home/lyann/nltk_data...


True

In [69]:
stop_words = ["would", "could", "said", "u", "us", "also", "may", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [70]:
## Utils and Helper Class

def tf(docs):
    """
    This function is used to calculate the document-term matrix and id2word mapping
    """
    # Clean up the text
    docsc_clean = {}
    total_term = []
    for key, val in enumerate(docs):
        val_clean = re.findall(r'[a-z]+', val.lower())
        val_clean = [i for i in val_clean if i not in stop_words]
        docsc_clean[f'd{key}'] = val_clean
        total_term += val_clean

    total_term_unique = sorted(set(total_term))
    id2word = {idx: word for  idx, word in enumerate(total_term_unique)}

    # Count the number of occurrences of term i in document j
    for key, val in docsc_clean.items():
        word_dir = dict.fromkeys(total_term_unique, 0)
        for word in val:
            word_dir[word] += 1
        docsc_clean[key] = word_dir

    tf_df = pd.DataFrame.from_dict(docsc_clean, orient='index')

    return tf_df, id2word

def dirichlet_expectation(sstats):
    if len(sstats.shape) == 1:
        return psi(sstats) - psi(np.sum(sstats))
    else:
        return psi(sstats) - psi(np.sum(sstats, 1))[:, np.newaxis]
    
    
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

class LdaState:
    def __init__(self, eta, shape, dtype=np.float32):
        """
        Parameters
        ----------
        eta : numpy.ndarray
            The prior probabilities assigned to each term.
        shape : tuple of (int, int)
            Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
        dtype : type
            Overrides the numpy array default types.

        """
        self.eta = eta.astype(dtype, copy=False)
        self.sstats = np.zeros(shape, dtype=dtype)
        self.numdocs = 0
        self.dtype = dtype

    def get_lambda(self):
        """Get the parameters of the posterior over the topics, also referred to as "the topics".

        Returns
        -------
        numpy.ndarray
            Parameters of the posterior probability over topics.

        """
        return self.eta + self.sstats

    def get_Elogbeta(self):
        """Get the log (posterior) probabilities for each topic.

        Returns
        -------
        numpy.ndarray
            Posterior probabilities for each topic.
        """
        return dirichlet_expectation(self.get_lambda())

    def blend(self, rhot, other, targetsize=None):
        """Merge the current state with another one using a weighted average for the sufficient statistics.

        The number of documents is stretched in both state objects, so that they are of comparable magnitude.
        This procedure corresponds to the stochastic gradient update from
        `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
        <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_, see equations (5) and (9).

        Parameters
        ----------
        rhot : float
            Weight of the `other` state in the computed average. A value of 0.0 means that `other`
            is completely ignored. A value of 1.0 means `self` is completely ignored.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        """
        assert other is not None
        if targetsize is None:
            targetsize = self.numdocs

        # stretch the current model's expected n*phi counts to target size
        if self.numdocs == 0 or targetsize == self.numdocs:
            scale = 1.0
        else:
            scale = 1.0 * targetsize / self.numdocs
        self.sstats *= (1.0 - rhot) * scale

        # stretch the incoming n*phi counts to target size
        if other.numdocs == 0 or targetsize == other.numdocs:
            scale = 1.0
        else:
            scale = 1.0 * targetsize / other.numdocs
        self.sstats += rhot * scale * other.sstats
        self.numdocs = targetsize

In [71]:
def my_lda_func(corpus, num_topics, id2word, random_state=10,  passes=1, num_words=10,
                iterations=50, gamma_threshold=0.001, dtype=np.float32,  chunksize=100, topics_only=True, verbose=False):
    num_terms = len(id2word)

    alpha = np.array( [1.0 / num_topics for i in range(num_topics)], dtype=dtype)

    eta = np.array( [1.0 / num_topics for i in range(num_terms)], dtype=dtype)

    rand  = np.random.RandomState(random_state)

    model_states = LdaState(eta, (num_topics, num_terms), dtype=dtype)
    model_states.sstats = rand.gamma(100., 1. / 100., (num_topics, num_terms))

    expElogbeta = np.exp(dirichlet_expectation(model_states.sstats))


    # Update
    lencorpus = len(corpus)
    chunksize = min(lencorpus, chunksize)
    model_states.numdocs += lencorpus
    num_updates = 0

    for pass_ in range(passes):
        all_chunks = chunks(corpus, chunksize)
        gamma_by_chunks = []
        for chunk_no, chunk in enumerate(all_chunks):
            other = LdaState(eta, (num_topics, num_terms), dtype=dtype)
            # Do estep
            if len(chunk) > 1:
                if verbose:
                    print(f'performing inference on a chunk of {len(chunk) } documents')
            else:
                raise

            # Initialize the variational distribution q(theta|gamma) for the chunk
            gamma = rand.gamma(100., 1. / 100., (len(chunk), num_topics)).astype(dtype, copy=False)
            tmpElogtheta = dirichlet_expectation(gamma)
            tmpexpElogtheta = np.exp(tmpElogtheta)
            sstats = np.zeros_like(expElogbeta, dtype=dtype)
            converged = 0

            # Now, for each document d update that document's gamma and phi
            epsilon = 1e-7

            for d, doc in enumerate(chunk):
                ids = [idx for idx, _ in doc]
                cts = np.fromiter((cnt for _, cnt in doc), dtype=dtype, count=len(doc))
                gammad = gamma[d, :]
                Elogthetad = tmpElogtheta[d, :]
                expElogthetad = tmpexpElogtheta[d, :]
                expElogbetad = expElogbeta[:, ids]

                # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
                # phinorm is the normalizer.
                phinorm = np.dot(expElogthetad, expElogbetad) + epsilon

                for _ in range(iterations):
                    lastgamma = gammad
                    # We represent phi implicitly to save memory and time.
                    # Substituting the value of the optimal phi back into
                    # the update for gamma gives this update. Cf. Lee&Seung 2001.
                    gammad = alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
                    Elogthetad = dirichlet_expectation(gammad)
                    expElogthetad = np.exp(Elogthetad)
                    phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
                    # If gamma hasn't changed much, we're done.
                    meanchange = np.mean(np.abs(gammad - lastgamma))
                    if meanchange < gamma_threshold:
                        converged += 1
                        break
                gamma[d, :] = gammad
                sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
                gamma_by_chunks.append(gamma)

            if len(chunk) > 1:
                if verbose:
                    print(f"{converged}/{len(chunk)} documents converged within {iterations} iterations")

            sstats *= expElogbeta

            other.sstats += sstats
            other.numdocs += gamma.shape[0]

            # Do mstep
            if verbose:
                print('Update topics')
            previous_Elogbeta = model_states.get_Elogbeta()
            rho = pow(1 + pass_ + (num_updates / chunksize), -0.5)
            model_states.blend(rho, other)

            current_Elogbeta = model_states.get_Elogbeta()
            #Propagate the states topic probabilities to the inner object's attribute.
            expElogbeta = np.exp(current_Elogbeta)

            diff = np.mean(np.abs(previous_Elogbeta.ravel() - current_Elogbeta.ravel()))
            if verbose:
                print(f"topic diff {diff}")
            num_updates += other.numdocs

    shown = []
    topic = model_states.get_lambda()

    for i in range(num_topics):
        topic_ = topic[i]
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = topic_.argsort()[-num_words:][::-1]

        topic_ = [(id2word[id], topic_[id]) for id in bestn]
        topic_ = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic_)
        shown.append((i, topic_))

    if topics_only:
        return shown
    else:
        return shown,gamma_by_chunks

## For Carol: Small test data (d1 to d5)

In [42]:
# Sample data for analysis
d1 = "Java is a language for programming that develops a software for several platforms. A compiled code or bytecode on Java application can run on most of the operating systems including Linux, Mac operating system, and Linux. Most of the syntax of Java is derived from the C++ and C languages."
d2 = "Python supports multiple programming paradigms and comes up with a large standard library, paradigms included are object-oriented, imperative, functional and procedural."
d3 = "Go is typed statically compiled language. It was created by Robert Griesemer, Ken Thompson, and Rob Pike in 2009. This language offers garbage collection, concurrency of CSP-style, memory safety, and structural typing."
d4 = "A young girl when she first visited magical Underland, Alice Kingsleigh (Mia Wasikowska) is now a teenager with no memory of the place -- except in her dreams."
d5 = "Her life takes a turn for the unexpected when, at a garden party for her fiance and herself, she spots a certain white rabbit and tumbles down a hole after him. Reunited with her friends the Mad Hatter (Johnny Depp), the Cheshire Cat and others, Alice learns it is her destiny to end the Red Queen's (Helena Bonham Carter) reign of terror."

In [43]:
# Using slow version tf_df
tf_df, id2word = tf([d1, d2, d3, d4, d5])

lil = []
for row in tf_df.values:
    lil_sub = []
    for idx, item in enumerate(row):
        if item:
            lil_sub.append((idx, item))
    lil.append(lil_sub)
    
shown, gamma_by_chunks = my_lda_func(corpus=lil, num_topics=2, id2word=id2word, topics_only=False, num_words=10)

In [44]:
shown

[(0,
  '0.020*"language" + 0.017*"alice" + 0.016*"memory" + 0.014*"compiled" + 0.013*"concurrency" + 0.013*"go" + 0.013*"safety" + 0.013*"griesemer" + 0.013*"collection" + 0.013*"csp"'),
 (1,
  '0.029*"java" + 0.021*"operating" + 0.021*"paradigms" + 0.021*"linux" + 0.021*"c" + 0.021*"programming" + 0.017*"language" + 0.014*"compiled" + 0.013*"systems" + 0.013*"mac"')]

In [45]:
gamma_by_chunks

[array([[ 3.118508 , 24.881    ],
        [ 2.168894 , 14.830803 ],
        [20.468893 ,  3.5306888],
        [ 7.332505 ,  8.667224 ],
        [25.308207 ,  8.691179 ]], dtype=float32),
 array([[ 3.118508 , 24.881    ],
        [ 2.168894 , 14.830803 ],
        [20.468893 ,  3.5306888],
        [ 7.332505 ,  8.667224 ],
        [25.308207 ,  8.691179 ]], dtype=float32),
 array([[ 3.118508 , 24.881    ],
        [ 2.168894 , 14.830803 ],
        [20.468893 ,  3.5306888],
        [ 7.332505 ,  8.667224 ],
        [25.308207 ,  8.691179 ]], dtype=float32),
 array([[ 3.118508 , 24.881    ],
        [ 2.168894 , 14.830803 ],
        [20.468893 ,  3.5306888],
        [ 7.332505 ,  8.667224 ],
        [25.308207 ,  8.691179 ]], dtype=float32),
 array([[ 3.118508 , 24.881    ],
        [ 2.168894 , 14.830803 ],
        [20.468893 ,  3.5306888],
        [ 7.332505 ,  8.667224 ],
        [25.308207 ,  8.691179 ]], dtype=float32)]

## Simulated data (Sleep & Politics)

In [23]:
sleep = pd.read_csv('sleep_diet_exercise.csv', header=None)

In [24]:
docs = [i[0] for i in sleep.values]
len(docs)

30

In [41]:
tf_df, id2word = tf(docs)

lil = []
for row in tf_df.values:
    lil_sub = []
    for idx, item in enumerate(row):
        if item:
            lil_sub.append((idx, item))
    lil.append(lil_sub)
    
my_lda_func(corpus=lil, num_topics=2, id2word=id2word, num_words=10, chunksize=2, passes=4)

[(0,
  '0.069*"sleep" + 0.018*"exercise" + 0.009*"health" + 0.008*"studies" + 0.008*"quality" + 0.007*"adults" + 0.007*"disruption" + 0.007*"insomnia" + 0.006*"activity" + 0.006*"poor"'),
 (1,
  '0.009*"biden" + 0.008*"physical" + 0.008*"exercise" + 0.007*"found" + 0.006*"daytime" + 0.006*"osa" + 0.006*"pandemic" + 0.006*"activity" + 0.005*"change" + 0.005*"treatment"')]

## For Yingyu: Data Used by Paper

In [65]:
np.random.seed(1)
ntotal=1000
documents = reuters.fileids()
documents=np.random.choice(documents,ntotal)
docs=[reuters.raw(d) for d in documents]

In [66]:
len(docs)

1000

In [72]:
tf_df, id2word = tf(docs)

In [80]:
lil = []
for row in tf_df.values:
    lil_sub = []
    for idx, item in enumerate(row):
        if item:
            lil_sub.append((idx, item))
    lil.append(lil_sub)
    
my_lda_func(corpus=lil, num_topics=5, id2word=id2word, num_words=10, chunksize=20, passes=10)

[(0,
  '0.015*"tonnes" + 0.010*"trade" + 0.009*"japan" + 0.007*"ec" + 0.006*"countries" + 0.006*"department" + 0.006*"official" + 0.005*"sugar" + 0.005*"production" + 0.005*"states"'),
 (1,
  '0.031*"lt" + 0.017*"dlrs" + 0.016*"company" + 0.014*"corp" + 0.014*"shares" + 0.013*"inc" + 0.011*"offer" + 0.010*"pct" + 0.010*"stock" + 0.010*"share"'),
 (2,
  '0.082*"vs" + 0.076*"mln" + 0.048*"cts" + 0.041*"net" + 0.038*"dlrs" + 0.029*"loss" + 0.025*"shr" + 0.022*"lt" + 0.019*"profit" + 0.017*"year"'),
 (3,
  '0.034*"pct" + 0.028*"mln" + 0.027*"year" + 0.021*"billion" + 0.016*"dlrs" + 0.015*"oil" + 0.011*"last" + 0.011*"february" + 0.008*"january" + 0.007*"quarter"'),
 (4,
  '0.013*"bank" + 0.011*"market" + 0.010*"new" + 0.009*"rate" + 0.009*"rates" + 0.009*"dollar" + 0.007*"one" + 0.006*"meeting" + 0.006*"pct" + 0.006*"exchange"')]

## For Melody: Real world data (from Tweet)

In [81]:
# Real world sample data
raw_tweets = pd.read_csv('clean_tweets.csv')

tweets_list = raw_tweets.Tweets.values.tolist()

# Turn the list of string into a list of tokens
clean_tweets = [t.split(',') for t in tweets_list]

len(clean_tweets)

6000

In [82]:
id2word = Dictionary(clean_tweets)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in clean_tweets]

In [84]:
pprint(my_lda_func(corpus=corpus, num_topics=5, id2word=id2word, num_words=10, chunksize=100, passes=10))

[(0,
  '0.081*"follow" + 0.073*"check" + 0.046*"automatically" + 0.038*"people" + '
  '0.029*"unfollowed" + 0.026*"person" + 0.019*"follower" + 0.008*"unfollower" '
  '+ 0.008*"wind" + 0.008*"mph"'),
 (1,
  '0.015*"trump" + 0.009*"people" + 0.009*"say" + 0.007*"think" + 0.006*"know" '
  '+ 0.005*"need" + 0.005*"right" + 0.005*"woman" + 0.004*"vote" + 0.004*"go"'),
 (2,
  '0.013*"good" + 0.011*"go" + 0.010*"love" + 0.010*"day" + 0.009*"time" + '
  '0.009*"thank" + 0.008*"think" + 0.008*"look" + 0.008*"year" + 0.007*"know"'),
 (3,
  '0.016*"thank" + 0.014*"new" + 0.012*"great" + 0.008*"how" + 0.007*"work" + '
  '0.006*"late" + 0.005*"help" + 0.005*"learn" + 0.005*"look" + 0.004*"good"'),
 (4,
  '0.125*"more" + 0.037*"today" + 0.036*"video" + 0.034*"like" + '
  '0.016*"cancer" + 0.012*"arie" + 0.009*"pisce" + 0.009*"aquarius" + '
  '0.006*"capricorn" + 0.006*"feel"')]


## Compare with Gensim

In [13]:
from gensim.models import LdaModel

In [14]:
lda_model = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=10, 
                   random_state=10,
                   chunksize=100,
#                    alpha='auto',
#                    per_word_topics=True
                    )

In [15]:
pprint(lda_model.print_topics())

[(0,
  '0.021*"portfolio" + 0.013*"employment" + 0.010*"nursing" + 0.009*"repair" + '
  '0.008*"prevention" + 0.008*"command" + 0.008*"consultation" + '
  '0.007*"terminal" + 0.007*"briefly" + 0.007*"shall"'),
 (1,
  '0.029*"trump" + 0.011*"say" + 0.009*"vote" + 0.007*"country" + 0.006*"ban" '
  '+ 0.006*"people" + 0.006*"order" + 0.006*"need" + 0.005*"state" + '
  '0.005*"right"'),
 (2,
  '0.021*"thank" + 0.018*"great" + 0.016*"good" + 0.012*"look" + 0.010*"year" '
  '+ 0.009*"today" + 0.009*"time" + 0.009*"day" + 0.009*"love" + 0.008*"new"'),
 (3,
  '0.013*"new" + 0.010*"how" + 0.007*"work" + 0.006*"learn" + 0.006*"great" + '
  '0.006*"change" + 0.006*"need" + 0.005*"business" + 0.005*"help" + '
  '0.005*"social"'),
 (4,
  '0.007*"cove" + 0.000*"killing" + 0.000*"peggys" + 0.000*"lagoon" + '
  '0.000*"infamous" + 0.000*"fate" + 0.000*"await" + 0.000*"creek" + '
  '0.000*"maiden" + 0.000*"roaster"'),
 (5,
  '0.157*"more" + 0.053*"today" + 0.018*"cancer" + 0.017*"pisce" + '
  '0.013*"c