# Simulate the data

In [22]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import dirichlet, multinomial
from scipy.sparse import lil_matrix
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

np.random.seed(37)

# number of topics
K = 10
# number of words
N = 100
# number of documents
M = 1000

# priors on K topics
a = np.array([0.1, 0.2, 0.3, 0.4, 0.025, 0.015, 0.37, 0.88, 0.03, 0.08])
# priors on N words
b = np.full((1, N), 0.001, dtype=float)[0]

# distribution of words in topic k
psi = np.array([dirichlet.rvs(b)[0] for _ in range(K)])

# distribution of topics in document d
theta = np.array([dirichlet.rvs(a)[0] for _ in range(M)])

# simulate the documents
docs = []
for i in range(M):
    d = {}
    stop = False
    for j in range(N):
        try:
            z_ij = multinomial.rvs(1, theta[i] / theta[i].sum())
            topic = np.argmax(z_ij)
        except:
            print(i)
            stop = True
            break
        
        try:
            w_ij = multinomial.rvs(1, psi[topic] / psi[topic].sum())
        except:
            print(topic)
            print(psi[topic].sum())
            print(psi[topic])
            stop = True
            break
        word = np.argmax(w_ij)
        
        if word not in d:
            d[word] = 0
        d[word] = d[word] + 1
    if stop == True:
        break
    docs.append(d)

# make a nice matrix
X = lil_matrix((M, N), dtype=np.int16)
for i, d in enumerate(docs):
    counts = sorted(list(d.items()), key=lambda tup: tup[0])
    for tup in counts:
        X[i, tup[0]] = tup[1]
        
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)

109


In [27]:
theta[109].sum()

0.9999999999999999

In [24]:
multinomial.rvs(1, theta[109] / theta[109].sum())

ValueError: pvals < 0, pvals > 1 or pvals contains NaNs

In [14]:
psi[5].sum()

1.0000000000000002

In [15]:
psi[4].sum()

0.9999999999999999

In [29]:
(theta[109] / theta[109].sum()).sum()

1.0000000000000002

In [32]:
s = theta[109].sum()
np.array([v / s for v in theta[109]]).sum()

1.0000000000000002

# Look at the distribution

In [None]:
means = X.mean(axis=0)
means = [(i, means[0, i]) for i in range(means.shape[1]) if means[0, i] > 0.0]

In [None]:
import math

n = len(means)
n_cols = 3
n_rows = math.ceil(n / n_cols)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 20))
ax = np.ravel(ax)

for i in range(len(ax)):
    if i < len(means):
        idx = means[i][0]
        mu = means[i][1]
        data = X[:,idx].toarray().reshape(1, -1)[0]
        sns.distplot(data, ax=ax[i])
        ax[i].set_title('w{}, mean={:.2f}'.format(idx, mu))
        ax[i].set_ylabel('p')
        ax[i].set_xlabel('count')
    else:
        ax[i].axis('off')

plt.tight_layout()

# Look at the pairwise correlations

In [None]:
indices = sorted([tup[0] for tup in means])
df = pd.DataFrame(X[:, indices].toarray(), columns=['w{}'.format(i) for i in indices])
corr = df.corr()

mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Gaussian mixture models

In [None]:
from scipy.sparse.linalg import svds

U, S, V = svds(X, k=20)

In [None]:
from sklearn.mixture import GaussianMixture

def get_gmm_labels(X, k):
    gmm = GaussianMixture(n_components=k, max_iter=200, random_state=37)
    gmm.fit(X)
    aic = gmm.aic(X)
    bic = gmm.bic(X)
    print('{}: aic={}, bic={}'.format(k, aic, bic))
    return aic, bic, k, gmm

gmm_scores = [get_gmm_labels(U, k) for k in range(2, 26)]

In [None]:
gmm = gmm_scores[14][3]

In [None]:
_x = [s[2] for s in gmm_scores]
_y = [s[0] for s in gmm_scores]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(_x, _y, color='tab:blue')
ax.set_xlabel('k')
ax.set_ylabel('aic')
ax.set_title('AIC vs k')

# k-means clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def get_kmc(X, k):
    model = KMeans(k, random_state=37)
    model.fit(X)
    labels = model.predict(X)
    score = silhouette_score(X, labels)
    print('{}: score={}'.format(k, score))
    return score, k, model

kmc_scores = [get_kmc(X, k) for k in range(2, 26)]

In [None]:
_x = [s[1] for s in kmc_scores]
_y = [s[0] for s in kmc_scores]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(_x, _y, color='tab:blue')
ax.set_xlabel('k')
ax.set_ylabel('score')
ax.set_title('Silhouette vs k')

# LDA

In [None]:
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel, LdaModel, HdpModel

def dict_to_text(d):
    def convert(p):
        w = 'w{} '.format(p[0])
        s = w * p[1]
        s = s.strip()
        return s
    
    pairs = list(d.items())
    return ' '.join([convert(p) for p in pairs])
        
T = [dict_to_text(docs[i]).split(' ') for i in range(len(docs))]

dictionary = corpora.Dictionary(T)
corpus = [dictionary.doc2bow(text) for text in T]

tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
corpus_lsi = lsi[corpus_tfidf]
print(lsi.get_topics().shape)

In [None]:
lda = LdaModel(corpus, 
               id2word=dictionary, 
               num_topics=15, 
               random_state=37, 
               iterations=100,
               per_word_topics=True)
corpus_lda = lda[corpus]
print(lda.get_topics().shape)

In [None]:
hdp = HdpModel(corpus_tfidf, id2word=dictionary)
corpus_hdp = hdp[corpus_tfidf]
print(hdp.get_topics().shape)

In [None]:
lda.print_topics(5)