In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from scipy.special import digamma
from scipy.stats import dirichlet
import time
from scipy.special import gamma as Gamma

### Latent Dirichlet Allocation (LDA). 
Please download the pre-processed corpus from `data/lda`. From `ap.txt`, you can see the original Associated Press corpus. `ap.dat` are the processed data which you will work on. Each row in `ap.dat` represents a document. In total, we have $2,246$ documents. The first number in each row is the number of words in the document. The following are a list of  **word-id:count** items. Word-id starts with 0. The corresponding word list is given in `vocab.txt`. The first row corresponds to Word-id 0, second, Word-id 1, and continue. 
The first number in each row is the number of words in the document. The following are a list of word-id:count items. Word-id starts with 0. The corresponding word list is given in `vocab.txt`. The first row correspond to Word-id 0, second, Word-id 1, and continue.

In [3]:
def softmax(x):
    z = x - np.max(x)
    numerator = np.exp(z)
    denominator = np.sum(numerator)
    softmax = numerator/denominator
    return softmax

### Data preparation!

In [4]:
h = open("data/lda/vocab.txt", "r")
Vocab = h.read().splitlines()

In [5]:
Vocab[:5]

['i', 'new', 'percent', 'people', 'year']

In [6]:
V = len(Vocab)
V # number of vocabulary

10473

In [7]:
f = open("data/lda/ap.dat", "r")
list_Documents = f.read().splitlines()

In [8]:
M = len(list_Documents)
M # number of documnets

2246

In [10]:
def len_of_documents():
    l_o_d = []
    for i in range(M):
        l_o_d.append(int(list_Documents[i].split(' ')[0]))
    return l_o_d
lengths_of_documents = len_of_documents()

In [11]:
def _W_():
    W = []
    c = 0
    for m in range(M):
        l = lengths_of_documents[m]
        D = np.zeros(l).astype(int)
        L = list_Documents[m].split(' ')[1:]
        for w_n in L:
            w, n = w_n.split(':')
            w = int(w)
            n = int(n)
            D[c:c+n] = w
            c = c+ n
        W.append(D)
    return W

In [12]:
W = _W_()

In [13]:
W[0]

array([    0,  6144,  3586,  3586,     3,     4,  1541,     8,    10,
        3927,    12,    12,    12,    12,    12,    12,    12,  4621,
         527,  9232,  1112,  1112,    20,  2587,  6172, 10269, 10269,
          37,    42,  3117,  1582,  1585,  1585,  1585,   435,  9268,
        9268,  9268,   571,   571,    60,    61,    63,    63,    64,
          64,  5185,    11,  4683,   590,   590,  1103,  1103,   592,
        5718,  1623,  1623,  1624,  1624,  1624,  1624,    89,    89,
        6234,  8802,  1638,   103,   600,  9404,   106,  3691,   720,
        2672,   113,  2165,  5751,   123,   123,   123,  1148,   128,
         128,  1670,  1670,  4231,  1167,   144,   147,   149,   149,
         149,   149,   149,   149,   149,  3735,  3735,  5272,  5272,
        1732,   673,   673,  5282,    27,  1700,  9893,  9893,   166,
         167,   173,   174,  2224,  2248,   372,   372,   186,  4284,
        4284,  4284,  3450,  3450,   117,   117,   203,  2244,  5320,
         201,  4215,

In [14]:
list_Documents[0].split(' ')[:10]

['186',
 '0:1',
 '6144:1',
 '3586:2',
 '3:1',
 '4:1',
 '1541:1',
 '8:1',
 '10:1',
 '3927:1']

### ??

In [15]:
Dis = dirichlet(alpha = [.5,.5])

In [16]:
A = Dis.rvs(size = 5)
A

array([[0.20746881, 0.79253119],
       [0.25744619, 0.74255381],
       [0.02923932, 0.97076068],
       [0.67333925, 0.32666075],
       [0.72967768, 0.27032232]])

In [17]:
Dis.logpdf(x = A.T)

array([-0.24208105, -0.31742764,  0.63622827, -0.38756034, -0.33308357])

In [18]:
Dis.logpdf(A[0])

-0.2420810493857063

In [19]:
Dis.pdf(A.T)

array([0.78499255, 0.72801936, 1.88934134, 0.67871068, 0.71671029])

## DLA

In [20]:
K = 5 #number of topics
alpha = 1/K* np.ones(K)

In [21]:
phi = [1/K*np.ones((l, K)) for l in lengths_of_documents] # a list of l x K matrix: z_nj ~ Mult(phi[n][j])
psi = 1/V* np.ones((K,V))# an K x V matrix whose rows are topics: beta_k ~ a Fir(psi[k])
gamma = np.ones((M, K)) # M Mixture parameters: theta_i ~ Dir(gamma[i])

In [22]:
def phi_nj(gamma_n, psi, w_nj):
    dg_gamma_n = digamma(gamma_n) # (K,)
    dg_psi = digamma(psi[:,w_nj]) # (K,)
    return softmax(dg_gamma_n + dg_psi)

In [23]:
def gamma_n(alpha, phi_n):
    return alpha + phi_n.sum(axis = 0) # (K,)

In [24]:
def psi_(eta, phi):
    temp = eta*np.ones((K,V))
    for v in range(V):
        for n in range(M):
            J = (W[n] == v)
            temp[:,v] += phi[n][J,:].sum(axis = 0) # (k,)
    return temp

In [25]:
phi_nj(gamma[0], psi, w_nj=3)

array([0.2, 0.2, 0.2, 0.2, 0.2])

In [26]:
for n in range(M):
    for j in range(lengths_of_documents[n]):
        phi[n][j] = phi_nj(gamma[n], psi, W[n][j])

In [27]:
t = time.time()
psi =  psi_(eta=1/V, phi = phi)
print(time.time()-t)

147.2027997970581


In [28]:
psi[0]

array([6.03692001e+04, 9.54836246e-05, 9.54836246e-05, ...,
       9.54836246e-05, 9.54836246e-05, 9.54836246e-05])

In [29]:
for n in range(M):
    gamma[n] = gamma_n(alpha, phi[n])

In [30]:
gamma[0]

array([37.4, 37.4, 37.4, 37.4, 37.4])

In [55]:
def ell(alpha):
    A = np.log(Gamma(alpha.sum()))
    B = np.sum(np.log(Gamma(alpha+1e-7)))
    C = np.sum(digamma(gamma+1e-7)@ (alpha -1))
    return A+B+C

In [56]:
def d_ell(alpha):
    return M * digamma(alpha.sum()) - M * digamma(alpha) + digamma(gamma).sum(axis =0)

In [57]:
ell(10*np.ones(5))

312408.7650221522

In [58]:
d_ell(10*np.ones(5))

array([10644.21447865, 10644.21447865, 10644.21447865, 10644.21447865,
       10644.21447865])

In [75]:
def constraint1(alpha):
    return alpha - 1e-3
def constraint2(alpha):
    return alpha.sum() - 1
cons = [{'type':'ineq', 'fun': constraint1}]

In [87]:
alpha = np.array([1, 2, 3, 4, 5])
minimize(ell, alpha, jac = d_ell, constraints = cons)

  A = np.log(Gamma(alpha.sum()))
  B = np.sum(np.log(Gamma(alpha+1e-7)))
  B = np.sum(np.log(Gamma(alpha+1e-7)))
  A = np.log(Gamma(alpha.sum()))


     fun: -inf
     jac: array([32901.33942471,  4233.36686244,  2840.41848468,  9767.65852805,
       13115.53260162])
 message: 'Positive directional derivative for linesearch'
    nfev: 443
     nit: 47
    njev: 43
  status: 8
 success: False
       x: array([-24643.89870251, -31174.25773155, -36545.23494319, -41483.4754476 ,
       -46189.62963799])

In [84]:
model.x/model.x.sum()

array([0.2141476 , 0.21416492, 0.31227514, 0.12970361, 0.12970873])

In [85]:
model.x/model.x.sum()

array([0.2141476 , 0.21416492, 0.31227514, 0.12970361, 0.12970873])

In [105]:
def ell_eta(eta):
    return K * np.log(Gamma(V*eta)) - V*K*Gamma(eta) + (eta-1) * digamma(psi).sum()

In [106]:
def d_ell_eta(eta):
    return K*V * digamma(V*eta) - V*K*digamma(eta)

In [116]:
def const1(eta):
    return eta -1e-5 
cons_eta = [{'type':'ineq', 'fun': const1}]

In [118]:
eta = 1e-20
minimize(ell_eta, eta, jac = d_ell_eta, constraints = cons_eta)

     fun: -5.236499999999999e+24
     jac: array([5.236e+24])
 message: 'Inequality constraints incompatible'
    nfev: 1
     nit: 1
    njev: 1
  status: 4
 success: False
       x: array([1.e-20])