# LAS VEGAS: LDA

In [1]:
import sys
sys.path.append('/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages')

In [2]:
import LDA
import lda
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import psi
import collections
import json
from scipy import sparse
import sklearn.cluster
import sklearn.decomposition

### Loading the data:

Loading the data for Las vegas:

In [4]:
with open('../Zipped_data_LV/lasvegas_word_adjective.json', 'r') as fp:
    lv_wadj = json.load(fp)

Creating the vocabulary list with words present at least 10 times in all the reviews:

In [6]:
vocab = []
for i in lv_wadj.keys():
    vocab.extend(lv_wadj[i][0][0])
import collections
lv_vocab10 = []
d = collections.Counter(vocab)
for w in vocab:
    if d[w]>=10:
        lv_vocab10.append(w)
lv_vocab10 = list(set(lv_vocab10)) 

In [7]:
len(lv_vocab10)

10364

Creating dictionaries to map bid to an index and words to index:

In [9]:
lv_word_to_index  = dict(zip(lv_vocab10,range(len(lv_vocab10))))
lv_index_to_word = dict(zip(range(len(lv_vocab10)),lv_vocab10))
lv_bid_to_index = dict(zip(lv_wadj.keys(),range(len(lv_wadj.keys()))))
lv_index_to_bid = dict(zip(range(len(lv_wadj.keys())),lv_wadj.keys()))

Loading the dtm:

In [10]:
lv_dtm = np.load('../Zipped_data_LV/lasvegas_dtm.npy')

In [11]:
lv_dtm.shape

(3822, 10364)

### DTM TRAIN AND TEST:

In [21]:
reduced_lv = {}
it = 0
for (k,v) in lv_wadj.items():
    if it % 50 == 0:
        print it
    new = []
    for w in v[0][0]:
        if w in lv_vocab10:
            new.append(w)
    reduced_lv[k] = new
    it+=1

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800


In [49]:
train = {}
test = {}

for k in reduced_lv.keys():
    spl = np.split(range(len(reduced_lv[k])),[int(np.floor(0.8*len(reduced_lv[k]))),len(reduced_lv[k])])
    train[k] = np.array(reduced_lv[k])[spl[0]]
    test[k] = np.array(reduced_lv[k])[spl[1]]

In [50]:
from scipy import sparse

nonzero_data_tr = []
rows_s_tr = []
cols_s_tr = []
nonzero_data_ts = []
rows_s_ts = []
cols_s_ts = []

for k in reduced_lv.keys():
    
    counter_tr = collections.Counter(train[k])
    nonzero_data_tr += counter_tr.values()
    rows_s_tr += [lv_bid_to_index[k]]*len(counter_tr.values())
    cols_s_tr += [lv_word_to_index[ck] for ck in counter_tr.keys()]
    
    counter_ts = collections.Counter(test[k])
    nonzero_data_ts += counter_ts.values()
    rows_s_ts += [lv_bid_to_index[k]]*len(counter_ts.values())
    cols_s_ts += [lv_word_to_index[ck] for ck in counter_ts.keys()]

sparse_mat_tr = sparse.csc_matrix((nonzero_data_tr,(rows_s_tr,cols_s_tr)),shape = (len(lv_bid_to_index),len(lv_word_to_index)))
sparse_mat_ts = sparse.csc_matrix((nonzero_data_ts,(rows_s_ts,cols_s_ts)),shape = (len(lv_bid_to_index),len(lv_word_to_index)))

In [51]:
dtm_lv_train = sparse_mat_tr.toarray()
dtm_lv_test = sparse_mat_ts.toarray()

### Loading the lda functions:

In [59]:
def rho(tau,kappa,t):
	return pow(tau + t, - kappa)

def digamma(mat):
	if (len(mat.shape) == 1):
		return(psi(mat) - psi(np.sum(mat)))
	else:
		return(psi(mat) - psi(np.sum(mat, 0))[np.newaxis,:])

In [54]:
def lda_batch(dtm,ntopic,batch_size,tau,kappa):
	nvoc = dtm.shape[1]
	ndoc = dtm.shape[0]
	nu = 1./ntopic
	alpha = 1./ntopic

	topics = np.random.gamma(100.,1./100.,(nvoc,ntopic))
	gamma  = np.random.gamma(100.,1./100.,(ndoc,ntopic))

	numbatch = ndoc / batch_size
	batches = np.array_split(range(ndoc),numbatch)


	for it_batch in range(numbatch):
		ELogBeta = digamma(topics)
		ExpELogBeta = np.exp(ELogBeta)
		
		temp_topics = np.zeros(topics.shape)

		indices = []

		for d in batches[it_batch]:
			# print d
			ids = np.nonzero(dtm[d,:])[0]
			indices.extend(ids)
			cts = dtm[d,ids]
			ExpELogBetad = ExpELogBeta[ids,:]

			gammad = gamma[d,:]
			ElogTethad = digamma(gammad)
			ExpLogTethad = np.exp(ElogTethad)

			# print gammad

			for inner_it in range(1000):
				
				oldgammad = gammad

				phi =  ExpLogTethad * ExpELogBetad
				phi = phi / (phi.sum(axis=1)+0.00001)[:, np.newaxis]

				gammad = alpha + np.dot(cts,phi)

				ElogTethad = digamma(gammad)
				ExpLogTethad = np.exp(ElogTethad)
				# print gammad

				if np.mean((gammad-oldgammad)**2)<0.0000001:
					break

			#print inner_it
			gamma[d,:] = gammad

			temp_topics[ids,:] += phi * cts[:,np.newaxis]

		indices = np.unique(indices)

		rt = rho(tau,kappa,it_batch)

		topics[indices] = (1 - rt) * topics[indices,:] + rt * ndoc * (nu + temp_topics[indices,:]) / len(batches[it_batch])

	return topics,gamma

In [60]:
def inference(lda,dtm,tau,kappa):

	ntopic = lda[0].shape[1]
	nvoc = dtm.shape[1]
	ndoc = dtm.shape[0]
	nu = 1./ntopic
	alpha = 1./ntopic

	topics = lda[0]
	phi = np.random.gamma(100.,1./100.,(nvoc,ntopic))
	gamma  = np.random.gamma(100.,1./100.,(ndoc,ntopic))

	numbatch = ndoc
	batches = np.array_split(range(ndoc),numbatch)

	for i in range(1):
		for it_batch in range(numbatch):
			ELogBeta = digamma(topics)
			ExpELogBeta = np.exp(ELogBeta)
			
			temp_topics = np.zeros(topics.shape)

			indices = []

			for d in batches[it_batch]:
				# print d
				ids = np.nonzero(dtm[d,:])[0]
				indices.extend(ids)
				cts = dtm[d,ids]
				ExpELogBetad = ExpELogBeta[ids,:]

				gammad = gamma[d,:]
				ElogTethad = digamma(gammad)
				ExpLogTethad = np.exp(ElogTethad)

				# print gammad

				for inner_it in range(1000):
					
					oldgammad = gammad

					phi =  ExpLogTethad * ExpELogBetad
					phi = phi / (phi.sum(axis=1)+0.00001)[:, np.newaxis]

					gammad = alpha + np.dot(cts,phi)

					ElogTethad = digamma(gammad)
					ExpLogTethad = np.exp(ElogTethad)
					# print gammad

					if np.mean((gammad-oldgammad)**2)<0.0000001:
						break

				# print inner_it
				gamma[d,:] = gammad

				temp_topics[ids,:] += phi * cts[:,np.newaxis]

			indices = np.unique(indices)

			rt = rho(tau,kappa,it_batch)

			topics[indices] = (1 - rt) * topics[indices,:] + rt * ndoc * (nu + temp_topics[indices,:]) / len(batches[it_batch])

	return topics,gamma

In [65]:
def perplexity(lda,newdocs,tau,kappa,perword = False):
	
	new = inference(lda,newdocs,tau,kappa)
	
	topics = new[0]
	gammas = new[1]
	
	topics = topics/topics.sum(axis=0)
	
	if len(gammas.shape) == 1:
		gammas = gammas/np.sum(gammas)
		doc_idx = np.nonzero(newdocs)[0]
		doc_cts = newdocs[doc_idx]
		return np.exp(-np.log(np.sum(np.dot(topics[doc_idx,:],gammas)*doc_cts))/np.sum(doc_cts))
	
	else:
		norm = lambda x: x/np.sum(x)
		gammas = np.apply_along_axis(norm,axis = 1,arr = gammas)
		
		num = 0
		denom = 0
		
		for i in range(gammas.shape[0]):
			doc_idx = np.nonzero(newdocs[i,:])[0]
			doc_cts = newdocs[i,doc_idx]
			num = np.sum(np.log(np.dot(topics[doc_idx,:],gammas[i,:]))*doc_cts)
			denom += np.sum(doc_cts)
			
		if ~perword:
			return num
		else:
			return num/denom

In [86]:
batchsize = 30
kappa = 0.5
tau = 1024

K = range(30,80,10)

perplexity_dict = {}

In [87]:
%%time
for k in K:
    np.random.seed(0)
    train_lda = lda_batch(dtm_lv_train,k,batchsize,tau,kappa)
    perplexity_dict[k] = perplexity(train_lda,dtm_lv_test,tau,kappa,True)

CPU times: user 11min 51s, sys: 10.3 s, total: 12min 1s
Wall time: 12min 11s


In [88]:
best_k = max(perplexity_dict, key=perplexity_dict.get)

In [89]:
best_k

70

In [90]:
%%time
np.random.seed(0)
model = lda_batch(lv_dtm,best_k,batchsize,tau,kappa)

CPU times: user 1min 31s, sys: 2.5 s, total: 1min 34s
Wall time: 1min 34s


In [91]:
topic_word = model[0].T  # model.components_ also works
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(lv_vocab10)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(u'Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: consist membership actuality southwest pluss malt pil softball powder quacamole
Topic 1: pupusa win-win heffer dawn hurdle puzzle bellman glamour heath splurge
Topic 2: spacing mackarel cutlery fu green garage stomache propane mess maitake
Topic 3: document pisco gazpacho half-roll godzilla length engagement paymon feat collar
Topic 4: happiness orange carry grimaldis pop overeating wood-fire hipster meat duper
Topic 5: basketball changer skanky goat arancine epitome methink skepticism habitat gelatos
Topic 6: hurdle mackarel overload grate variant lark education wort ashtray use
Topic 7: deed sourdough coffeehouse pisco document bundle stability dollar gelee ta
Topic 8: overload motorcycle jap meatless tarter cavier chip reordering decision tab
Topic 9: splurge unison flagship poet trashy upside surrounding ass demi-glaze importance
Topic 10: break pisco saddest gang document date trudge pin suicide stability
Topic 11: goody becasue hungover thickness mullet offender taziki d