In [1]:
import LDA
import time
import numpy as np
import matplotlib.pyplot as plt
import collections
import json
from scipy import sparse

In [90]:
import sys
sys.path.append('/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages')

## Preparing the data:

In [3]:
with open('temp/Edimbourg/Edi10.json', 'r') as fp:
    reduced_edi = json.load(fp)

with open('temp/Edimbourg/word_to_index.json', 'r') as fp:
    word_to_index = json.load(fp)

with open('temp/Edimbourg/index_to_word.json', 'r') as fp:
    index_to_word = json.load(fp)

with open('temp/Edimbourg/bid_to_index.json', 'r') as fp:
    bid_to_index = json.load(fp)

with open('temp/Edimbourg/index_to_bid.json', 'r') as fp:
    index_to_bid = json.load(fp)

In [6]:
all_words = []
for k in reduced_edi.keys():
    all_words.extend(reduced_edi[k])

In [61]:
vocab = list(set(all_words))

In [53]:
word_to_index = dict(zip(vocab,range(len(vocab))))
index_to_word = dict(zip(range(len(vocab)),vocab))

In [8]:
import collections
collections.Counter(all_words)



In [18]:
toremove = ['food','place','time','menu','restaurant','bit','staff','service','table','dish','meal','thing','price','day','person','option','friend','lot','way','quality','selection','area','year','minute','hour','location','door','point','yelp','horror','blah']

In [19]:
len(toremove)

31

In [62]:
from scipy import sparse

nonzero_data = []
rows_s = []
cols_s = []

for k in reduced_edi.keys():
    counter = collections.Counter(reduced_edi[k])
    for r in toremove:
        counter[r] = 0
    nonzero_data += list(np.floor(np.array(counter.values())*100/np.max(counter.values()))) #Rescaling to account for the important differences in the lengths of the reviews
    rows_s += [bid_to_index[k]]*len(counter.values())
    cols_s += [word_to_index[ck] for ck in counter.keys()]

sparse_mat = sparse.csc_matrix((nonzero_data,(rows_s,cols_s)),shape = (len(bid_to_index),len(word_to_index)))

dtm_edi = sparse_mat.toarray()

In [22]:
train = {}
test = {}

for k in reduced_edi.keys():
    spl = np.split(range(len(reduced_edi[k])),[int(np.floor(0.8*len(reduced_edi[k]))),len(reduced_edi[k])])
    train[k] = np.array(reduced_edi[k])[spl[0]]
    test[k] = np.array(reduced_edi[k])[spl[1]]

In [26]:
from scipy import sparse

nonzero_data_tr = []
rows_s_tr = []
cols_s_tr = []
nonzero_data_ts = []
rows_s_ts = []
cols_s_ts = []

for k in reduced_edi.keys():
    
    counter_tr = collections.Counter(train[k])
    for r in toremove:
        counter_tr[r] = 0
    nonzero_data_tr += counter_tr.values()
    rows_s_tr += [bid_to_index[k]]*len(counter_tr.values())
    cols_s_tr += [word_to_index[ck] for ck in counter_tr.keys()]
    
    counter_ts = collections.Counter(test[k])
    for r in toremove:
        counter_ts[r] = 0
    nonzero_data_ts += counter_ts.values()
    rows_s_ts += [bid_to_index[k]]*len(counter_ts.values())
    cols_s_ts += [word_to_index[ck] for ck in counter_ts.keys()]

sparse_mat_tr = sparse.csc_matrix((nonzero_data_tr,(rows_s_tr,cols_s_tr)),shape = (len(bid_to_index),len(word_to_index)))
sparse_mat_ts = sparse.csc_matrix((nonzero_data_ts,(rows_s_ts,cols_s_ts)),shape = (len(bid_to_index),len(word_to_index)))

In [29]:
dtm_train = sparse_mat_tr.toarray()
dtm_test = sparse_mat_ts.toarray()

## Loading the functions:

In [21]:
from scipy.special import psi

def rho(tau,kappa,t):
	return pow(tau + t, - kappa)

def digamma(mat):
	if (len(mat.shape) == 1):
		return(psi(mat) - psi(np.sum(mat)))
	else:
		return(psi(mat) - psi(np.sum(mat, 0))[np.newaxis,:])

def lda_batch(dtm,ntopic,batch_size,tau,kappa):
	nvoc = dtm.shape[1]
	ndoc = dtm.shape[0]
	nu = 1./ntopic
	alpha = 1./ntopic

	topics = np.random.gamma(100.,1./100.,(nvoc,ntopic))
	gamma  = np.random.gamma(100.,1./100.,(ndoc,ntopic))

	numbatch = ndoc / batch_size
	batches = np.array_split(range(ndoc),numbatch)


	for it_batch in range(numbatch):
		ELogBeta = digamma(topics)
		ExpELogBeta = np.exp(ELogBeta)
		
		temp_topics = np.zeros(topics.shape)

		indices = []

		for d in batches[it_batch]:
			# print d
			ids = np.nonzero(dtm[d,:])[0]
			indices.extend(ids)
			cts = dtm[d,ids]
			ExpELogBetad = ExpELogBeta[ids,:]

			gammad = gamma[d,:]
			ElogTethad = digamma(gammad)
			ExpLogTethad = np.exp(ElogTethad)

			# print gammad

			for inner_it in range(1000):
				
				oldgammad = gammad

				phi =  ExpLogTethad * ExpELogBetad
				phi = phi / (phi.sum(axis=1)+0.00001)[:, np.newaxis]

				gammad = alpha + np.dot(cts,phi)

				ElogTethad = digamma(gammad)
				ExpLogTethad = np.exp(ElogTethad)
				# print gammad

				if np.mean((gammad-oldgammad)**2)<0.0000001:
					break

			#print inner_it
			gamma[d,:] = gammad

			temp_topics[ids,:] += phi * cts[:,np.newaxis]

		indices = np.unique(indices)

		rt = rho(tau,kappa,it_batch)

		topics[indices] = (1 - rt) * topics[indices,:] + rt * ndoc * (nu + temp_topics[indices,:]) / len(batches[it_batch])

	return topics,gamma

def inference(lda,dtm,tau,kappa):

	ntopic = lda[0].shape[1]
	nvoc = dtm.shape[1]
	ndoc = dtm.shape[0]
	nu = 1./ntopic
	alpha = 1./ntopic

	topics = lda[0].copy()
	phi = np.random.gamma(100.,1./100.,(nvoc,ntopic))
	gamma  = np.random.gamma(100.,1./100.,(ndoc,ntopic))

	numbatch = ndoc
	batches = np.array_split(range(ndoc),numbatch)

	for it_batch in range(numbatch):
		ELogBeta = digamma(topics)
		ExpELogBeta = np.exp(ELogBeta)
		
		temp_topics = np.zeros(topics.shape)

		indices = []

		for d in batches[it_batch]:
			# print d
			ids = np.nonzero(dtm[d,:])[0]
			indices.extend(ids)
			cts = dtm[d,ids]
			ExpELogBetad = ExpELogBeta[ids,:]

			gammad = gamma[d,:]
			ElogTethad = digamma(gammad)
			ExpLogTethad = np.exp(ElogTethad)

			# print gammad

			for inner_it in range(1000):
				
				oldgammad = gammad

				phi =  ExpLogTethad * ExpELogBetad
				phi = phi / (phi.sum(axis=1)+0.00001)[:, np.newaxis]

				gammad = alpha + np.dot(cts,phi)

				ElogTethad = digamma(gammad)
				ExpLogTethad = np.exp(ElogTethad)
				# print gammad

				if np.mean((gammad-oldgammad)**2)<0.0000001:
					break

			#print inner_it
			gamma[d,:] = gammad

			temp_topics[ids,:] += phi * cts[:,np.newaxis]

		indices = np.unique(indices)
		if indices.size == 0:
			continue
		rt = rho(tau,kappa,it_batch)

		topics[indices] = (1 - rt) * topics[indices,:] + rt * ndoc * (nu + temp_topics[indices,:]) / len(batches[it_batch])

	return topics,gamma

def perplexity(lda,newdocs,tau,kappa,perword = False):
	
	new = inference(lda,newdocs,tau,kappa)
	
	topics = new[0]
	gammas = new[1]
	
	topics = topics/topics.sum(axis=0)
	
	if len(gammas.shape) == 1:
		gammas = gammas/np.sum(gammas)
		doc_idx = np.nonzero(newdocs)[0]
		doc_cts = newdocs[doc_idx]
		return np.exp(-np.log(np.sum(np.dot(topics[doc_idx,:],gammas)*doc_cts))/np.sum(doc_cts))
	
	else:
		norm = lambda x: x/np.sum(x)
		gammas = np.apply_along_axis(norm,axis = 1,arr = gammas)
		
		num = 0
		denom = 0
		
		for i in range(gammas.shape[0]):
			doc_idx = np.nonzero(newdocs[i,:])[0]
			doc_cts = newdocs[i,doc_idx]
			num = np.sum(np.log(np.dot(topics[doc_idx,:],gammas[i,:]))*doc_cts)
			denom += np.sum(doc_cts)
			
		if ~perword:
			return num
		else:
			return num/denom

In [33]:
%%time
batchsize = 40
Kappa = [0.5,0.6,0.7,0.8]
Tau = [64,128,256,512,1024]

K = range(10,60,10)

perplexity_dict = {}

for k in K:
    for kappa in Kappa:
        for tau in Tau:
            np.random.seed(0)
            train_lda = lda_batch(dtm_train,k,batchsize,tau,kappa)
            perplexity_dict[(k,tau,kappa)] = perplexity(train_lda,dtm_test,tau,kappa,True)

CPU times: user 12min 1s, sys: 4.1 s, total: 12min 5s
Wall time: 12min 13s


In [34]:
best_combi = max(perplexity_dict, key=perplexity_dict.get)

In [35]:
best_combi

(50, 128, 0.5)

In [84]:
%%time
np.random.seed(10)
model = lda_batch(dtm_edi,20,10,512,0.3)

CPU times: user 5.79 s, sys: 21.5 ms, total: 5.81 s
Wall time: 5.84 s


In [85]:
topic_word = model[0].T  # model.components_ also works
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(u'Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: writing doner dollop cream sake topping couscous indian introduction cake
Topic 1: ware fig truffle uni foot lemongrass cure bone fun freezer
Topic 2: imagination madness custard fisher weekend farmer refurbishment cupcake shortcake act
Topic 3: spot towel flesh hostess respect plan sweetness suggestion wonder poppadum
Topic 4: theme atmosphere bon plastic help meal wifi taco game coke
Topic 5: fee supper traveller pig leek bay treasure use suite castle
Topic 6: phone branch refurb killer list nachos sex loyalty wing whiskey
Topic 7: pretzel convenience let soul degree basket field flavour care team
Topic 8: minimum dollop entrance attraction bank deli visitor pair sort tonight
Topic 9: eel title armchair hummus goat half hassle industry soda chunk
Topic 10: chutney ball tiramisu grapefruit tastebuds spanish furniture clothe nachos temperature
Topic 11: bulgogi soul warmth bun pre-dinner cinnamon smoking pair specialise family
Topic 12: fan £15 cake italian gluten region lover

In [88]:
from sklearn.decomposition import NMF
nmf10 = NMF(n_components=20, init='random', random_state=0,max_iter=1500)
mat10 = nmf10.fit_transform(dtm_edi)
topic_word = nmf10.components_  # model.components_ also works
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(u'Topic {}: {}'.format(i, ' '.join(topic_words)))


Topic 0: attraction meringue brown benedict bang section tastiness tale min fun
Topic 1: theme trade meal basis atmosphere bon code entrance portion vinegar
Topic 2: chutney ball pine ware relax meringue button kebab fun painting
Topic 3: waiter flatmate deli pair barbecue noble host vindaloo loyalty bang
Topic 4: cake fan sake ware care £15 strawberry tonight region claw
Topic 5: mit chutney painting glance cash midnight relax clientele tastebuds quarter
Topic 6: writing £15 pair sunday italian painting fan second ball hotdog
Topic 7: green vin relax cash section discount painting hula evening think
Topic 8: tale code animal update route craic discount strawberry trade rice
Topic 9: minimum entrance visitor bang rum atmosphere silver attraction deli pair
Topic 10: sort bang update sin portion brown kebab flatmate half rice
Topic 11: tiramisu chutney ball tastebuds clothe painting grapefruit spanish furniture rear
Topic 12: section talk shame video error check fancy vindaloo warming di

In [93]:
import lda

mod = lda.LDA(20)
mod.fit(dtm_edi.astype(int))



<lda.lda.LDA instance at 0x10b25c200>

In [94]:
topic_word = mod.topic_word_   # model.components_ also works
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(u'Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: tiramisu chutney grapefruit ball tastebuds mit clothe painting peppercorn cash
Topic 1: bulgogi soul warmth atmosphere think pretzel update bun catch phone
Topic 2: heater soul shame latte pair title taco portion think answer
Topic 3: atmosphere code portion mit trade tale flatmate rice theme midnight
Topic 4: mit sort bang wedge eel rice sin atmosphere glance flatmate
Topic 5: attraction sort waiter kebab green writing sin think cash quarter
Topic 6: minimum entrance attraction bang sort waiter visitor deli pair writing
Topic 7: pretzel route convenience pie degree entrance update green heater trade
Topic 8: writing £15 cake dollop fan topping doner sake minimum almond
Topic 9: theme entrance atmosphere excellence bon basis trade think wifi portion
Topic 10: fee phone castle branch writing sake slow talk fancy pair
Topic 11: flatmate vindaloo strawberry trade impression hula section animal talk performance
Topic 12: theme atmosphere trade loo wedge think tale spot attraction 

In [11]:
np.save("CS281/temp/edi_topics",BEST_EDI[0])

In [None]:
np.save("CS281/temp/edi_assignment",BEST_EDI[1])

In [58]:
def clean_topics(lda,thresh1):
    topics = lda[0].copy()
    gammas = lda[1].copy()
    notflag_uni = []
    for i in range(topics.shape[1]):
        if 1 - spatial.distance.cosine(topics[:,i], [1./topics.shape[0]]*topics.shape[0])<thresh1:
            notflag_uni.append(i)
    newtopics = topics[:,notflag_uni]
    newgammas = gammas[:,notflag_uni]/gammas[:,notflag_uni].sum(axis=1)[:,np.newaxis]
    
    
    return newtopics,newgammas