### READ RESULTS

In [1]:

### WE SHOULD HAVE 3 DIFF SAMPLES
path_data_file = "../DataProject/reviews_clean.p"

name_model_pickle_lda = "u_lda10k_lda.p"
name_model_pickle_slda = "r_slda10k_slda.p"
name_model_pickle_dmr = "r_dmr10k_dmr.p"

num_words_to_compare = 10

import pandas as pd
import lda

## You should have these scripts
import vocabulary as v
import numpy as np


In [2]:

data = pd.read_pickle(path_data_file)

result_lda = pd.read_pickle(name_model_pickle_lda)
result_slda = pd.read_pickle(name_model_pickle_lda)
result_dmr = pd.read_pickle(name_model_pickle_lda)

voca = v.Vocabulary()
docs = voca.read_corpus( data['text'] )


### LDA OUTPUT

In [3]:
print "--- LDA OUTPUT"

print "Alpha", result_lda.alpha
print "Beta", result_lda.beta
print "Docs with words", len(result_lda.docs)
print len(result_lda.topicdist()[0]), "topics per doc", len(result_lda.topicdist())
print len(result_lda.worddist()[0]), "word-assignment per topic", len(result_lda.worddist())
print "Voc size", voca.size()

--- LDA OUTPUT
Alpha 0.1
Beta 0.01
Docs with words 7993
10 topics per doc 7993
34174 word-assignment per topic 10
Voc size 37929


#### Example of document training

In [4]:

example = np.random.randint(0,len(result_lda.docs))

print "--- S T A R T   E X A M P L E"
print "Text\n",data.text[example], "\n"
print len(result_lda.docs[example]), "' Words\n", result_lda.docs[example], "\n"
print len(result_lda.topicdist()[example]), "' Topic probabilities\n", result_lda.topicdist()[example], "\n"

print "Topic-assignment sorted:\n"
dtype_list = [('weight', float), ('topic', float)] 
topic_assign = np.array( zip(result_lda.topicdist()[example],range(0,len(result_lda.topicdist()[example]))), 
                        dtype=dtype_list)
topic_assign = np.sort(topic_assign,order='weight')[::-1]
print topic_assign
print "---- E N D   E X A M P L E"


--- S T A R T   E X A M P L E
Text
[u'3', u'5', u'stars', u'nice', u'bar', u'flew', u'met', u'friends', u'night', u'since', u'getting', u'married', u'next', u'day', u'probably', u'20', u'us', u'dressed', u'looking', u'like', u'got', u'plane', u'cool', u'setup', u'water', u'features', u'place', u'fiber', u'optics', u'service', u'great', u'glasses', u'wine', u'know', u'beer', u'guy', u'relaxed', u'hung', u'midnight', u'service', u'kept', u'us', u'admit', u'cool', u'setup', u'one', u'thing', u'since', u'pre', u'wedding', u'sort', u'party', u'idea', u'prices', u'like', u'fun', u'laid', u'back', u'time', u'friends', u'good', u'place'] 

31 ' Words
[1695, 229, 11329, 11813, 2437, 127, 107, 2673, 2384, 99, 2065, 54, 248, 119, 230, 1177, 390, 665, 1164, 194, 823, 34, 1251, 119, 479, 99, 21, 40, 386, 3305, 12202] 

10 ' Topic probabilities
[ 0.003125  0.003125  0.503125  0.003125  0.003125  0.065625  0.065625
  0.253125  0.096875  0.003125] 

Topic-assignment sorted:

[(0.503125, 2.0) (0.253125

In [5]:

def compute_distances(dataset, distance_measures, n, column = 'index', name=0 ):
    '''
    dataset -- weights of words (same length of columns in each cluster)
    
    name -- name of cluster (number assignment)
    
    distance_measures -- list of [ ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, 
    ‘euclidean’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, 
    ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’ ]
    
    n -- number of most similar that we want to get
    
    Example: compute_distance('id_example', ['euclidean'], 6)
    
    '''
    distances = pd.DataFrame()
    from scipy.spatial import distance
    from scipy.spatial.distance import pdist, squareform
    
    ## Find the location (row) - topic - we are looking for
    ## (include "name in parameters if we want two particuar rows of a dataframe)
    if column == 'index':
        id_location = np.where(dataset.index == name)[0][0]
    else:
        id_location = np.where(dataset[column] == name)[0][0]

    # Go through all distance measures we care about
    print n, "' Clusters that are closer to topic =", name
    print "Format: (cluster number, similarity measure)"
    
    for distance_measure in distance_measures:
    
        # Find all pairwise distances
        current_distances = distance.squareform(distance.pdist(dataset, distance_measure))
        # Get the closest n elements for the whiskey we care about
        most_similar = np.argsort(current_distances[:, id_location])[1:n+1]
        # Append results (a new column to the dataframe with the name of the measure)
        distances[distance_measure] = list(zip(data.index[most_similar], current_distances[most_similar, id_location]))
        
    return distances


#### Similarity and spearman different words

In [8]:

topwords = result_lda.word_dist_with_voca(voca, topk=num_words_to_compare)

#for i in topwords: print i, "TOP WORDS", topwords[i],"\n"
        
columns_names = ['W_lda_'+str(x) for x in range(num_words_to_compare)]
columns_names.insert(0,'Cluster_lda')

weights_topwords = pd.DataFrame(columns=columns_names).T
weight_words_topwords = pd.DataFrame(columns=columns_names).T
dataframe_topwords = pd.DataFrame(columns=columns_names).T

import operator

for i in range(result_lda.K):
    weight_list_i = list(np.sort(topwords[i].values())[::-1])
    weight_list_i.insert(0,i)
    weights_topwords[i] = weight_list_i
    word_list_i = list(np.sort(topwords[i].keys()))
    word_list_i.insert(0,i)
    dataframe_topwords[i] = word_list_i
    word_weights_i = list([ x[0] for x in sorted(topwords[i].items(), key=operator.itemgetter(1),reverse=1) ])
    word_weights_i.insert(0,i)
    weight_words_topwords[i] = word_weights_i
    
weights_topwords = weights_topwords.T
dataframe_topwords = dataframe_topwords.T
weight_words_topwords = weight_words_topwords.T


In [9]:

#### This dataset CALLED "WEIGHT_TOPWORDS" needs to have weights of words on diff clusters 
#### We want to compare each of this clusters not with themselves (LDA) BUT with the others
#### EXAMPLE: "WEIGHTS OF LDA CLUSTER-1"   AGAINST   "ALL CLUSTERS  -1 TO 10- OF SLDA/DMR WEIGHTS"
#### IN ORDER TO FIGURE OUT WHO IS THE MOST SIMILAR IN ANOTHER OUTPUT !!!!

compute_distances(weights_topwords, ['euclidean','cosine','correlation'], 5, column='Cluster_lda', name=1)


5 ' Clusters that are closer to topic = 1
Format: (cluster number, similarity measure)


Unnamed: 0,euclidean,cosine,correlation
0,"(0, 1.00004737315)","(2, 0.0001512739302)","(2, 9.75651190301e-05)"
1,"(2, 1.00015544675)","(4, 0.000232582333328)","(4, 0.000104704497876)"
2,"(3, 2.00002143401)","(3, 0.000353984133143)","(3, 0.000119601619214)"
3,"(4, 3.00015258257)","(9, 0.000478700197643)","(9, 0.000145457158564)"
4,"(5, 4.00004882693)","(8, 0.00053398772975)","(8, 0.00016099952919)"


In [10]:

weights_topwords 


Unnamed: 0,Cluster_lda,W_lda_0,W_lda_1,W_lda_2,W_lda_3,W_lda_4,W_lda_5,W_lda_6,W_lda_7,W_lda_8,W_lda_9
0,0.0,0.01805,0.01302,0.010459,0.010278,0.009769,0.0075,0.007318,0.007173,0.007082,0.006465
1,1.0,0.027067,0.009914,0.009862,0.009416,0.008288,0.008105,0.007501,0.007265,0.006977,0.006741
2,2.0,0.024266,0.024034,0.019381,0.011874,0.010038,0.009539,0.008077,0.007809,0.007738,0.007595
3,3.0,0.018924,0.013867,0.010043,0.008557,0.00767,0.007383,0.006676,0.006568,0.0064,0.005945
4,4.0,0.035467,0.023563,0.023316,0.019622,0.018062,0.017734,0.01642,0.013629,0.012972,0.012151
5,5.0,0.008704,0.007675,0.007174,0.006805,0.005935,0.005328,0.005328,0.005012,0.004484,0.004484
6,6.0,0.011822,0.01001,0.009786,0.007825,0.007769,0.007284,0.007265,0.007209,0.006107,0.006051
7,7.0,0.010069,0.009685,0.009398,0.009075,0.007802,0.007497,0.006896,0.006878,0.006347,0.006303
8,8.0,0.01885,0.013057,0.012547,0.011196,0.009245,0.007954,0.007744,0.007384,0.006874,0.006514
9,9.0,0.032193,0.025738,0.018239,0.016525,0.01224,0.011892,0.01066,0.00999,0.009937,0.00991


In [11]:

### TOP WORDS (RANKING WEIGHTS ORDER)
weight_words_topwords


Unnamed: 0,Cluster_lda,W_lda_0,W_lda_1,W_lda_2,W_lda_3,W_lda_4,W_lda_5,W_lda_6,W_lda_7,W_lda_8,W_lda_9
0,0,recommended,however,chair,setting,couches,months,men,fen,great,needed
1,1,text,told,chair,stevenonmill,retro,great,recommended,def,finding,perfect
2,2,batter,recommended,couches,great,service,back,months,wait,ignored,needed
3,3,batter,couches,brecksville,cut,chair,steamed,mercedes,sever,recommended,brews
4,4,gab,shops,massieve,breakfast,realize,juicy,provided,humble,shawarma,dinning
5,5,deserves,great,hair,imdb,estate,frazzled,second,def,dashed,wanted
6,6,great,texting,time,frock,mom,recommend,service,would,back,months
7,7,def,would,months,time,specific,noticed,chair,selection,metal,even
8,8,witnessed,hey,chair,recommended,son,group,girl,sashimi,couches,wraps
9,9,somewhat,mouth,25,alfresco,boy,japanese,vowed,rabe,indicating,takes


In [12]:

### TOP WORDS (ALPHABETICAL ORDER)
dataframe_topwords


Unnamed: 0,Cluster_lda,W_lda_0,W_lda_1,W_lda_2,W_lda_3,W_lda_4,W_lda_5,W_lda_6,W_lda_7,W_lda_8,W_lda_9
0,0,chair,couches,fen,great,however,men,months,needed,recommended,setting
1,1,chair,def,finding,great,perfect,recommended,retro,stevenonmill,text,told
2,2,back,batter,couches,great,ignored,months,needed,recommended,service,wait
3,3,batter,brecksville,brews,chair,couches,cut,mercedes,recommended,sever,steamed
4,4,breakfast,dinning,gab,humble,juicy,massieve,provided,realize,shawarma,shops
5,5,dashed,def,deserves,estate,frazzled,great,hair,imdb,second,wanted
6,6,back,frock,great,mom,months,recommend,service,texting,time,would
7,7,chair,def,even,metal,months,noticed,selection,specific,time,would
8,8,chair,couches,girl,group,hey,recommended,sashimi,son,witnessed,wraps
9,9,25,alfresco,boy,indicating,japanese,mouth,rabe,somewhat,takes,vowed
