In [1]:
import numpy as np
import scipy
from scipy.stats import dirichlet, multinomial
import pandas as pd

In [2]:
df_data = pd.read_csv('uci-news-aggregator.csv')
df_data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
raw_documents = df_data['TITLE'].astype(str).apply(lambda x: x.lower()).array
raw_documents.shape, raw_documents[0:10]

((422419,), <PandasArray>
 [    'fed official says weak data caused by weather, should not slow taper',
        "fed's charles plosser sees high bar for change in pace of tapering",
     'us open: stocks fall after fed official hints at accelerated tapering',
                "fed risks falling 'behind the curve', charles plosser says",
                        "fed's plosser: nasty weather has curbed job growth",
                         'plosser: fed may have to accelerate tapering pace',
                                 "fed's plosser: taper pace may be too slow",
  "fed's plosser expects us unemployment to fall to 6.2% by the end of 2014",
    'us jobs growth last month hit by weather:fed president charles plosser',
              'ecb unlikely to end sterilisation of smp purchases - traders']
 Length: 10, dtype: object)

In [4]:
#Get first 1000 documents
raw_documents = raw_documents[:1000]

In [5]:
docs = [d.split() for d in raw_documents]
vocab = list(set(' '.join(raw_documents).split()))
vocab[:10],len(vocab)

(['hampshire',
  'brokerage',
  'up,',
  'escalate',
  'sentiment',
  'anniversary',
  'manager',
  'bought',
  'blog',
  'harlem'],
 2205)

In [6]:
#create word ids
mapped_docs = []
for doc in docs:
    new_doc = []
    vectorized_doc = doc 
    for i in range(len(doc)):
        vectorized_doc[i] = vocab.index(doc[i])
    mapped_docs.append(vectorized_doc)
len(mapped_docs), mapped_docs[:5]

(1000,
 [[1983, 1654, 1044, 1254, 1288, 861, 1346, 1000, 1248, 1975, 1918, 1200],
  [35, 57, 1080, 1595, 697, 139, 186, 1513, 38, 511, 2093, 2033],
  [95, 2091, 734, 188, 267, 1983, 1654, 1228, 2098, 1210, 2033],
  [1983, 491, 1973, 988, 2005, 2124, 57, 1080, 1044],
  [35, 1625, 1583, 553, 2137, 1623, 1915, 170]])

In [7]:
True in [(2204 in d) for d in mapped_docs]

True

In [8]:
len(vocab), len(mapped_docs)

(2205, 1000)

In [10]:
#Number of topics
K = 10

#topic-word matrix
tw_matrix = np.zeros((K,len(vocab)))

#topic assignment list
ta_list = [np.zeros((1,len(d)))[0] for d in docs]

#document-topic matrix
dt_matrix = np.zeros((len(docs),K))


In [11]:
#Randomly intitialize
np.random.seed(47)

for d in range(len(docs)):
    for w in range(len(mapped_docs[d])):
        #print(f'{d},{w}')
        ta_list[d][w] = np.random.randint(0,K)
        
        ti = int(ta_list[d][w])
        wi = int(mapped_docs[d][w])
        #print(f'{ti},{wi}')
        tw_matrix[ti, wi] = tw_matrix[ti][wi] + 1
    
    for t in range(K):
        #Number of words in document d with topic assignment t
        dt_matrix[d, t] = np.where(ta_list[d] == t)[0].shape[0] 

In [12]:
pd.DataFrame(tw_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2195,2196,2197,2198,2199,2200,2201,2202,2203,2204
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,...,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
pd.DataFrame(dt_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,5.0,2.0,0.0
1,0.0,2.0,3.0,0.0,1.0,1.0,0.0,2.0,1.0,2.0
2,2.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,0.0,2.0
3,0.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,3.0
4,0.0,1.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0
5,2.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
6,1.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,2.0
7,1.0,0.0,3.0,0.0,3.0,1.0,1.0,1.0,3.0,1.0
8,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,2.0
9,1.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,3.0,1.0


In [14]:
#Model paramters
alpha = 1
eta = 1
num_iterations = 35

In [15]:
ta_list[0],np.where(ta_list[0] == 0)[0].shape[0]

(array([7., 6., 7., 8., 8., 3., 0., 7., 0., 7., 7., 1.]), 2)

In [16]:
#calculating P(z_i|*)


#for every word in every document
for iteration in range(num_iterations):
    for d_i in range(len(mapped_docs)):
        for w_i in range(len(mapped_docs[d_i])):
            
            init_topic = int(ta_list[d_i][w_i])
            word_id = mapped_docs[d_i][w_i]


            #z_-i term,
            dt_matrix[d_i, init_topic] = max(dt_matrix[d_i, init_topic] - 1,0)
            tw_matrix[init_topic, word_id] = max(tw_matrix[init_topic, word_id] - 1,0)
            
            #if dt_matrix[d_i, init_topic] < 0 or tw_matrix[init_topic, word_id] < 0:
            #    print(f'error(dt,tw): {dt_matrix[d_i, init_topic]}, {tw_matrix[init_topic, word_id]}')

            #word topic means
            wt_means = (tw_matrix[:, word_id] + eta) / (tw_matrix.sum(axis=1) + len(vocab)*eta)
            dt_means = (dt_matrix[d_i,:]+alpha) / (dt_matrix[d_i,:].sum() + K*alpha )

            # P(z_i|.) = phi_k * theta_d 
            probs = wt_means*dt_means
            if d_i == 0:
                print(f'(document {d_i}, word {w_i}) \n word-topic:{wt_means}\n doc-topic{dt_means}\n word-prob{probs}')
            #Normalize, necessary due to rounding errors
            probs = probs/probs.sum()
            
            #Multinomial draws
            new_topic = int(np.random.choice(range(K),p=probs))
            dt_matrix[d_i, new_topic] = dt_matrix[d_i, new_topic] + 1
            tw_matrix[new_topic, w_i] = tw_matrix[new_topic, w_i] + 1
            ta_list[d_i][w_i] = new_topic
            




(document 0, word 0) 
 word-topic:[0.00032404 0.000666   0.00065232 0.0003269  0.00032531 0.00032457
 0.0003201  0.00032468 0.00032637 0.00064914]
 doc-topic[0.14285714 0.0952381  0.04761905 0.0952381  0.04761905 0.04761905
 0.0952381  0.23809524 0.14285714 0.04761905]
 word-prob[4.62920100e-05 6.34286349e-05 3.10626534e-05 3.11337350e-05
 1.54909068e-05 1.54557117e-05 3.04859460e-05 7.73036487e-05
 4.66243939e-05 3.09114233e-05]
(document 0, word 1) 
 word-topic:[0.00032404 0.000333   0.00032616 0.0003268  0.00065062 0.00032457
 0.0003202  0.00032468 0.00032637 0.00032457]
 doc-topic[0.14285714 0.0952381  0.04761905 0.14285714 0.04761905 0.04761905
 0.04761905 0.23809524 0.14285714 0.04761905]
 word-prob[4.62920100e-05 3.17143174e-05 1.55313267e-05 4.66853408e-05
 3.09818137e-05 1.54557117e-05 1.52478539e-05 7.73036487e-05
 4.66243939e-05 1.54557117e-05]
(document 0, word 2) 
 word-topic:[0.00097213 0.00033289 0.00065232 0.00163399 0.00065062 0.00097371
 0.00064041 0.00097434 0.002284

(document 0, word 0) 
 word-topic:[0.00025374 0.00025615 0.00025151 0.00025867 0.00025641 0.00025227
 0.000254   0.00025291 0.00025349 0.00025195]
 doc-topic[0.14285714 0.04761905 0.04761905 0.04761905 0.0952381  0.0952381
 0.0952381  0.23809524 0.0952381  0.0952381 ]
 word-prob[3.62489578e-05 1.21975020e-05 1.19766216e-05 1.23173946e-05
 2.44200244e-05 2.40257556e-05 2.41905246e-05 6.02162969e-05
 2.41414690e-05 2.39954888e-05]
(document 0, word 1) 
 word-topic:[0.00025368 0.00025615 0.00025151 0.00025867 0.00025641 0.00025227
 0.000254   0.00025291 0.00025349 0.00025195]
 doc-topic[0.19047619 0.04761905 0.04761905 0.04761905 0.04761905 0.0952381
 0.0952381  0.23809524 0.0952381  0.0952381 ]
 word-prob[4.83196830e-05 1.21975020e-05 1.19766216e-05 1.23173946e-05
 1.22100122e-05 2.40257556e-05 2.41905246e-05 6.02162969e-05
 2.41414690e-05 2.39954888e-05]
(document 0, word 2) 
 word-topic:[0.00025361 0.00025615 0.00025151 0.00025867 0.00025641 0.00025227
 0.000254   0.00025291 0.00025349

(document 0, word 0) 
 word-topic:[0.00017528 0.00017637 0.00017721 0.00017844 0.00017889 0.00017596
 0.0001764  0.00017519 0.00017712 0.00017507]
 doc-topic[0.14285714 0.04761905 0.04761905 0.0952381  0.23809524 0.0952381
 0.04761905 0.14285714 0.04761905 0.0952381 ]
 word-prob[2.50406911e-05 8.39842110e-06 8.43860493e-06 1.69946637e-05
 4.25930658e-05 1.67584190e-05 8.39990256e-06 2.50275303e-05
 8.43412108e-06 1.66733360e-05]
(document 0, word 1) 
 word-topic:[0.00017528 0.00017637 0.00017718 0.00017844 0.00017889 0.00017596
 0.0001764  0.00017519 0.00017712 0.00017507]
 doc-topic[0.14285714 0.04761905 0.0952381  0.0952381  0.23809524 0.0952381
 0.04761905 0.0952381  0.04761905 0.0952381 ]
 word-prob[2.50406911e-05 8.39842110e-06 1.68742196e-05 1.69946637e-05
 4.25930658e-05 1.67584190e-05 8.39990256e-06 1.66850202e-05
 8.43412108e-06 1.66733360e-05]
(document 0, word 2) 
 word-topic:[0.00017528 0.00017637 0.00017718 0.00017844 0.00017886 0.00017596
 0.0001764  0.00017519 0.00017712

(document 0, word 0) 
 word-topic:[0.00013367 0.00013574 0.00013508 0.00013682 0.000136   0.00013401
 0.0001357  0.00013563 0.00013701 0.00013355]
 doc-topic[0.14285714 0.04761905 0.0952381  0.14285714 0.0952381  0.0952381
 0.0952381  0.04761905 0.0952381  0.14285714]
 word-prob[1.90959956e-05 6.46383163e-06 1.28647974e-05 1.95453746e-05
 1.29522773e-05 1.27630790e-05 1.29241546e-05 6.45857149e-06
 1.30481018e-05 1.90781441e-05]
(document 0, word 1) 
 word-topic:[0.00013367 0.00013574 0.00013508 0.0001368  0.000136   0.00013401
 0.0001357  0.00013563 0.00013701 0.00013355]
 doc-topic[0.14285714 0.04761905 0.0952381  0.19047619 0.0952381  0.0952381
 0.0952381  0.04761905 0.0952381  0.0952381 ]
 word-prob[1.90959956e-05 6.46383163e-06 1.28647974e-05 2.60569344e-05
 1.29522773e-05 1.27630790e-05 1.29241546e-05 6.45857149e-06
 1.30481018e-05 1.27187627e-05]
(document 0, word 2) 
 word-topic:[0.00013367 0.00013574 0.00013508 0.00013678 0.000136   0.00013401
 0.0001357  0.00013563 0.00013701

(document 0, word 0) 
 word-topic:[0.00010839 0.00010972 0.00010883 0.00011096 0.00011007 0.00010915
 0.00010961 0.00011079 0.00010995 0.00010934]
 doc-topic[0.0952381  0.04761905 0.04761905 0.19047619 0.19047619 0.04761905
 0.14285714 0.04761905 0.0952381  0.0952381 ]
 word-prob[1.03227938e-05 5.22482418e-06 5.18217952e-06 2.11358400e-05
 2.09660089e-05 5.19745117e-06 1.56590094e-05 5.27576419e-06
 1.04714783e-05 1.04130872e-05]
(document 0, word 1) 
 word-topic:[0.00010839 0.00010972 0.00010883 0.00011096 0.00011006 0.00010915
 0.00010961 0.00011079 0.00010995 0.00010934]
 doc-topic[0.0952381  0.04761905 0.04761905 0.19047619 0.19047619 0.04761905
 0.14285714 0.04761905 0.0952381  0.0952381 ]
 word-prob[1.03227938e-05 5.22482418e-06 5.18217952e-06 2.11358400e-05
 2.09637014e-05 5.19745117e-06 1.56590094e-05 5.27576419e-06
 1.04714783e-05 1.04130872e-05]
(document 0, word 2) 
 word-topic:[0.00010839 0.00010971 0.00010883 0.00011096 0.00011006 0.00010915
 0.00010961 0.00011079 0.000109

(document 0, word 0) 
 word-topic:[9.15080527e-05 9.20047842e-05 9.12575287e-05 9.27557740e-05
 9.28160386e-05 9.18779860e-05 9.27385700e-05 9.32487878e-05
 9.22339052e-05 9.16674306e-05]
 doc-topic[0.04761905 0.14285714 0.04761905 0.33333333 0.0952381  0.04761905
 0.0952381  0.04761905 0.04761905 0.0952381 ]
 word-prob[4.35752632e-06 1.31435406e-05 4.34559661e-06 3.09185913e-05
 8.83962272e-06 4.37514219e-06 8.83224476e-06 4.44041847e-06
 4.39209072e-06 8.73023148e-06]
(document 0, word 1) 
 word-topic:[9.15080527e-05 9.20047842e-05 9.12575287e-05 9.27557740e-05
 9.28074246e-05 9.18779860e-05 9.27385700e-05 9.32487878e-05
 9.22339052e-05 9.16674306e-05]
 doc-topic[0.04761905 0.14285714 0.04761905 0.28571429 0.14285714 0.04761905
 0.0952381  0.04761905 0.04761905 0.0952381 ]
 word-prob[4.35752632e-06 1.31435406e-05 4.34559661e-06 2.65016497e-05
 1.32582035e-05 4.37514219e-06 8.83224476e-06 4.44041847e-06
 4.39209072e-06 8.73023148e-06]
(document 0, word 2) 
 word-topic:[9.15080527e-05 

(document 0, word 0) 
 word-topic:[7.89390590e-05 7.92581438e-05 7.86596397e-05 8.02632635e-05
 8.01410482e-05 7.93461874e-05 8.00000000e-05 8.00576415e-05
 7.95798186e-05 7.91702953e-05]
 doc-topic[0.23809524 0.14285714 0.04761905 0.0952381  0.04761905 0.04761905
 0.04761905 0.14285714 0.14285714 0.04761905]
 word-prob[1.87950141e-05 1.13225920e-05 3.74569713e-06 7.64412033e-06
 3.81624039e-06 3.77838988e-06 3.80952381e-06 1.14368059e-05
 1.13685455e-05 3.77001406e-06]
(document 0, word 1) 
 word-topic:[7.89390590e-05 7.92581438e-05 7.86596397e-05 8.02568218e-05
 8.01410482e-05 7.93461874e-05 8.00000000e-05 8.00576415e-05
 7.95798186e-05 7.91702953e-05]
 doc-topic[0.23809524 0.14285714 0.04761905 0.14285714 0.04761905 0.04761905
 0.04761905 0.0952381  0.14285714 0.04761905]
 word-prob[1.87950141e-05 1.13225920e-05 3.74569713e-06 1.14652603e-05
 3.81624039e-06 3.77838988e-06 3.80952381e-06 7.62453729e-06
 1.13685455e-05 3.77001406e-06]
(document 0, word 2) 
 word-topic:[7.89390590e-05 

(document 0, word 0) 
 word-topic:[6.94348007e-05 7.01606679e-05 6.96281855e-05 7.04076604e-05
 7.01065620e-05 6.97447343e-05 7.02345835e-05 7.00869078e-05
 7.01409834e-05 6.93721818e-05]
 doc-topic[0.0952381  0.0952381  0.0952381  0.0952381  0.04761905 0.19047619
 0.14285714 0.14285714 0.04761905 0.04761905]
 word-prob[6.61283816e-06 6.68196837e-06 6.63125576e-06 6.70549146e-06
 3.33840771e-06 1.32847113e-05 1.00335119e-05 1.00124154e-05
 3.34004683e-06 3.30343723e-06]
(document 0, word 1) 
 word-topic:[6.94348007e-05 7.01606679e-05 6.96281855e-05 7.04076604e-05
 7.01065620e-05 6.97447343e-05 7.02345835e-05 7.00819959e-05
 7.01409834e-05 6.93721818e-05]
 doc-topic[0.0952381  0.04761905 0.0952381  0.0952381  0.04761905 0.19047619
 0.14285714 0.19047619 0.04761905 0.04761905]
 word-prob[6.61283816e-06 3.34098419e-06 6.63125576e-06 6.70549146e-06
 3.33840771e-06 1.32847113e-05 1.00335119e-05 1.33489516e-05
 3.34004683e-06 3.30343723e-06]
(document 0, word 2) 
 word-topic:[6.94348007e-05 

(document 0, word 0) 
 word-topic:[6.18467438e-05 6.26370185e-05 6.25664769e-05 6.25938908e-05
 6.24843789e-05 6.20078130e-05 6.24921885e-05 6.28180162e-05
 6.24141805e-05 6.20848078e-05]
 doc-topic[0.28571429 0.04761905 0.04761905 0.0952381  0.19047619 0.0952381
 0.0952381  0.04761905 0.04761905 0.04761905]
 word-prob[1.76704982e-05 2.98271517e-06 2.97935604e-06 5.96132294e-06
 1.19017865e-05 5.90550600e-06 5.95163700e-06 2.99133411e-06
 2.97210383e-06 2.95641942e-06]
(document 0, word 1) 
 word-topic:[6.18467438e-05 6.26370185e-05 6.25664769e-05 6.25938908e-05
 6.24843789e-05 6.20078130e-05 6.24921885e-05 6.28180162e-05
 6.24141805e-05 6.20809536e-05]
 doc-topic[0.23809524 0.04761905 0.04761905 0.0952381  0.19047619 0.0952381
 0.0952381  0.04761905 0.04761905 0.0952381 ]
 word-prob[1.47254152e-05 2.98271517e-06 2.97935604e-06 5.96132294e-06
 1.19017865e-05 5.90550600e-06 5.95163700e-06 2.99133411e-06
 2.97210383e-06 5.91247177e-06]
(document 0, word 2) 
 word-topic:[6.18467438e-05 6.

(document 0, word 0) 
 word-topic:[5.57320403e-05 5.65514901e-05 5.63063063e-05 5.63443768e-05
 5.64461504e-05 5.63189908e-05 5.64588979e-05 5.67569102e-05
 5.65291125e-05 5.58004576e-05]
 doc-topic[0.0952381  0.0952381  0.04761905 0.0952381  0.19047619 0.04761905
 0.04761905 0.14285714 0.0952381  0.14285714]
 word-prob[5.30781337e-06 5.38585620e-06 2.68125268e-06 5.36613113e-06
 1.07516477e-05 2.68185670e-06 2.68851895e-06 8.10813002e-06
 5.38372500e-06 7.97149394e-06]
(document 0, word 1) 
 word-topic:[5.57289345e-05 5.65514901e-05 5.63063063e-05 5.63443768e-05
 5.64461504e-05 5.63189908e-05 5.64588979e-05 5.67569102e-05
 5.65291125e-05 5.58004576e-05]
 doc-topic[0.14285714 0.0952381  0.04761905 0.0952381  0.19047619 0.04761905
 0.04761905 0.14285714 0.04761905 0.14285714]
 word-prob[7.96127635e-06 5.38585620e-06 2.68125268e-06 5.36613113e-06
 1.07516477e-05 2.68185670e-06 2.68851895e-06 8.10813002e-06
 2.69186250e-06 7.97149394e-06]
(document 0, word 2) 
 word-topic:[5.57289345e-05 

(document 0, word 0) 
 word-topic:[5.05050505e-05 5.15410782e-05 5.14482688e-05 5.10673067e-05
 5.14482688e-05 5.16502247e-05 5.15517064e-05 5.16049128e-05
 5.17651931e-05 5.07382414e-05]
 doc-topic[0.0952381  0.0952381  0.0952381  0.19047619 0.04761905 0.14285714
 0.04761905 0.19047619 0.04761905 0.04761905]
 word-prob[4.81000481e-06 4.90867412e-06 4.89983512e-06 9.72710604e-06
 2.44991756e-06 7.37860353e-06 2.45484316e-06 9.82950720e-06
 2.46500919e-06 2.41610673e-06]
(document 0, word 1) 
 word-topic:[5.05050505e-05 5.15410782e-05 5.14482688e-05 5.10673067e-05
 5.14482688e-05 5.16475571e-05 5.15517064e-05 5.16049128e-05
 5.17651931e-05 5.07382414e-05]
 doc-topic[0.0952381  0.0952381  0.0952381  0.14285714 0.04761905 0.19047619
 0.04761905 0.19047619 0.04761905 0.04761905]
 word-prob[4.81000481e-06 4.90867412e-06 4.89983512e-06 7.29532953e-06
 2.44991756e-06 9.83762992e-06 2.45484316e-06 9.82950720e-06
 2.46500919e-06 2.41610673e-06]
(document 0, word 2) 
 word-topic:[5.05050505e-05 

(document 0, word 0) 
 word-topic:[4.65051388e-05 4.71787130e-05 4.73507268e-05 4.69968982e-05
 4.69043152e-05 4.75172250e-05 4.73574541e-05 4.73664267e-05
 4.75262583e-05 4.68033324e-05]
 doc-topic[0.0952381  0.0952381  0.0952381  0.0952381  0.0952381  0.04761905
 0.04761905 0.04761905 0.33333333 0.04761905]
 word-prob[4.42906084e-06 4.49321076e-06 4.50959303e-06 4.47589507e-06
 4.46707764e-06 2.26272500e-06 2.25511686e-06 2.25554413e-06
 1.58420861e-05 2.22873011e-06]
(document 0, word 1) 
 word-topic:[4.65051388e-05 4.71787130e-05 4.73507268e-05 4.69968982e-05
 4.69043152e-05 4.75172250e-05 4.73574541e-05 4.73664267e-05
 4.75262583e-05 4.68011419e-05]
 doc-topic[0.0952381  0.0952381  0.0952381  0.0952381  0.0952381  0.04761905
 0.04761905 0.04761905 0.28571429 0.0952381 ]
 word-prob[4.42906084e-06 4.49321076e-06 4.50959303e-06 4.47589507e-06
 4.46707764e-06 2.26272500e-06 2.25511686e-06 2.25554413e-06
 1.35789309e-05 4.45725161e-06]
(document 0, word 2) 
 word-topic:[4.65051388e-05 

(document 0, word 0) 
 word-topic:[4.31499461e-05 4.35009570e-05 4.39155066e-05 4.33557338e-05
 4.35066348e-05 4.39193640e-05 4.37483594e-05 4.37273165e-05
 4.38673451e-05 4.33012904e-05]
 doc-topic[0.0952381  0.23809524 0.14285714 0.0952381  0.04761905 0.0952381
 0.0952381  0.04761905 0.0952381  0.04761905]
 word-prob[4.10951867e-06 1.03573707e-05 6.27364380e-06 4.12911750e-06
 2.07174451e-06 4.18279658e-06 4.16651042e-06 2.08225316e-06
 4.17784240e-06 2.06196621e-06]
(document 0, word 1) 
 word-topic:[4.31499461e-05 4.34990648e-05 4.39155066e-05 4.33557338e-05
 4.35066348e-05 4.39193640e-05 4.37483594e-05 4.37273165e-05
 4.38673451e-05 4.33012904e-05]
 doc-topic[0.0952381  0.28571429 0.0952381  0.0952381  0.04761905 0.0952381
 0.0952381  0.04761905 0.0952381  0.04761905]
 word-prob[4.10951867e-06 1.24283042e-05 4.18242920e-06 4.12911750e-06
 2.07174451e-06 4.18279658e-06 4.16651042e-06 2.08225316e-06
 4.17784240e-06 2.06196621e-06]
(document 0, word 2) 
 word-topic:[4.31499461e-05 4.

(document 0, word 0) 
 word-topic:[4.02090873e-05 4.05087904e-05 4.06074880e-05 4.03795679e-05
 4.06206841e-05 4.08413314e-05 4.05975966e-05 4.07747197e-05
 4.06454497e-05 4.02673754e-05]
 doc-topic[0.0952381  0.04761905 0.04761905 0.23809524 0.0952381  0.0952381
 0.04761905 0.04761905 0.23809524 0.04761905]
 word-prob[3.82943688e-06 1.92899002e-06 1.93368991e-06 9.61418284e-06
 3.86863658e-06 3.88965061e-06 1.93321889e-06 1.94165332e-06
 9.67748803e-06 1.91749407e-06]
(document 0, word 1) 
 word-topic:[4.02090873e-05 4.05087904e-05 4.06074880e-05 4.03779375e-05
 4.06206841e-05 4.08413314e-05 4.05975966e-05 4.07747197e-05
 4.06454497e-05 4.02673754e-05]
 doc-topic[0.0952381  0.04761905 0.04761905 0.28571429 0.0952381  0.0952381
 0.04761905 0.04761905 0.19047619 0.04761905]
 word-prob[3.82943688e-06 1.92899002e-06 1.93368991e-06 1.15365536e-05
 3.86863658e-06 3.88965061e-06 1.93321889e-06 1.94165332e-06
 7.74199043e-06 1.91749407e-06]
(document 0, word 2) 
 word-topic:[4.02090873e-05 4.

(document 0, word 0) 
 word-topic:[3.76874953e-05 3.78558449e-05 3.78787879e-05 3.78644453e-05
 3.80952381e-05 3.80865326e-05 3.78214826e-05 3.79924775e-05
 3.79607486e-05 3.76676209e-05]
 doc-topic[0.14285714 0.04761905 0.0952381  0.14285714 0.04761905 0.19047619
 0.0952381  0.04761905 0.04761905 0.14285714]
 word-prob[5.38392790e-06 1.80265928e-06 3.60750361e-06 5.40920647e-06
 1.81405896e-06 7.25457764e-06 3.60204596e-06 1.80916559e-06
 1.80765469e-06 5.38108870e-06]
(document 0, word 1) 
 word-topic:[3.76874953e-05 3.78558449e-05 3.78787879e-05 3.78644453e-05
 3.80952381e-05 3.80865326e-05 3.78214826e-05 3.79924775e-05
 3.79607486e-05 3.76662021e-05]
 doc-topic[0.14285714 0.04761905 0.0952381  0.0952381  0.04761905 0.19047619
 0.0952381  0.04761905 0.04761905 0.19047619]
 word-prob[5.38392790e-06 1.80265928e-06 3.60750361e-06 3.60613765e-06
 1.81405896e-06 7.25457764e-06 3.60204596e-06 1.80916559e-06
 1.80765469e-06 7.17451469e-06]
(document 0, word 2) 
 word-topic:[3.76874953e-05 

(document 0, word 0) 
 word-topic:[3.53531783e-05 3.53969771e-05 3.55897217e-05 3.55783257e-05
 3.57551487e-05 3.57794554e-05 3.55644071e-05 3.56442702e-05
 3.55441814e-05 3.54283285e-05]
 doc-topic[0.14285714 0.04761905 0.04761905 0.19047619 0.14285714 0.23809524
 0.04761905 0.04761905 0.04761905 0.04761905]
 word-prob[5.05045404e-06 1.68557034e-06 1.69474865e-06 6.77682394e-06
 5.10787839e-06 8.51891796e-06 1.69354320e-06 1.69734620e-06
 1.69258007e-06 1.68706326e-06]
(document 0, word 1) 
 word-topic:[3.53531783e-05 3.53969771e-05 3.55897217e-05 3.55783257e-05
 3.57551487e-05 3.57781753e-05 3.55644071e-05 3.56442702e-05
 3.55441814e-05 3.54283285e-05]
 doc-topic[0.14285714 0.04761905 0.04761905 0.19047619 0.14285714 0.23809524
 0.04761905 0.04761905 0.04761905 0.04761905]
 word-prob[5.05045404e-06 1.68557034e-06 1.69474865e-06 6.77682394e-06
 5.10787839e-06 8.51861317e-06 1.69354320e-06 1.69734620e-06
 1.69258007e-06 1.68706326e-06]
(document 0, word 2) 
 word-topic:[3.53531783e-05 

(document 0, word 0) 
 word-topic:[3.33089068e-05 3.33522329e-05 3.34828902e-05 3.35064500e-05
 3.37131684e-05 3.36485077e-05 3.35109413e-05 3.36179654e-05
 3.34851326e-05 3.34246942e-05]
 doc-topic[0.04761905 0.04761905 0.04761905 0.04761905 0.19047619 0.28571429
 0.04761905 0.04761905 0.0952381  0.14285714]
 word-prob[1.58613842e-06 1.58820157e-06 1.59442334e-06 1.59554524e-06
 6.42155588e-06 9.61385934e-06 1.59575911e-06 1.60085550e-06
 3.18906025e-06 4.77495631e-06]
(document 0, word 1) 
 word-topic:[3.33089068e-05 3.33511206e-05 3.34828902e-05 3.35064500e-05
 3.37131684e-05 3.36485077e-05 3.35109413e-05 3.36179654e-05
 3.34851326e-05 3.34246942e-05]
 doc-topic[0.04761905 0.0952381  0.04761905 0.04761905 0.19047619 0.23809524
 0.04761905 0.04761905 0.0952381  0.14285714]
 word-prob[1.58613842e-06 3.17629720e-06 1.59442334e-06 1.59554524e-06
 6.42155588e-06 8.01154945e-06 1.59575911e-06 1.60085550e-06
 3.18906025e-06 4.77495631e-06]
(document 0, word 2) 
 word-topic:[3.33089068e-05 

(document 0, word 0) 
 word-topic:[3.14425858e-05 3.14514861e-05 3.15836018e-05 3.17571215e-05
 3.19366377e-05 3.17833646e-05 3.17762949e-05 3.17662008e-05
 3.16175541e-05 3.16085596e-05]
 doc-topic[0.14285714 0.04761905 0.14285714 0.04761905 0.04761905 0.23809524
 0.04761905 0.19047619 0.04761905 0.04761905]
 word-prob[4.49179798e-06 1.49768981e-06 4.51194311e-06 1.51224388e-06
 1.52079227e-06 7.56746776e-06 1.51315690e-06 6.05070491e-06
 1.50559781e-06 1.50516950e-06]
(document 0, word 1) 
 word-topic:[3.14425858e-05 3.14514861e-05 3.15826043e-05 3.17571215e-05
 3.19366377e-05 3.17833646e-05 3.17762949e-05 3.17662008e-05
 3.16175541e-05 3.16085596e-05]
 doc-topic[0.14285714 0.04761905 0.19047619 0.04761905 0.04761905 0.19047619
 0.04761905 0.19047619 0.04761905 0.04761905]
 word-prob[4.49179798e-06 1.49768981e-06 6.01573415e-06 1.51224388e-06
 1.52079227e-06 6.05397421e-06 1.51315690e-06 6.05070491e-06
 1.50559781e-06 1.50516950e-06]
(document 0, word 2) 
 word-topic:[3.14425858e-05 

In [17]:
df_tw = pd.DataFrame(tw_matrix)
df_tw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2195,2196,2197,2198,2199,2200,2201,2202,2203,2204
0,3563.0,3579.0,3462.0,3414.0,3459.0,3234.0,2778.0,2362.0,1709.0,1211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3558.0,3549.0,3444.0,3508.0,3386.0,3193.0,2861.0,2379.0,1717.0,1259.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3494.0,3453.0,3498.0,3447.0,3407.0,3223.0,2729.0,2277.0,1746.0,1297.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3530.0,3380.0,3498.0,3487.0,3337.0,3190.0,2673.0,2404.0,1699.0,1268.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3335.0,3511.0,3489.0,3369.0,3383.0,3101.0,2826.0,2367.0,1711.0,1225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
word_lists = []
for k in range(K):
    topic_k_words = df_tw.iloc[k, :].array
    #Get top 10 words for topic k
    top_words_ind = np.argpartition(topic_k_words, -10)[-10:]
    top_words = [vocab[v] for v in top_words_ind]
    word_lists.append(top_words)

In [19]:
df_tw.iloc[:, 2203].array, vocab[2203]

(<PandasArray>
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 Length: 10, dtype: float64, 'my')

In [20]:
word_lists

[['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'escalate',
  'sentiment',
  'up,',
  'hampshire',
  'brokerage'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'sentiment',
  'up,',
  'escalate',
  'brokerage',
  'hampshire'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'sentiment',
  'escalate',
  'brokerage',
  'up,',
  'hampshire'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'sentiment',
  'brokerage',
  'escalate',
  'up,',
  'hampshire'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'hampshire',
  'sentiment',
  'escalate',
  'up,',
  'brokerage'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'sentiment',
  'hampshire',
  'escalate',
  'up,',
  'brokerage'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'sentiment',
  'escalate',
  'up,',
  'brokerage',
  'hampshire'],
 ['harlem',
  'blog',
  'bought',
  'manager',
  'anniversary',
  'se