In [3]:
import numpy as np
import scipy
from scipy.stats import dirichlet, multinomial
import pandas as pd
import traceback
from matplotlib import pyplot as plt
import re
from nltk.corpus import stopwords

In [4]:
#!jt -t gruvboxd
#!jt -t gruvboxd -T -N
stops = stopwords.words('english')

In [5]:
df_data = pd.read_csv('uci-news-aggregator.csv')
df_data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [6]:
def preprocess(token: str):
    '''
    Function that formats and strips each word of junk characters and removes stopwords
    '''
    return re.sub(r'[^a-zA-Z\s-]','',token.lower())

In [7]:
df_data.iloc[:10000,1].astype(str)[:10]

0    Fed official says weak data caused by weather,...
1    Fed's Charles Plosser sees high bar for change...
2    US open: Stocks fall after Fed official hints ...
3    Fed risks falling 'behind the curve', Charles ...
4    Fed's Plosser: Nasty Weather Has Curbed Job Gr...
5    Plosser: Fed May Have to Accelerate Tapering Pace
6            Fed's Plosser: Taper pace may be too slow
7    Fed's Plosser expects US unemployment to fall ...
8    US jobs growth last month hit by weather:Fed P...
9    ECB unlikely to end sterilisation of SMP purch...
Name: TITLE, dtype: object

In [9]:
raw_documents = df_data.iloc[:20000,1].astype(str).apply(lambda x: preprocess(x)).to_numpy()
stops = set(stopwords.words('english'))
for r in range(len(raw_documents)):
        words = raw_documents[r].split(' ')
        words = [w for w in words if w not in stops]
        raw_documents[r] = ' '.join(words)
raw_documents[0:10]

array(['fed official says weak data caused weather slow taper',
       'feds charles plosser sees high bar change pace tapering',
       'us open stocks fall fed official hints accelerated tapering',
       'fed risks falling behind curve charles plosser says',
       'feds plosser nasty weather curbed job growth',
       'plosser fed may accelerate tapering pace',
       'feds plosser taper pace may slow',
       'feds plosser expects us unemployment fall  end ',
       'us jobs growth last month hit weatherfed president charles plosser',
       'ecb unlikely end sterilisation smp purchases - traders'],
      dtype=object)

In [10]:
#Get first 10K documents
#raw_documents = raw_documents[:10000]
#raw_documents
raw_documents.shape

(20000,)

In [11]:
docs = [d.split() for d in raw_documents]
vocab = list(set(' '.join(raw_documents).split()))
vocab[:10],len(vocab)

(['diplomacy',
  'replacing',
  'sizing',
  'refer',
  'mp',
  'empowerment',
  'wake',
  'fidelity',
  'brake',
  'waive'],
 12941)

In [12]:
#create word ids
mapped_docs = []
longest_doc_length = 0
for doc in docs:
    new_doc = []
    vectorized_doc = doc
    doc_len = len(doc)
    for i in range(doc_len):
        vectorized_doc[i] = vocab.index(doc[i])
    longest_doc_length = max(longest_doc_length, len(vectorized_doc))
    mapped_docs.append(vectorized_doc)
len(mapped_docs), mapped_docs[:5]

(20000,
 [[5241, 2267, 8862, 5964, 5615, 1895, 3357, 3260, 6032],
  [4283, 9709, 3700, 6959, 8967, 1120, 7120, 2123, 3104],
  [2732, 10914, 6233, 8244, 5241, 2267, 3308, 4404, 3104],
  [5241, 9930, 5094, 9278, 3788, 9709, 3700, 8862],
  [4283, 3700, 7740, 3357, 4446, 1533, 5313]])

In [13]:
longest_doc_length

15

In [14]:
#Number of topics
K = 50
num_iterations = 50
#topic-word matrix
tw_matrix = np.zeros((K,len(vocab)))

#topic assignment history

assignments = np.zeros((len(mapped_docs), longest_doc_length, num_iterations+1 ), dtype=int)


#document-topic matrix
dt_matrix = np.zeros((len(docs),K))


In [15]:
#Randomly intitialize
for d in range(len(docs)):
    for w in range(len(mapped_docs[d])):
        ti = np.random.randint(0,K)
        assignments[d,w,0] = ti
        wi = int(mapped_docs[d][w])
        tw_matrix[ti, wi] += 1
        dt_matrix[d,ti] += 1
    

In [13]:
#pd.DataFrame(tw_matrix)

In [14]:
#pd.DataFrame(dt_matrix)

In [16]:
#Model paramters
alpha = 1
eta = 1

In [17]:
#calculating P(z_i|*)
for iteration in range(num_iterations):    
    print(f'{iteration}/{num_iterations}')
    for d_i in range(len(mapped_docs)):
        for w_i in range(len(mapped_docs[d_i])):
            init_topic = int(assignments[d_i, w_i, iteration])
            #print(init_topic == 0)
            word_id = mapped_docs[d_i][w_i] 
            #z_-i term
            dt_matrix[d_i, init_topic] -= 1
            tw_matrix[init_topic, word_id] -= 1
            #word topic means
            wt_means = (tw_matrix[:, word_id] + eta) / (tw_matrix.sum(axis=1) + len(vocab)*eta)
            dt_means = (dt_matrix[d_i,:]+alpha) / (dt_matrix[d_i,:].sum() + K*alpha )
            probs = wt_means*dt_means
            #Normalize, necessary due to rounding errors
            probs = probs/probs.sum()

            #Multinomial draws
            new_topic = np.argmax(np.random.multinomial(1,probs))
            dt_matrix[d_i, new_topic] += 1
            tw_matrix[new_topic, word_id] += 1
            #update topic assignment list
            assignments[d_i,w_i, iteration+1] = new_topic
            #if new_topic != init_topic:
            #    print(f'{init_topic} -> {new_topic}')


                
                

0/50
1/50
2/50
3/50
4/50
5/50
6/50
7/50
8/50
9/50
10/50
11/50
12/50
13/50
14/50
15/50
16/50
17/50
18/50
19/50
20/50
21/50
22/50
23/50
24/50
25/50
26/50
27/50
28/50
29/50
30/50
31/50
32/50
33/50
34/50
35/50
36/50
37/50
38/50
39/50
40/50
41/50
42/50
43/50
44/50
45/50
46/50
47/50
48/50
49/50


In [19]:
df_tw = pd.DataFrame(tw_matrix)
df_dt = pd.DataFrame(dt_matrix)
df_tw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12931,12932,12933,12934,12935,12936,12937,12938,12939,12940
0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_dt.head()

In [20]:
word_lists = []
for k in range(K):
    topic_k_words = df_tw.iloc[k, :].array
    #Get top 10 words for topic k
    top_words_ind = np.argpartition(topic_k_words, -10)[-10:]
    top_words = [vocab[v] for v in top_words_ind]
    word_lists.append(top_words)

In [33]:
[f'topic {i}:'+', '.join(word_lists[i]) for i in range(len(word_lists))]

['topic 0:billion, noah, new, finale, candy, china, bachelor, gold, mortgage, -',
 'topic 1:gas, chiquita, ukraine, may, video, new, cancer, bieber, sxsw, chinese',
 'topic 2:faces, million, snowden, test, bieber, rates, thrones, justin, true, gm',
 'topic 3:starbucks, bank, selena, google, shows, gm, plane, thrones, sxsw, justin',
 'topic 4:sxsw, gold, titanfall, bitcoin, stocks, snowden, true, gm, test, recall',
 'topic 5:miley, chris, watch, new, first, google, xbox, -, video, season',
 'topic 6:growth, last, one, weak, launch, new, bachelor, bieber, data, gm',
 'topic 7:-, ipo, ban, jos, may, new, us, watch, company, ukraine',
 'topic 8:ukraine, dunham, titanfall, growth, china, bank, season, lena, us, xbox',
 'topic 9:day, live, -, new, china, bitcoin, bieber, trailer, may, us',
 'topic 10:-, company, video, selena, sxsw, bieber, titanfall, justin, gomez, new',
 'topic 11:cyrus, cosmos, back, sxsw, game, neil, makes, clooneys, -, china',
 'topic 12:juan, bankruptcy, stacy, mortgag

In [32]:
tw_matrix[0,:]

array([0., 0., 0., ..., 0., 0., 0.])