In [41]:
import numpy as np
import lda
import lda.datasets
# Load the Reuters sample corpus that ships with the `lda` package: the count
# matrix X, the vocabulary and the document titles.
X, vocab, titles = (
    lda.datasets.load_reuters(),
    lda.datasets.load_reuters_vocab(),
    lda.datasets.load_reuters_titles(),
)
# Sanity check: print the matrix shape and its total count next to the number
# of vocabulary entries and titles so the dimensions can be compared by eye.
print(
    X.shape,
    X.sum(),
    "vocab:", len(vocab),
    "titles:", len(titles),
)


(395, 4258) 84010 vocab: 4258 titles: 395


In [42]:
# Fit an LDA topic model on the count matrix X
# (100 topics, 1500 iterations, random_state pinned to 1).
model = lda.LDA(n_topics=100, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works

# Print the highest-weighted vocabulary entries of every topic.
n_top_words = 8
vocab_array = np.array(vocab)  # array form so it can be indexed by an index array
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending: reverse it, then keep the first n_top_words indices.
    best_first = np.argsort(topic_dist)[::-1]
    topic_words = vocab_array[best_first[:n_top_words]]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 100
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1258788
INFO:lda:<10> log likelihood: -734487
INFO:lda:<20> log likelihood: -708068
INFO:lda:<30> log likelihood: -698147
INFO:lda:<40> log likelihood: -691363
INFO:lda:<50> log likelihood: -687491
INFO:lda:<60> log likelihood: -684165
INFO:lda:<70> log likelihood: -682336
INFO:lda:<80> log likelihood: -680400
INFO:lda:<90> log likelihood: -678905
INFO:lda:<100> log likelihood: -676797
INFO:lda:<110> log likelihood: -675647
INFO:lda:<120> log likelihood: -675180
INFO:lda:<130> log likelihood: -674108
INFO:lda:<140> log likelihood: -673157
INFO:lda:<150> log likelihood: -673066
INFO:lda:<160> log likelihood: -672008
INFO:lda:<170> log likelihood: -671173
INFO:lda:<180> log likelihood: -671111
INFO:lda:<190> log likelihood: -670026
INFO:lda:<200> log likelihood: -669592
INFO:lda:<210> log likelihood: -668796
INFO:lda:<220> log likelihood: 

Topic 0: buddhist vietnam cancer national friday local dent ceremony
Topic 1: christmas year state days day time church held
Topic 2: social party major leader leadership beetham credit since
Topic 3: people n't own years world added end right
Topic 4: political union including leader took leading battle soldiers
Topic 5: hwang ramos estrada philippines state manila north korean
Topic 6: million sale went estate estimated auction sotheby former
Topic 7: president ceremony former thursday house honour senate tsongas
Topic 8: wallace including city people crowd procession mayor hundreds
Topic 9: doctors home condition hospital sen prayers tuesday nursing
Topic 10: yeltsin kremlin operation president heart russian chernomyrdin russia
Topic 11: art century exhibition works show artists visitors great
Topic 12: church economic reforms independent labour leaders police law
Topic 13: first star n't singer madonna child appeared father
Topic 14: harriman u.s churchill ambassador paris pamela f

In [10]:
# For each of the first ten documents, show its title together with the index
# of the topic that has the largest weight in that document's topic row.
doc_topic = model.doc_topic_
for i in range(10):
    line = "{} (top topic: {})".format(titles[i], doc_topic[i].argmax())
    print(line)

0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20 (top topic: 66)
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21 (top topic: 83)
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23 (top topic: 79)
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25 (top topic: 29)
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25 (top topic: 79)
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25 (top topic: 79)
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26 (top topic: 79)
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25 (top topic: 79)
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26 (top topic: 79)
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26 (top topic: 29)


In [104]:
import json
# Read the idea records stored under the "@graph" key of the JSON file.
# The encoding is passed explicitly: JSON text is UTF-8, while open() would
# otherwise decode with the platform's locale encoding and could fail or
# garble non-ASCII characters.
# NOTE(review): the variable is called ac1_dict although the file is
# 'ac2-combined-ideas.json' and the value is iterated as a sequence of idea
# records -- the name is kept unchanged in case other cells refer to it.
with open('ac2-combined-ideas.json', 'r', encoding='utf-8') as f:
    ac1_dict = json.load(f)["@graph"]

# One text per idea, in file order; built after the file has been closed.
content_array = [idea["content"] for idea in ac1_dict]


#with open('ac2-abstract-ideas.json', 'r') as f:
#    ac2_dict = json.load(f)["@graph"]
#content_array += list(map(lambda idea: idea["content"], ac2_dict));
#print(content_array)


In [125]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the idea texts into a term-count matrix.
# stop_words="english" drops scikit-learn's built-in English stop words and
# max_df=0.1 ignores terms that occur in more than 10% of the ideas.
vectorizer = CountVectorizer(stop_words="english", max_df=0.1)
print(len(vectorizer.get_stop_words()))
X = vectorizer.fit_transform(content_array)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# The result is converted to a list so `vocab` has the same type either way.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print("nr_ideas:",len(content_array),"vocab:",len(vocab), X.shape)

318
nr_ideas: 419 vocab: 1593 (419, 1593)


In [128]:
# Fit a second LDA model, this time on the idea count matrix X
# (50 topics, 1500 iterations, random_state pinned to 1).
new_model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
new_model.fit(X)  # model.fit_transform(X) is also available
topic_word = new_model.topic_word_  # model.components_ also works

# Print the highest-weighted vocabulary entries of every topic.
n_top_words = 8
vocab_array = np.array(vocab)  # array form so it can be indexed by an index array
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending: reverse it, then keep the first n_top_words indices.
    best_first = np.argsort(topic_dist)[::-1]
    topic_words = vocab_array[best_first[:n_top_words]]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 419
INFO:lda:vocab_size: 1593
INFO:lda:n_words: 3908
INFO:lda:n_topics: 50
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -50485
INFO:lda:<10> log likelihood: -35802
INFO:lda:<20> log likelihood: -35102
INFO:lda:<30> log likelihood: -35001
INFO:lda:<40> log likelihood: -34768
INFO:lda:<50> log likelihood: -34605
INFO:lda:<60> log likelihood: -34724
INFO:lda:<70> log likelihood: -34637
INFO:lda:<80> log likelihood: -34588
INFO:lda:<90> log likelihood: -34559
INFO:lda:<100> log likelihood: -34513
INFO:lda:<110> log likelihood: -34551
INFO:lda:<120> log likelihood: -34830
INFO:lda:<130> log likelihood: -34486
INFO:lda:<140> log likelihood: -34350
INFO:lda:<150> log likelihood: -34345
INFO:lda:<160> log likelihood: -34533
INFO:lda:<170> log likelihood: -34552
INFO:lda:<180> log likelihood: -34442
INFO:lda:<190> log likelihood: -34525
INFO:lda:<200> log likelihood: -34450
INFO:lda:<210> log likelihood: -34581
INFO:lda:<220> log likelihood: -34645
INFO:lda:<230> log

Topic 0: computer place cops say debris speeding keyboard miners
Topic 1: conditions lost able play size shape tennis vehicles
Topic 2: person like make safety language future based living
Topic 3: patterns athletes sports analyze performance coaches analyzing differences
Topic 4: large areas data new life gather water crowd
Topic 5: determine animals patients health enemies condition suspicious sure
Topic 6: track road able far inside attention deaf need
Topic 7: night outside intruder owner application dog right away
Topic 8: certain way example human cameras combat settings experience
Topic 9: track useful behavior conditions crime piece potential active
Topic 10: sensor turn improve change automatically example lights vehicles
Topic 11: map inside tiny cave hand structure infestations oxygen
Topic 12: camera weather control record vehicle traffic bad phone
Topic 13: perfect surgery looking swing determine hand park monitoring
Topic 14: different navigate robot look feedback compari

In [180]:
# For the first ideas (at most ten), print the three largest topic weights
# followed by the idea text and the indices of those three topics.
doc_topic = new_model.doc_topic_
for i in range(min(10, len(doc_topic))):
    # Positions within row i are the topic ids, so the argsort result can be
    # used directly. The earlier np.arange(len(doc_topic)) lookup was sized by
    # the number of documents, not topics, and would raise IndexError whenever
    # there are fewer documents than topics.
    arr = np.argsort(doc_topic[i])[::-1][:3]
    # Take the weights from the same indices so both printed lines line up.
    print(doc_topic[i][arr])
    print("{} (top topic: {})".format(content_array[i], arr))

[0.2625 0.1375 0.0125]
The technology can be used to monitor air craft. (top topic: [15 11 12])
[0.21315789 0.13421053 0.13421053]
the device could be used in a detective manner that is to say that it could be useful to predicting behavioral patterns of say criminal offenders, students, to perform research perhaps by tracking the physical lives of athletes or top achieving businessman or for companies to isolate expected behaviors of their employees and how they anticipate employees to move during a work day to establish fair and true standards (top topic: [36  9 41])
[0.26764706 0.20882353 0.15      ]
An application could be created for pet owners who have pets who have ran away from home. The application could be downloaded right onto a persons phone. the hand sized device could be put into a collar of a pet, or a chip like device under the skin. The system would be able to pin point a pets location to the direct coordinates.  (top topic: [37 47  7])
[0.28181818 0.19090909 0.1       