In [1]:
import gensim
from gensim.corpora import Dictionary
from gensim import corpora, models

In [3]:
docs = [
    ['cat', 'dog', 'possum', 'wolf', 'rat'],  # topic 0
    ['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'],  # topic 0
    ['tuna', 'whale', 'shark', 'salmon', 'stringray'],  # topic 1
    ['tuna', 'shark', 'salmon', 'eel', 'stingray'],  # topic 1
    ['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'],  # topic 2
    ['pidgeon', 'crow', 'raven', 'parrot', 'eagle']   # topic 2
]

In [4]:
dct = Dictionary(docs)
print(' '.join('%s'%(i,) for i in dct.items()))

(0, 'cat') (1, 'dog') (2, 'possum') (3, 'rat') (4, 'wolf') (5, 'fox') (6, 'rabbit') (7, 'salmon') (8, 'shark') (9, 'stringray') (10, 'tuna') (11, 'whale') (12, 'eel') (13, 'stingray') (14, 'crow') (15, 'hawk') (16, 'parrot') (17, 'pidgeon') (18, 'sparrow') (19, 'eagle') (20, 'raven')


In [5]:
docs_bow = [dct.doc2bow(doc) for doc in docs]
docs_bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(7, 1), (8, 1), (10, 1), (12, 1), (13, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(14, 1), (16, 1), (17, 1), (19, 1), (20, 1)]]

In [6]:
lda_model = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct)

In [13]:
# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.05577975), (1, 0.88824743), (2, 0.05597283)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.04783169), (1, 0.90415514), (2, 0.048013188)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.05868661), (1, 0.05589025), (2, 0.8854231)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.8647225), (1, 0.05618122), (2, 0.07909632)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.8878725), (1, 0.055951286), (2, 0.05617614)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.8878262), (1, 0.055959612), (2, 0.056214225)]


In [7]:
lda_model.print_topics()

[(0,
  '0.105*"parrot" + 0.104*"crow" + 0.104*"pidgeon" + 0.061*"salmon" + 0.061*"shark" + 0.060*"tuna" + 0.060*"hawk" + 0.060*"stingray" + 0.060*"sparrow" + 0.060*"eel"'),
 (1,
  '0.127*"cat" + 0.127*"possum" + 0.127*"wolf" + 0.126*"rat" + 0.072*"rabbit" + 0.072*"dog" + 0.072*"fox" + 0.021*"tuna" + 0.020*"salmon" + 0.020*"shark"'),
 (2,
  '0.108*"tuna" + 0.108*"shark" + 0.107*"salmon" + 0.107*"whale" + 0.107*"stringray" + 0.030*"crow" + 0.030*"pidgeon" + 0.030*"rat" + 0.030*"raven" + 0.029*"wolf"')]

In [14]:
# Try TFIDF
tfidf = models.TfidfModel(docs_bow)
docs_tfidf = tfidf[docs_bow]
tfidf_model = gensim.models.LdaMulticore(docs_tfidf, num_topics=3, id2word=dct)

In [15]:
# Finds 'birds' (topic 0) and 'animals' (topic 1), but wrongly assigns one of the 'sea creatures' doc to 'animals'.
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', tfidf_model[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.056555387), (1, 0.88662046), (2, 0.056824137)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.04854632), (1, 0.9026757), (2, 0.048777964)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.057446647), (1, 0.87425447), (2, 0.06829886)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.056884717), (1, 0.060229216), (2, 0.8828861)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.8869137), (1, 0.056311637), (2, 0.05677466)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.8869467), (1, 0.05627355), (2, 0.056779798)]


In [11]:
tfidf_model.print_topics()  # Looks better

[(0,
  '0.073*"shark" + 0.072*"tuna" + 0.072*"salmon" + 0.065*"stingray" + 0.065*"raven" + 0.064*"stringray" + 0.064*"eagle" + 0.064*"eel" + 0.063*"whale" + 0.051*"crow"'),
 (1,
  '0.097*"dog" + 0.075*"cat" + 0.074*"wolf" + 0.074*"possum" + 0.073*"rat" + 0.040*"salmon" + 0.039*"tuna" + 0.039*"parrot" + 0.038*"crow" + 0.038*"whale"'),
 (2,
  '0.075*"sparrow" + 0.072*"fox" + 0.072*"hawk" + 0.070*"rabbit" + 0.058*"parrot" + 0.058*"pidgeon" + 0.057*"crow" + 0.056*"cat" + 0.056*"rat" + 0.056*"wolf"')]