In [86]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer

def clean(text):
    """Turn a raw string into a list of normalised tokens.

    The text is lower-cased, stripped and split on whitespace. The resulting
    tokens go through ``remove_stop_words``; every surviving token is then
    passed to ``get_lemma`` and, in a second pass, to ``get_stem``.

    Args:
        text: The raw string to process.

    Returns:
        A list with one processed token per token kept by
        ``remove_stop_words``.
    """
    words = text.lower().strip().split()
    kept = remove_stop_words(words)
    # Two separate passes: lemmatise the whole list first, then stem it.
    lemmas = list(map(get_lemma, kept))
    return list(map(get_stem, lemmas))

def get_stem(w):
    """Return the Porter stem of a single token.

    One ``PorterStemmer`` is created on first use and cached on the function
    object, so stemming many tokens does not construct a fresh stemmer for
    every word.

    Args:
        w: The token to stem.

    Returns:
        Whatever ``PorterStemmer.stem`` returns for ``w``.
    """
    stemmer = getattr(get_stem, '_stemmer', None)
    if stemmer is None:
        # Built lazily so that merely defining the function never touches NLTK.
        stemmer = get_stem._stemmer = PorterStemmer()
    return stemmer.stem(w)
    
def get_lemma(w):
    """Look up the base form of a token through ``wn.morphy``.

    Args:
        w: The token to look up.

    Returns:
        The result of ``wn.morphy(w)``, or ``w`` itself when that result is
        ``None``.
    """
    base_form = wn.morphy(w)
    if base_form is not None:
        return base_form
    # Nothing came back from the lookup; keep the token as it was.
    return w
    
def remove_stop_words(tokens):
    """Drop English stop words from a sequence of tokens.

    Tokens are compared exactly as given (no case folding happens here)
    against the list returned by ``nltk.corpus.stopwords.words('english')``.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).

    Returns:
        A new list holding the tokens that are not stop words, in their
        original order.
    """
    # A set gives constant-time membership tests; scanning the raw list for
    # every token made the filter needlessly quadratic.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]


# Raw book titles used as the toy corpus: programming-related titles first,
# then economics/finance-related ones.
titles = [
    'The Art of Computer Programming',
    'Computer Programming Learn Any Programming Language In 2 Hours',
    'The Self-Taught Programmer The Definitive Guide to Programming Professionally',
    'The Complete Software Developers Career Guide How to Learn Your Next Programming Language',
    'Cracking the Coding Interview 189 Programming Questions and Solutions',
    'The Economics Book Big Ideas Simply Explained',
    'Economics in One Lesson The Shortest and Surest Way to Understand Basic Economics',
    'Basic Economics',
    'Aftermath Seven Secrets of Wealth Preservation in the Coming Chaos',
    'Economics 101 From Consumer Behavior to Competitive Markets Everything You Need to Know About Economics'
]

# The raw strings stay available as `titles`; `texts` only ever holds the
# cleaned token lists, so the name no longer changes meaning within the cell.
texts = [clean(title) for title in titles]

In [87]:
# Show the cleaned token lists, one list per title.
texts

[['art', 'comput', 'program'],
 ['comput', 'program', 'learn', 'program', 'languag', '2', 'hour'],
 ['self-taught', 'programm', 'definit', 'guid', 'program', 'profession'],
 ['complet',
  'softwar',
  'develop',
  'career',
  'guid',
  'learn',
  'next',
  'program',
  'languag'],
 ['crack', 'code', 'interview', '189', 'program', 'question', 'solut'],
 ['econom', 'book', 'big', 'idea', 'simpli', 'explain'],
 ['econom',
  'one',
  'lesson',
  'short',
  'sure',
  'way',
  'understand',
  'basic',
  'econom'],
 ['basic', 'econom'],
 ['aftermath', 'seven', 'secret', 'wealth', 'preserv', 'come', 'chao'],
 ['econom',
  '101',
  'consum',
  'behavior',
  'competit',
  'market',
  'everyth',
  'need',
  'know',
  'econom']]

In [88]:
from gensim import corpora
from gensim.models import LdaModel

# Build the gensim dictionary from the cleaned token lists.
dictionary = corpora.Dictionary(texts)
# Prune the dictionary in place. Only no_below=3 is set here; every other
# filter_extremes argument keeps its library default.
# NOTE(review): presumably this drops tokens seen in fewer than 3 documents —
# confirm against the gensim documentation.
dictionary.filter_extremes(no_below=3)
# Convert each token list to bag-of-words form using the pruned dictionary.
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit a 2-topic LDA model on the bag-of-words corpus. random_state is pinned,
# presumably so repeated runs give the same topics — confirm.
lda = LdaModel(corpus, 
               id2word=dictionary, 
               num_topics=2, 
               random_state=37, 
               iterations=100,
               passes=20,
               per_word_topics=False)
# Apply the fitted model to the training corpus; the result is iterated and
# printed in a later cell.
corpus_lda = lda[corpus]
# The recorded output is (2, 2), consistent with two topics over the two
# terms ("econom", "program") that print_topics reports.
print(lda.get_topics().shape)

(2, 2)


In [89]:
# List the weighted terms that make up each topic.
lda.print_topics()

[(0, '0.926*"econom" + 0.074*"program"'),
 (1, '0.926*"program" + 0.074*"econom"')]

In [90]:
# Query the model with a title that is not in the training list; it is cleaned
# and converted to bag-of-words with the same helpers and dictionary as above.
lda[dictionary.doc2bow(clean('Naked Economics Undressing the Dismal Science'))]

[(0, 0.74824655), (1, 0.25175342)]

In [91]:
# Same query as the previous cell, this time with a programming-related title
# that is not in the training list.
lda[dictionary.doc2bow(clean('Elements of Programming Interviews in Python The Insiders Guide'))]

[(0, 0.25178012), (1, 0.74821985)]

In [92]:
# Print the model's output for every training document, one per line.
for d in corpus_lda:
    print(d)

[(0, 0.25178382), (1, 0.74821615)]
[(0, 0.16788752), (1, 0.8321125)]
[(0, 0.25178117), (1, 0.74821883)]
[(0, 0.25178203), (1, 0.748218)]
[(0, 0.25178233), (1, 0.7482177)]
[(0, 0.7482476), (1, 0.25175238)]
[(0, 0.8321306), (1, 0.16786939)]
[(0, 0.74824744), (1, 0.25175253)]
[(0, 0.5), (1, 0.5)]
[(0, 0.83212996), (1, 0.16787006)]


In [93]:
import pyLDAvis.gensim

# Prepare the pyLDAvis data from the fitted model, the bag-of-words corpus and
# the dictionary. sort_topics=False is passed, presumably so the topic order
# matches the model's own numbering — confirm.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Display the prepared visualisation as the cell's output.
pyLDAvis.display(lda_display)

  from collections import Sized
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
