In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mnoordeen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [4]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [5]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mnoordeen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [12]:
import random
text_data = []
with open('dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['personalize', 'pocket', 'directory', 'mobile', 'devices']
['write', 'optimize', 'tree']
['revisit', 'absolutely', 'minimal', 'realization', 'dimensional', 'digital', 'filter']
['neural', 'stimulator', 'capable', 'deliver', 'constant', 'current', 'current', 'stimulus', 'charge', 'voltage']
['searchmobil', 'viewing', 'search', 'mobile', 'devices']
['efficient', 'mining', 'frequent', 'sequence', 'generator']
['journey', 'cystic', 'fibrosis']
['trichromatic', 'approximation', 'computer', 'graphics', 'illumination', 'model']
['large', 'scale', 'collection', 'coordinate', 'approach']
['padding', 'orthogonality', 'efficient', 'subspace', 'authentication', 'network', 'coding']
['design', 'multi', 'butterfly', 'chaotic', 'attractor', 'piecewise', 'l&uuml', 'system', 'base', 'switching', 'control', 'heteroclinic', 'orbit']
['cantina', 'content', 'base', 'approach', 'detecting', 'phishing', 'site']
['propel', 'distribute', 'services', 'platform']
['property', 'random', 'direction', 'model']
['s

In [13]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [14]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [15]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Try 5 topics

In [16]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [17]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"model" + 0.027*"complex" + 0.027*"graphics" + 0.027*"approximation"')
(1, '0.045*"devices" + 0.045*"mobile" + 0.025*"speed" + 0.025*"sequence"')
(2, '0.018*"design" + 0.018*"system" + 0.018*"heteroclinic" + 0.018*"piecewise"')
(3, '0.029*"efficient" + 0.029*"approach" + 0.029*"base" + 0.029*"subspace"')
(4, '0.045*"current" + 0.045*"surface" + 0.025*"system" + 0.025*"neural"')


In [18]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [19]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"complex" + 0.026*"efficient" + 0.015*"mobile" + 0.015*"analysis"')
(1, '0.022*"model" + 0.022*"surface" + 0.022*"system" + 0.022*"design"')
(2, '0.023*"devices" + 0.023*"base" + 0.023*"current" + 0.013*"approach"')


In [20]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.066*"base" + 0.034*"approach" + 0.034*"devices" + 0.034*"detecting"')
(1, '0.069*"services" + 0.069*"distribute" + 0.069*"propel" + 0.069*"platform"')
(2, '0.053*"efficient" + 0.028*"antialiased" + 0.028*"coordinate" + 0.028*"directory"')
(3, '0.050*"rank" + 0.050*"retrieval" + 0.050*"fibrosis" + 0.050*"engine"')
(4, '0.037*"mobile" + 0.037*"devices" + 0.037*"methodology" + 0.037*"transformer"')
(5, '0.028*"design" + 0.028*"system" + 0.028*"minimal" + 0.028*"switching"')
(6, '0.069*"signal" + 0.069*"challenge" + 0.069*"satellite" + 0.069*"navigation"')
(7, '0.048*"model" + 0.048*"complex" + 0.048*"trichromatic" + 0.048*"graphics"')
(8, '0.096*"current" + 0.050*"neural" + 0.050*"stimulator" + 0.050*"constant"')
(9, '0.008*"base" + 0.008*"model" + 0.008*"design" + 0.008*"challenge"')


### pyLDAvis

In [21]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [22]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [23]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [24]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
