In [2]:
# spaCy setup: imports first, then the pipeline objects built from them.
import spacy
from spacy.lang.en import English

# Small English pipeline loaded by name; no cell visible in this notebook reads `nlp`.
nlp = spacy.load('en_core_web_sm')
# English language object; `tokenize` calls it to split raw text into tokens.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for topic modelling.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lower-cased form.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalise(tok):
        # The URL check takes priority over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [3]:
import nltk
# Fetch the WordNet corpus used by the lemma helpers defined in the next cell.
# The recorded output shows the call reports "already up-to-date" and returns
# True when the package is present locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/pavan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself if none exists.

    ``wn.morphy`` yields ``None`` when it has no base form for the input; in
    that case the original word is passed through unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``.

    Returns whatever ``WordNetLemmatizer.lemmatize`` produces for ``word``
    when called with its default arguments.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [5]:
# Side-by-side comparison of the two lemma helpers: each row prints the input
# word, then get_lemma's result, then get_lemma2's result.
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [6]:
# Fetch NLTK's stopword lists (relies on `nltk` having been imported in an earlier cell).
nltk.download('stopwords')
# Stored as a set so the membership test in prepare_text_for_lda is a hash lookup.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/pavan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of terms fed to the LDA model.

    Tokens come from ``tokenize``. A token is kept only if it is longer than
    four characters and is not in ``en_stop``; each surviving token is then
    mapped through ``get_lemma``. Both filters are applied to the token as
    produced by ``tokenize``, i.e. before lemmatisation.

    Returns:
        list of str, in original token order.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [8]:
import random

# A line is kept only when its random draw exceeds this value, i.e. roughly
# 1% of the lines in the file end up in the corpus.
SAMPLE_THRESHOLD = .99
# Fixed seed on a private generator: the same lines are sampled on every run
# (so the dictionary, corpus and trained topics are repeatable) without
# touching the global `random` state.
SAMPLE_SEED = 42
line_sampler = random.Random(SAMPLE_SEED)

text_data = []
with open('dataset.csv') as f:
    for line in f:
        # Draw before tokenizing so discarded lines are never processed;
        # exactly one draw is still consumed per line.
        if line_sampler.random() <= SAMPLE_THRESHOLD:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['optimal', 'waveband', 'switching', 'optical', 'network']
['feature', 'base', 'cellular', 'texturing', 'architectural', 'model']
['optimization', 'method', 'joint', 'allocation', 'modulation', 'scheme', 'coding', 'rates', 'resource', 'block', 'power', 'organize', 'network']
['light', 'weight', 'crypto', 'algorithm']
['market', 'framework', 'inform', 'trading']
['millimeter', 'cross', 'layer', 'modeling', 'multi', 'architecture']
['signing', 'individual', 'fragment', 'graph']
['track', 'personal', 'olfactory', 'display']
['measurement', 'bacterial', 'activity', 'using', 'array', 'base', 'isfet', 'chemical', 'current', 'conveyor', 'inversion']
['comic_strip', 'seeing', 'walls</i']
['adapt', 'dynamic', 'service', 'composition']
['voltage', 'linear', 'programmable', 'triode', 'transconductor']
['hybrid', 'method', 'photovoltaic', 'system', 'estimation', 'revision', 'method']
['7-decades', 'tunable', 'translinear', 'bicmos', '3-phase', 'sinusoidal', 'oscillator']
['mobile', 'computing', 'f

In [9]:
from gensim import corpora
# Build the gensim Dictionary from the sampled token lists in `text_data`;
# it is used below for doc2bow conversion and as the model's id2word mapping.
dictionary = corpora.Dictionary(text_data)

In [10]:
# One bag-of-words entry per sampled document, in the same order as text_data.
corpus = list(map(dictionary.doc2bow, text_data))

In [11]:
import pickle

# Persist the bag-of-words corpus and the dictionary so later cells can
# reload them from disk. The `with` block guarantees the pickle file is
# flushed and closed even if dumping fails part-way.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

### Try 5 topics

In [12]:
import gensim

NUM_TOPICS = 5
# Train the 5-topic LDA model with 15 passes over the corpus and save it.
# random_state pins the model's random initialisation so the learned topics
# are repeatable for a given corpus instead of changing on every run.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [13]:
# Show the four highest-weighted words of every topic in the current model.
# Per the recorded output, each entry is a (topic_id, 'weight*"word" + ...') tuple.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.033*"network" + 0.018*"using" + 0.018*"isfet" + 0.018*"bacterial"')
(1, '0.034*"base" + 0.034*"algorithm" + 0.034*"method" + 0.034*"approach"')
(2, '0.031*"mobile" + 0.031*"voltage" + 0.031*"panel" + 0.031*"research"')
(3, '0.064*"network" + 0.034*"control" + 0.018*"dynamic" + 0.018*"design"')
(4, '0.033*"modeling" + 0.033*"network" + 0.018*"optical" + 0.018*"bicmos"')


In [14]:
# Score an unseen title against the current model: preprocess it exactly like
# the training lines, convert it to bag-of-words with the training dictionary,
# then print the bag-of-words followed by the per-topic probabilities.
new_doc = prepare_text_for_lda('Practical Bayesian Optimization of Machine Learning Algorithms')
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow, ldamodel.get_document_topics(new_doc_bow), sep='\n')

[(17, 1), (23, 1)]
[(0, 0.39856187), (1, 0.40028772), (2, 0.06670511), (3, 0.06775606), (4, 0.066689275)]


In [15]:
# Train, save and summarise a 3-topic model on the same corpus.
# NOTE: this rebinds `ldamodel`, replacing the model from the previous section.
# random_state pins the random initialisation so the topics are repeatable
# for a given corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.039*"network" + 0.030*"method" + 0.021*"power" + 0.021*"modeling"')
(1, '0.014*"using" + 0.014*"approach" + 0.014*"system" + 0.014*"tunable"')
(2, '0.035*"network" + 0.025*"base" + 0.014*"control" + 0.014*"dynamic"')


In [16]:
# Train, save and summarise a 10-topic model on the same corpus.
# NOTE: this rebinds `ldamodel`, replacing the previously trained model.
# random_state pins the random initialisation so the topics are repeatable
# for a given corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.054*"method" + 0.028*"system" + 0.028*"hybrid" + 0.028*"revision"')
(1, '0.053*"tunable" + 0.053*"sinusoidal" + 0.053*"translinear" + 0.053*"oscillator"')
(2, '0.046*"network" + 0.046*"mobile" + 0.024*"power" + 0.024*"coding"')
(3, '0.007*"network" + 0.007*"seeing" + 0.007*"base" + 0.007*"algorithm"')
(4, '0.036*"modeling" + 0.036*"multi" + 0.036*"millimeter" + 0.036*"cross"')
(5, '0.054*"algorithm" + 0.028*"approach" + 0.028*"dynamic" + 0.028*"adaptive"')
(6, '0.072*"network" + 0.049*"base" + 0.049*"optical" + 0.026*"voltage"')
(7, '0.066*"network" + 0.066*"control" + 0.035*"design" + 0.035*"inverse"')
(8, '0.007*"network" + 0.007*"algorithm" + 0.007*"dynamic" + 0.007*"base"')
(9, '0.044*"base" + 0.044*"using" + 0.044*"isfet" + 0.044*"current"')


### pyLDAvis

In [17]:
# Reload the saved dictionary, corpus and 5-topic model from disk so the
# visualisation cells work from the persisted files.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: unpickling can execute arbitrary code; only load a corpus.pkl that
# this notebook wrote itself. The `with` block closes the file handle.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [18]:
import pyLDAvis.gensim
# Prepare and render the interactive view of the 5-topic model loaded above.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis topic numbering in
# the model's own topic order — confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Left as the cell's last expression so the notebook renders the result.
pyLDAvis.display(lda_display)

  from pandas import hashtable, tslib, lib


In [19]:
# Load the saved 3-topic model and render it with the same corpus and dictionary.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
# Left as the cell's last expression so the notebook renders the result.
pyLDAvis.display(lda_display3)

In [20]:
# Load the saved 10-topic model and render it with the same corpus and dictionary.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
# Left as the cell's last expression so the notebook renders the result.
pyLDAvis.display(lda_display10)