# Topic Modeling
---
#### Imports

In [1]:
# processing
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt



#### Processing

In [2]:
blacklist = [t.strip() for t in next(csv.reader(open("blacklist.csv", 'r')))]

inPath = "input.csv"
outPath = "out.csv"
wordBound = 10
charBound = 70

inFile = open(inPath, 'r')
inReader = csv.reader(inFile)

outFile = open(outPath, 'w')
outWriter = csv.writer(outFile)


docTokens = dict()


next(inReader)
for inRow in inReader:

    charDist = int(inRow[0])
    wordDist = int(inRow[1])

    if wordDist < wordBound and charDist < charBound:

        #predTerm, subTerm, objTerm = map(methodcaller("split", ":"), inRow[2:5])
        #allTerms = predTerm + subTerm + objTerm

        subTerm, objTerm = map(methodcaller("split", ":"), inRow[3:5])
        
        sub = "_".join([t for t in subTerm if re.match(r'[^\W\d]*$', t) and not t in blacklist])
        obj = "_".join([t for t in subTerm if re.match(r'[^\W\d]*$', t) and not t in blacklist])
        
        tokens = list()
        if not sub in blacklist and not obj in blacklist and len(sub) > 0 and len(obj) > 0:
            tokens = [sub, obj]

            docID = inRow[5]

            if docID in docTokens:
                docTokens[docID] += tokens
            else:
                docTokens[docID] = tokens

data = list(docTokens.values())

#### Model

In [3]:
id2word = corpora.Dictionary(data)
texts = data

corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=40, 
                                       random_state=100,
                                       update_every=1,
                                       chunksize=20,
                                       passes=20,
                                       alpha='auto',
                                       per_word_topics=True)


pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(21,
  '0.195*"molecular_hydrogen" + 0.065*"one_solution" + 0.060*"inversion_dimer" '
  '+ 0.034*"diffraction_spot" + 0.034*"basal_diffraction_spot_intensity" + '
  '0.034*"cone_series" + 0.034*"basal_diffraction_spot" + '
  '0.028*"piperazine_ring" + 0.020*"double_bond" + 0.020*"c_double_bond"'),
 (36,
  '0.124*"structure_solution" + 0.101*"water_molecule" + '
  '0.062*"molecular_skeleton" + 0.062*"water_molecule_involve_n_hydrogen_bond" '
  '+ 0.038*"vv_atom" + 0.038*"molecular_salt" + 0.038*"basal_plane" + '
  '0.025*"octahedral_coordination" + 0.019*"title_complete_cation" + '
  '0.019*"strongly_distort_octahedron"'),
 (19,
  '0.078*"kda_protein" + 0.078*"da_minus" + 0.052*"sluggish_transformation" + '
  '0.052*"vm_value" + 0.052*"obtain_agreement" + 0.027*"cuii_atom" + '
  '0.026*"atp_complex_erk_detail_structural_analysis" + '
  '0.026*"detail_structural_analysis" + 0.026*"dihydroorotate_dehydrogenase" + '
  '0.026*"putative_dihydroorotate_dehydrogenase"'),
 (6,
  '0.101*"import

#### Visualize

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [03/Jul/2019 14:58:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 14:58:03] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 14:58:03] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [03/Jul/2019 14:58:03] "GET /LDAvis.js HTTP/1.1" 200 -
