LDA Topic Modeling
---

Performing LDA topic modeling with R&R terms.

### Imports

In [1]:
# processing
import operator
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string
import math
import itertools
import sqlite3
import copy

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import HdpModel
from gensim.models import TfidfModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 




### Preprocessing

Read in the output from the R&R program. Tokenize the terms and remove blacklisted tokens. Build a corpus.

In [2]:
# blacklisted tokens, read from the first row of the blacklist file;
# the `with` block closes the handle as soon as that row has been read,
# and an empty file yields an empty blacklist instead of StopIteration
with open("tools\\blacklist.csv", 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# levels of R&R terms considered
levels = [1, 2, 3]

# format [term, orig, sentence, docID]
# newline='' is the mode the csv module asks for, so quoted fields that
# contain line breaks are parsed correctly; the file is intentionally left
# open here because the reader is consumed by the next cell
inPath = "raw.csv"
inFile = open(inPath, 'r', newline='')
inReader = csv.reader(inFile)

In [3]:
# maps docID -> list of accepted tokens, in the order they appear in the file
docTokens = dict()

# a term part is acceptable when it is made only of word characters that are
# not digits (letters and underscores); note the empty string also matches
wordOnly = re.compile(r'[^\W\d]*$')

try:
    # ignore headers (a completely empty file simply yields no rows)
    next(inReader, None)

    for inRow in inReader:
        # rows are [term, orig, sentence, docID]; skip blank or truncated rows
        # instead of failing with an IndexError
        if len(inRow) < 4:
            continue

        term = inRow[0]
        docID = inRow[3]

        # find acceptable tokens only
        token = "_".join(t for t in term.split(":") if wordOnly.match(t) and t not in blacklist)

        # calculate new term level
        level = token.count("_")

        # if acceptable, add to dictionary
        if level in levels and token not in blacklist and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)
finally:
    # the reader has been consumed (or failed); release the input file.
    # Re-running this cell requires re-running the cell that opens the file.
    inFile.close()

# compile all IDs and texts
docIDs = list(docTokens.keys())
texts = list(docTokens.values())

In [5]:
# build a dictionary for the text
dictionary = corpora.Dictionary(texts)
print("Raw dictionary: " + str(len(dictionary)) + " terms")

# filter out terms that appear in fewer than LOW docs or in more than a
# HIGH fraction of docs (0.14 = 14%), then keep at most KEEP terms
LOW = 1
HIGH = 0.14
KEEP = 20000

# deep copy, so the snapshot shares no state with the dictionary that is
# filtered in place on the next line
old_dict = copy.deepcopy(dictionary)

dictionary.filter_extremes(no_below = LOW, no_above = HIGH, keep_n = KEEP)
print("Filtered dictionary: " + str(len(dictionary)) + " terms")

# report the terms that were filtered out; a set gives constant-time
# membership tests instead of scanning every remaining term per word
kept_terms = set(dictionary.token2id)
for word in old_dict.token2id:
    if word not in kept_terms:
        print(word)

# convert the text to a bag-of-words corpus with the dictionary
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# perform TF-IDF on the corpus
tfidf = TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# `corpus` keeps referring to the TF-IDF-weighted corpus, as before;
# `corpus_tfidf` is the explicit name used by the coherence test
corpus = corpus_tfidf

Raw dictionary: 16391 terms
Filtered dictionary: 16386 terms
crystal_structure
hydrogen_bond
dihedral_angle
n_h
c_h


### Modeling

Standard LDA modeling. Toggle the number of topics, passes, and iterations.

In [6]:
def model(corpus, numtop, passes, iterations, workers=4, random_state=None, id2word=None):
    """Train a gensim ``LdaMulticore`` topic model on ``corpus``.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        The corpus to train on.
    numtop : int
        Number of topics to extract.
    passes : int
        Number of passes over the corpus during training.
    iterations : int
        Maximum number of iterations per document.
    workers : int, optional
        Number of worker processes (defaults to 4, the previous fixed value).
    random_state : optional
        Seed forwarded to gensim; set it to make runs repeatable.
    id2word : optional
        Mapping from term ids to terms. Defaults to the notebook-level
        ``dictionary`` built during preprocessing.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    if id2word is None:
        # fall back to the dictionary built in the preprocessing section
        id2word = dictionary

    # a distinct local name avoids shadowing this function inside its own body
    lda = gensim.models.LdaMulticore(corpus,
                                     num_topics=numtop,
                                     id2word=id2word,
                                     passes=passes,
                                     workers=workers,
                                     iterations=iterations,
                                     random_state=random_state)

    return lda

Perform a set of trials and write results to files.

In [48]:
# folder to write databases to
STEM = "databases\\"

# set range of passes and topics to test
passes = []
topics = []

for p in passes:
    for t in topics:

        # create database: one file per (passes, topics) combination
        con = sqlite3.connect(STEM + "topics_p" + str(p) + "_t" + str(t) + ".db")

        try:
            cur = con.cursor()

            # train model; model() takes the topic count before the pass
            # count, and the result gets its own name so the model() helper
            # is still callable on the next iteration
            print("Training LDA...")
            lda_model = model(corpus, t, p, 500)
            print("Done.")

            # write the topic-terms table
            print("Writing topics to terms...")

            cur.execute("""
            CREATE TABLE IF NOT EXISTS TERMS
                (topic INT,
                term TEXT,
                prob FLOAT)
        """)

            for i in range(lda_model.num_topics):
                # topics are stored 1-based; probabilities are converted to
                # plain floats because sqlite3 has no adapter for numpy scalars
                term_rows = [(i + 1, term, float(prob))
                             for term, prob in lda_model.show_topic(i, topn = 20)]
                cur.executemany("INSERT INTO TERMS (topic, term, prob) VALUES (?, ?, ?)", term_rows)

            print("Done.")

            # write the doc-topics table
            print("Writing doc to topics...")

            cur.execute("""
            CREATE TABLE IF NOT EXISTS DOCS
                (doc TEXT,
                topic INT,
                prob FLOAT)
        """)

            for ID in docIDs:

                # NOTE(review): documents are scored from raw bag-of-words
                # counts while the model is trained on `corpus` -- confirm
                # that both are meant to use the same weighting
                doc = docTokens[ID]
                store = lda_model.get_document_topics(dictionary.doc2bow(doc))

                # use this document's own ID and the same 1-based topic
                # numbering as the TERMS table so the two tables can be joined
                doc_rows = [(ID, int(topic) + 1, float(prob)) for topic, prob in store]
                cur.executemany("INSERT INTO DOCS (doc, topic, prob) VALUES (?, ?, ?)", doc_rows)

            print("Done.")

            # persist the inserted rows; without a commit they are discarded
            con.commit()
        finally:
            con.close()


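Perform cyclic modeling: repeatedly train LDA on the documents that no topic covers with a probability of at least `minp` (the lower bound), until every document is assigned.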

In [55]:
def cyclic(minp, rat, p, its, max_rounds=None):
    """Repeatedly train LDA on the documents that are not yet well covered.

    Each round trains a model on the current document set, then keeps only
    the documents for which no topic reaches probability ``minp``. Those
    documents form the corpus of the next round. The loop ends when every
    document is covered, and the total number of topics used is printed.

    Parameters
    ----------
    minp : float
        Minimum topic probability for a document to count as assigned.
    rat : float
        Topics-per-document ratio; each round uses
        ``ceil(number_of_documents * rat)`` topics. Must be positive.
    p : int
        Number of training passes per round.
    its : int
        Number of training iterations per round.
    max_rounds : int, optional
        Upper bound on the number of rounds. ``None`` (the default) keeps the
        original behaviour of looping until every document is assigned, which
        may not terminate if some documents are never assigned.

    Raises
    ------
    ValueError
        If ``rat`` is not positive (it would request zero topics).
    """
    if rat <= 0:
        raise ValueError("rat must be positive")

    # the first round uses the notebook-level corpus and document IDs
    newcorpus = copy.copy(corpus)
    newIDs = docIDs.copy()
    totalTopics = 0
    rounds = 0

    # an empty document set needs no model at all
    while len(newIDs) > 0:

        if max_rounds is not None and rounds >= max_rounds:
            print("Stopped after " + str(rounds) + " rounds with " + str(len(newIDs)) + " documents unassigned")
            break
        rounds += 1

        t = math.ceil(len(newcorpus) * rat)
        totalTopics += t

        newmodel = model(newcorpus, t, p, its)

        print(newmodel.print_topics())

        # documents for which no topic reaches the minimum probability
        badIDs = [ID for ID in newIDs
                  if len(newmodel.get_document_topics(dictionary.doc2bow(docTokens[ID]),
                                                      minimum_probability = minp)) == 0]

        if len(badIDs) == 0:
            break

        # NOTE(review): later rounds train on raw bag-of-words vectors while
        # the first round uses the notebook-level `corpus` -- confirm that the
        # difference in weighting is intended
        newIDs = badIDs
        newcorpus = [dictionary.doc2bow(docTokens[ID]) for ID in newIDs]
        print(len(badIDs))

    print("Total topics: " + str(totalTopics))

In [68]:
# minimum probability 0.8, one topic per 100 documents, 100 passes, 500 iterations
cyclic(minp=0.8, rat=0.01, p=100, its=500)

[(0, '0.002*"n_hydrogen_bond" + 0.001*"n_hydrogen" + 0.001*"n_o" + 0.001*"pi_interaction" + 0.001*"molecular_conformation" + 0.001*"n_o_h" + 0.001*"ring_system" + 0.001*"weak_c_h" + 0.001*"intermolecular_n_h" + 0.001*"intramolecular_o_h"'), (1, '0.001*"weak_intermolecular_c_h" + 0.001*"n_atom" + 0.001*"intermolecular_n_h" + 0.001*"o_o_h" + 0.001*"dimensional_network" + 0.001*"chair_conformation" + 0.001*"fuse_ring" + 0.001*"deg_dihedral_angle" + 0.001*"n_hydrogen_bond" + 0.001*"intermolecular_c_h"'), (2, '0.001*"intermolecular_n_h" + 0.001*"o_o_h" + 0.001*"n_hydrogen_bond" + 0.001*"n_atom" + 0.001*"site_symmetry" + 0.001*"chloride_reaction_product" + 0.001*"asymmetric_unit_molecule" + 0.001*"benzene_ring" + 0.000*"hydrogen_bond_interaction" + 0.000*"dimensional_structure"'), (3, '0.001*"n_o_h" + 0.001*"n_o" + 0.001*"intermolecular_n_h" + 0.001*"benzene_ring" + 0.001*"n_hydrogen_bond" + 0.001*"supramolecular_network" + 0.001*"c_o" + 0.001*"intermolecular_c_h" + 0.001*"b_axis_chain" + 0.

<gensim.interfaces.TransformedCorpus object at 0x0000020BEED768D0>


### Coherence Testing

Test a range for the ideal number of topics.

In [None]:
topics = range(1, 21)
passes = 5
iterations = 500

model_list = list()
coherence_values = list()

for topicnum in topics:

    # train on `corpus`, the TF-IDF-weighted corpus built during preprocessing;
    # the result gets its own name so the model() helper is not overwritten
    candidate = gensim.models.LdaMulticore(corpus, num_topics=topicnum, id2word=dictionary, passes=passes, workers=4, iterations=iterations)
    model_list.append(candidate)

    # c_v coherence is computed from the tokenized texts, not the corpus
    coherencemodel = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())


# label the line itself: passing the bare string ("coherence_values") to
# legend() is treated as a sequence of characters and labels the line "c"
plt.plot(topics, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

### Visualization

In [None]:
pyLDAvis.enable_notebook()

# pick the model to visualize explicitly so the cell works on a fresh kernel:
# the candidate with the highest coherence score from the coherence test
# NOTE(review): confirm this is the model that should be visualized
lda_model = model_list[int(np.argmax(coherence_values))]

pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)