## Topic Model v3

#### Import

We utilize the gensim library for topic modeling algorithms.

In [1]:
# processing
import operator
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string
import math
import itertools

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import HdpModel
from gensim.models import TfidfModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

# sci-kit
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import feature_extraction




#### Preprocessing

Specify the path to the R&R program output, the terms to blacklist, and the term levels to keep. The accepted tokens are grouped by document; the corpus and dictionary are built from them in the next step.

In [74]:
# --- configuration -----------------------------------------------------------
blacklistPath = "tools\\blacklist.csv"

# input rows have the format [term, orig, sentence, docID]
inPath = "raw.csv"

# Term levels to keep. A token's level is the number of "_" separators it
# contains after filtering, so level 1 is a two-part term.
levels = [1, 2, 3]

# --- load blacklist ----------------------------------------------------------
# Only the first row of the blacklist file is read; each cell is one term.
# `with` closes the handle, and the `[]` default tolerates an empty file.
with open(blacklistPath, 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# Set copy for O(1) membership tests; `blacklist` itself stays a list.
blacklisted = set(blacklist)

# A term part is acceptable when it consists only of word characters that are
# not digits.
# NOTE(review): `*` also accepts an empty part, so a term such as "a::b"
# yields "a__b" with an inflated level -- confirm whether `+` was intended.
partPattern = re.compile(r'[^\W\d]*$')

# --- collect tokens per document ---------------------------------------------
# docID -> list of accepted tokens, in file order
docTokens = dict()

with open(inPath, 'r', newline='') as inFile:
    inReader = csv.reader(inFile)

    # ignore headers (None default: an empty file simply yields no rows)
    next(inReader, None)

    for inRow in inReader:

        # blank or truncated rows cannot supply a docID; skip them
        if len(inRow) < 4:
            continue

        term = inRow[0]
        docID = inRow[3]

        # find acceptable tokens only: drop blacklisted / non-alphabetic parts
        # of the ":"-separated term and re-join the survivors with "_"
        token = "_".join(
            [t for t in term.split(":") if partPattern.match(t) and t not in blacklisted]
        )

        # calculate new term level
        level = token.count("_")

        # if acceptable, add to dictionary
        if level in levels and token not in blacklisted and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)

# Parallel lists: docIDs[i] is the document whose tokens are texts[i].
docIDs = list(docTokens.keys())
texts = list(docTokens.values())


We want terms that are common enough to be shared among documents, but also rare enough to be meaningful.

In [98]:
# Map every distinct token in `texts` to an integer id.
dictionary = corpora.Dictionary(texts)
vocabSizeBefore = len(dictionary)
print(vocabSizeBefore)

# Prune the vocabulary in place. Per the gensim documentation, this keeps
# tokens that occur in at least 3 documents and in at most 10% of all
# documents, then retains at most the 1000 most frequent of those.
dictionary.filter_extremes(no_below=3, no_above=0.1, keep_n=1000)
vocabSizeAfter = len(dictionary)
print(vocabSizeAfter)

# One bag-of-words vector per document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))


16429
825


In [104]:

# Fixed seed so the topics printed below can be regenerated. The gensim
# documentation notes that multi-worker training can still vary slightly
# between runs because of OS scheduling of the worker processes.
ldaSeed = 42

# Re-weight the bag-of-words corpus with TF-IDF before fitting LDA.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=100,
    workers=4,
    random_state=ldaSeed,
)

# -1 asks print_topics for every topic rather than a subset.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic) + "\n")
    

Topic: 0 Word: 0.049*"intermolecular_n_h" + 0.041*"n_hydrogen_bond" + 0.028*"n_hydrogen" + 0.016*"aromatic_ring" + 0.014*"n_o" + 0.014*"maximum_deviation" + 0.013*"s_hydrogen_bond" + 0.013*"n_o_h" + 0.013*"middot_minus" + 0.012*"o_n_h"

Topic: 1 Word: 0.023*"intermolecular_c_h" + 0.023*"bond_length" + 0.017*"angle_bond_length" + 0.016*"mirror_plane" + 0.015*"crystallographic_symmetry" + 0.014*"intermolecular_interaction" + 0.013*"amino_group" + 0.012*"significant_difference" + 0.012*"ag_atom" + 0.012*"tetrahedral_configuration"

Topic: 2 Word: 0.030*"small_angle" + 0.018*"distribution_function" + 0.017*"absolute_configuration" + 0.014*"magnetic_field" + 0.014*"c_torsion_angle" + 0.013*"x_ray_scatter" + 0.013*"intramolecular_hydrogen_bond" + 0.012*"fuse_ring" + 0.012*"methyl_group" + 0.012*"br_interaction"

Topic: 3 Word: 0.023*"c_n_h" + 0.018*"scatter_curve" + 0.017*"diffraction_pattern" + 0.017*"datum_set" + 0.017*"datum_collection" + 0.016*"structure_type" + 0.014*"real_space" + 0.01

#### LDA Recursion

Repeatedly perform LDA topic modeling on the documents that are still untagged, until every document has an acceptable strength *p* to at least one topic.

In [91]:
# --- parameters --------------------------------------------------------------
id_topic_ratio = 0.01   # topics requested per still-untagged document on re-runs
resistance = 0.7        # minimum best topic strength for a document to count as tagged
numTops = 10            # topic count (HDP truncation level T) for the first run
maxRuns = 50            # safety cap: the loop is not guaranteed to converge

topicPath = "topics.csv"
relationPath = "relations.csv"

# --- state -------------------------------------------------------------------
run = 1
totalTopics = 0
averageCoherence = 0
done = False
badIDs = docIDs

# Start from the full corpus every time this cell is executed. The loop below
# overwrites `corpus` with the shrinking set of untagged documents, so without
# this reset a second execution would start from the previous run's leftovers.
corpus = [dictionary.doc2bow(docTokens[docID]) for docID in docIDs]

# `with` guarantees both files are flushed and closed even when the loop is
# interrupted (e.g. KeyboardInterrupt).
with open(topicPath, 'w', newline='') as topicFile, \
        open(relationPath, 'w', newline='') as relationFile:

    topicOut = csv.writer(topicFile, lineterminator='\n')
    topicOut.writerow(["row", "run", "topic", "terms", "p"])

    relationOut = csv.writer(relationFile, lineterminator='\n')
    relationOut.writerow(["run", "topic", "IDs", "ID/strength"])

    while not done:

        print("run #" + str(run))

        # topic id -> [(docID, strength), ...] for this run
        topic2doc = dict()

        oldIDs = badIDs.copy()
        badIDs = list()

        totalTopics += numTops

        # perform LDA: fit an HDP model truncated at numTops topics and take
        # the LDA model gensim derives from it
        hdp = HdpModel(corpus, dictionary, T=numTops)
        lda_model = hdp.suggested_lda_model()

        # `texts` is the tokenised corpus built in the preprocessing cell
        # (the previous `texts=data` referenced an undefined name).
        coherenceModel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherenceModel.get_coherence()

        # running mean of coherence, weighted by each run's topic count
        averageCoherence = ((totalTopics - numTops) * averageCoherence + numTops * coherence) / totalTopics

        # tag documents
        for ID in oldIDs:

            vec = dictionary.doc2bow(docTokens[ID])
            store = lda_model[vec]

            bestRel = 0

            # build relations
            for topicID, strength in store:
                bestRel = max(bestRel, strength)
                topic2doc.setdefault(topicID, []).append((ID, strength))

            # collect bad docs: no topic reached the required strength
            if bestRel < resistance:
                badIDs.append(ID)

        # write terms
        # Rows go through the csv writer that already owns topicFile.
        # Appending through a second handle (DataFrame.to_csv(mode='a')) let
        # the still-buffered header overwrite the first appended bytes once
        # topicFile was flushed. The leading column restarts at 0 each run,
        # as the DataFrame index did.
        rowNum = 0
        for t in range(lda_model.num_topics):
            for word, p in lda_model.show_topic(t, topn=10):
                topicOut.writerow([rowNum, run, t, word, p])
                rowNum += 1

        # print relations: strongest documents first
        for topic in topic2doc:
            ranked = sorted(topic2doc[topic], key=operator.itemgetter(1), reverse=True)
            relationOut.writerow([run, topic, len(ranked)] + ranked)

        # done?
        if len(badIDs) == 0:
            done = True
            print("Done!")

        elif run >= maxRuns:
            done = True
            print("Stopped after " + str(run) + " runs; " + str(len(badIDs)) + " documents remain untagged")

        # if not, build new corpus from the untagged documents only
        else:
            print(len(badIDs))
            corpus = [dictionary.doc2bow(docTokens[docID]) for docID in badIDs]
            numTops = math.ceil(len(badIDs) * id_topic_ratio)
            run += 1


print(totalTopics)
print(averageCoherence)

run #1


  start_time = time.clock()


754
run #2


  start_time = time.clock()


510
run #3


  start_time = time.clock()


398
run #4


  start_time = time.clock()


337
run #5


  start_time = time.clock()


294
run #6


  start_time = time.clock()


239
run #7


  start_time = time.clock()


220
run #8


  start_time = time.clock()


211
run #9


  start_time = time.clock()


199
run #10


  start_time = time.clock()


KeyboardInterrupt: 

#### Visualization

In [None]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic view from whatever `lda_model` and `corpus`
# are currently bound in the kernel; the vocabulary comes from the model.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=lda_model.id2word,
)
vis

#### Thinning

Deploy cosine similarity to remove similar topics.

In [4]:
def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    Each string is turned into a count vector by ``get_vectors``; entry
    ``[i][j]`` of the result is the cosine similarity between the vectors
    of ``strs[i]`` and ``strs[j]``.
    """
    count_matrix = get_vectors(*strs)
    # one row vector per input string
    return cosine_similarity(list(count_matrix))
    
def get_vectors(*strs):
    """Return a dense term-count matrix for the given strings.

    A ``CountVectorizer`` is fitted on the strings themselves, so the result
    has one row per string and one column per term in their shared
    vocabulary.
    """
    text = list(strs)
    # The first CountVectorizer parameter is `input` (how to read the source),
    # not the documents, so the vectorizer is built with defaults and the
    # documents are supplied to fit_transform only.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

In [None]:
newDict = dictionary.token2id
length = len(dictionary)

# specify location of topic output
topicPath = "data\\topics_init10_rat0.1_res_0.5.csv"

topicFile = open(topicPath, 'r')
topicReader = csv.reader(topicFile)

next(topicReader)
next(topicReader)


currentTopic = 0
topicNumber = 0

topic2term = dict()

for row in topicReader:
    
    term = row[3]
    p = row[4]
    index = newDict[term]
    
    if not row[2] == currentTopic:
        currentTopic = row[2]
        topicNumber += 1
        topic2term[topicNumber] = [0]*length
        
        topic2term[topicNumber][index] = p
    else:
        topic2term[topicNumber][index] = p

topicNums = topic2term.keys()

for t1, t2 in itertools.combinations(topicNums, 2):
    cs = cosine_similarity(np.array([topic2term[t1]]), np.array([topic2term[t2]]))
    if cs[0][0] > 0.05:
        print(cs[0][0])
        print("Topic 1: " + data(t1))
        print("Topic 2: " + data
    