In [1]:
# processing
import operator
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string
import math
import itertools

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import HdpModel
from gensim.models import TfidfModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

# sci-kit
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import feature_extraction




In [81]:
# Terms to exclude, read from the first row of the blacklist CSV.
# The path uses a forward slash (accepted by Python on Windows and POSIX alike)
# instead of the Windows-only "tools\\blacklist.csv". The file is opened in a
# `with` block so the handle is always closed, and an empty file yields an
# empty blacklist instead of raising StopIteration.
with open("tools/blacklist.csv", 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# Set copy of the blacklist: constant-time membership tests inside the row loop.
blacklistSet = set(blacklist)

# Accepted term levels, where level = number of "_" separators in the joined token.
levels = [1, 2, 3]

# A term part is acceptable when it contains no digits and no non-word characters.
# The empty string also matches, exactly as in the original inline pattern.
partPattern = re.compile(r'[^\W\d]*$')

# format [term, orig, sentence, docID]
inPath = "raw.csv"

# docID -> list of accepted tokens, in file order
docTokens = dict()

with open(inPath, 'r', newline='') as inFile:
    inReader = csv.reader(inFile)

    # ignore headers (tolerates a completely empty file)
    next(inReader, None)

    for inRow in inReader:

        # csv.reader yields short/empty lists for blank or truncated lines;
        # skip them rather than failing with IndexError on inRow[3].
        if len(inRow) < 4:
            continue

        term = inRow[0]
        sentence = inRow[2]
        docID = inRow[3]

        # find acceptable tokens only: the term is ":"-separated, and the
        # surviving parts are re-joined with "_"
        token = "_".join(
            t for t in term.split(":")
            if partPattern.match(t) and t not in blacklistSet
        )

        # calculate new term level
        level = token.count("_")

        # if acceptable, add to dictionary
        if level in levels and token not in blacklistSet and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)

# Parallel lists: docIDs[i] is the document whose tokens are texts[i].
docIDs = list(docTokens.keys())
texts = list(docTokens.values())


In [87]:
# Build the vocabulary from the per-document token lists and report its size.
dictionary = corpora.Dictionary(texts)
vocabSizeBefore = len(dictionary)
print(vocabSizeBefore)

# Prune the vocabulary in place using the document-frequency limits
# (no_below / no_above) and the size cap (keep_n), then report the new size.
dictionary.filter_extremes(no_below=3, no_above=0.2, keep_n=10000)
vocabSizeAfter = len(dictionary)
print(vocabSizeAfter)

# Bag-of-words vector for every document, in the same order as `texts`.
corpus = []
for docTokenList in texts:
    corpus.append(dictionary.doc2bow(docTokenList))

# Fit TF-IDF on the bag-of-words corpus and wrap the corpus with it.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

16391
825


In [102]:
# LDA hyper-parameters for the single exploratory model.
topics = 5
passes = 20
iterations = 500

# Train LDA on the TF-IDF weighted corpus.
# `iterations` was defined above but never handed to the model, so training
# silently used the library default; it is now passed explicitly.
lda_model = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=topics,
    id2word=dictionary,
    passes=passes,
    iterations=iterations,
    workers=4,
)

# Show every topic (-1 = all) as a weighted list of its top terms.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic) + "\n")


Topic: 0 Word: 0.031*"structure_factor" + 0.025*"x_ray" + 0.024*"room_temperature" + 0.023*"datum_collection" + 0.017*"bragg_reflection" + 0.017*"diffraction_datum" + 0.016*"x_ray_diffraction" + 0.016*"cu_atom" + 0.015*"coordination_geometry" + 0.014*"square_planar"

Topic: 1 Word: 0.026*"n_o" + 0.026*"centroid_distance" + 0.024*"n_o_h" + 0.024*"o_h" + 0.024*"pi_pi" + 0.019*"water_molecule" + 0.018*"centroid_centroid_distance" + 0.018*"dimensional_network" + 0.015*"square_pyramidal" + 0.014*"o_hydrogen_bond"

Topic: 2 Word: 0.057*"single_crystal" + 0.047*"x_ray_diffraction" + 0.032*"x_ray" + 0.028*"x_ray_diffraction_datum" + 0.026*"octahedral_coordination" + 0.018*"octahedral_coordination_geometry" + 0.018*"solid_state" + 0.017*"molecular_structure" + 0.017*"distort_octahedral_coordination_geometry" + 0.016*"da_minus"

Topic: 3 Word: 0.049*"n_h" + 0.041*"intermolecular_n_h" + 0.035*"n_hydrogen_bond" + 0.034*"o_hydrogen_bond" + 0.028*"o_h" + 0.024*"n_hydrogen" + 0.024*"asymmetric_unit" 

In [96]:
# Iterative LDA: fit a model, keep the documents whose best topic weight
# reaches `resistance`, then re-cluster the remaining documents with a fresh
# model. Repeats until no document is left (or `maxRuns` is reached).
# Top terms per topic go to `topicPath`; document/topic relations go to
# `relationPath`.

id_topic_ratio = 0.005   # topics per remaining document on follow-up runs
resistance = 0.7         # minimum best-topic weight for a document to count as assigned
done = False
numTops = 5              # number of topics for the first run
maxRuns = 100            # safety cap: nothing guarantees every document reaches `resistance`

topicPath = "topics.csv"
relationPath = "relations.csv"

run = 1
totalTopics = 0
averageCoherence = 0
badIDs = docIDs

# Both outputs are written only through their csv writers inside one `with`
# block. The original appended to topics.csv with DataFrame.to_csv while the
# same file was still open (and unflushed) for writing, and left both files
# open when the cell was interrupted.
with open(topicPath, 'w') as topicFile, open(relationPath, 'w') as relationFile:

    topicOut = csv.writer(topicFile, lineterminator='\n')
    topicOut.writerow(["row", "run", "topic", "terms", "p"])

    relationOut = csv.writer(relationFile, lineterminator='\n')
    relationOut.writerow(["run", "topic", "IDs", "ID/strength"])

    while not done:

        print("run #" + str(run))

        # topic id -> [(docID, weight), ...] for this run
        topic2doc = dict()

        oldIDs = badIDs.copy()
        badIDs = list()

        totalTopics += numTops

        # perform LDA on the documents still unassigned.
        # Run-local names are used so the notebook-level `corpus`, `tfidf`
        # and `corpus_tfidf` are not overwritten; re-running this cell then
        # starts from the full document set again.
        runCorpus = [dictionary.doc2bow(docTokens[docID]) for docID in oldIDs]
        runTfidf = TfidfModel(runCorpus)

        lda_model = gensim.models.LdaMulticore(runTfidf[runCorpus], num_topics=numTops, id2word=dictionary, passes=10, workers=4)

        coherenceModel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherenceModel.get_coherence()
        # running average of coherence, weighted by the number of topics per run
        averageCoherence = ((totalTopics - numTops) * averageCoherence + numTops * coherence) / totalTopics

        # tag documents
        for ID, vec in zip(oldIDs, runCorpus):

            # infer on the same TF-IDF representation the model was trained on
            # (the original passed the raw bag-of-words vector here)
            store = lda_model[runTfidf[vec]]

            bestRel = 0

            # build relations
            for topicID, weight in store:
                bestRel = max(bestRel, weight)
                topic2doc.setdefault(topicID, []).append((ID, weight))

            # collect bad docs
            if bestRel < resistance:
                badIDs.append(ID)

        # write terms: one row per (topic, top word); the "row" column
        # restarts at 0 on every run
        row = 0
        for t in range(lda_model.num_topics):
            for word, p in lda_model.show_topic(t, topn=10):
                topicOut.writerow([row, run, t, word, p])
                row += 1

        # print relations: run, topic, number of documents, then the
        # (docID, weight) pairs sorted by descending weight
        for topic in topic2doc:
            relationOut.writerow([run, topic, len(topic2doc[topic])] + sorted(topic2doc[topic], key=operator.itemgetter(1), reverse=True))

        # done?
        if len(badIDs) == 0:
            done = True
            print("Done!")

        # give up rather than loop forever if documents never get assigned
        elif run >= maxRuns:
            done = True
            print("Stopped after " + str(run) + " runs; " + str(len(badIDs)) + " documents remain unassigned")

        # if not, size the next run from the number of remaining documents
        else:
            print(len(badIDs))
            numTops = math.ceil(len(badIDs) * id_topic_ratio)
            run += 1

print(totalTopics)
print(averageCoherence)

run #1


KeyboardInterrupt: 