## Topic Model v3

#### Import

We utilize the gensim library for topic modeling algorithms.

In [1]:
# processing
import operator
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string
import math

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt


#sci-kit
from sklearn import feature_extraction



#### Preprocessing

Specify the path to the R&R program output. Select the terms to blacklist and the term levels to consider. The accepted tokens are grouped by document; a dictionary and a bag-of-words corpus are then built from them.

In [2]:
# Blacklisted terms are read from the first row of the blacklist file; an empty
# file simply yields no blacklisted terms.
# NOTE(review): the backslash separator in this path only resolves on Windows —
# confirm the notebook is not expected to run on other platforms.
with open("tools\\blacklist.csv", 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# set copy of the blacklist for constant-time membership tests in the row loop
blacklistSet = set(blacklist)
levels = [1, 2, 3]

# format [term, orig, sentence, docID]
inPath = "raw.csv"

# a term part is kept only if it contains no digits and no non-word characters
partPattern = re.compile(r'[^\W\d]*$')

# docID -> list of accepted tokens, in file order
docTokens = dict()

# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain line breaks are parsed correctly
with open(inPath, 'r', newline='') as inFile:
    inReader = csv.reader(inFile)

    # ignore headers (and tolerate a completely empty file)
    next(inReader, None)

    for inRow in inReader:

        # blank lines come back from the reader as empty rows; skip them
        if not inRow:
            continue

        term = inRow[0]
        docID = inRow[3]

        # find acceptable tokens only: drop rejected or blacklisted parts of the
        # ":"-separated term and re-join the survivors with "_"
        token = "_".join([t for t in term.split(":") if partPattern.match(t) and t not in blacklistSet])

        # calculate new term level (number of "_" separators in the token)
        level = token.count("_")

        # if acceptable, add to dictionary
        if level in levels and token not in blacklistSet and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)

# parallel lists: docIDs[i] is the document whose tokens are data[i]
docIDs = list(docTokens.keys())
data = list(docTokens.values())

In [3]:
# The per-document token lists are used directly as the training texts.
texts = data

# Build the gensim dictionary from those token lists.
dictionary = corpora.Dictionary(texts)

# One bag-of-words vector per document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

#### LDA Recursion

Repeatedly run LDA topic modeling, re-fitting on the documents that are still unresolved, until every document has an acceptable strength *p* (at least `resistance`) to some topic.

In [17]:
# --- tunable parameters -------------------------------------------------------
id_topic_ratio = 0.25   # topics requested per unresolved document on re-runs
resistance = 0.8        # minimum topic strength for a document to count as resolved
done = False
numTops = 100           # number of topics requested for the first run

topicPath = "topics.csv"
relationPath = "relations.csv"

# Write the topic header and close the file immediately: pandas appends to the
# same path inside the loop, and a second, still-open buffered handle could
# flush its header on top of the appended rows.
with open(topicPath, 'w') as topicFile:
    topicOut = csv.writer(topicFile, lineterminator = '\n')
    topicOut.writerow(["", "run", "topic", "terms", "p"])

run = 1

# documents that have not yet reached `resistance`; initially all of them
badIDs = docIDs

# the relations file stays open for the whole recursion and is closed (and
# therefore flushed) when the block exits, even on error
with open(relationPath, 'w') as relationFile:
    relationOut = csv.writer(relationFile, lineterminator = '\n')
    relationOut.writerow(["run", "topic", "ID/strength"])

    while not done:

        # topic id -> [(docID, strength), ...] for this run
        topic2doc = dict()

        oldIDs = badIDs.copy()
        badIDs = list()

        #perform LDA
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=numTops,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=8,
                                           passes=1,
                                           alpha='auto',
                                           per_word_topics=True)

        # tag documents
        for ID in oldIDs:

            # the first element of the model's result is the list of
            # (topic, strength) pairs for the document
            docTopics = lda_model[dictionary.doc2bow(docTokens[ID])][0]

            # build relations
            for topic, strength in docTopics:
                topic2doc.setdefault(topic, []).append((ID, strength))

            bestRel = max((strength for _, strength in docTopics), default=0)

            # collect bad docs
            if bestRel < resistance:
                badIDs.append(ID)

        #write terms: one row per (run, topic, word, p) for the top 10 words
        top_words_per_topic = []
        for t in range(lda_model.num_topics):
            top_words_per_topic.extend([(run, t, ) + x for x in lda_model.show_topic(t, topn = 10)])

        pd.DataFrame(top_words_per_topic, columns=['Run', 'Topic', 'Word', 'P']).to_csv(topicPath, mode='a', header=False)

        # print relations, strongest document first
        for topic in topic2doc:
            relationOut.writerow([run, topic] + sorted(topic2doc[topic], key=operator.itemgetter(1), reverse=True))

        # done?
        if len(badIDs) == 0:
            done = True

        # if not, build new corpus
        else:

            print(run)
            print(len(badIDs))

            nextNumTops = math.ceil(len(badIDs) * id_topic_ratio)

            if badIDs == oldIDs and nextNumTops == numTops:
                # Nothing was resolved and the next run would receive the same
                # documents, topic count and random_state, so it is expected to
                # repeat this result; stop instead of looping forever.
                print("no progress; stopping with", len(badIDs), "unresolved documents")
                done = True
            else:
                # Re-fit only on the unresolved documents. Each document's own
                # tokens must be used here (indexing with a stale variable
                # would repeat one document for every entry).
                corpus = [dictionary.doc2bow(docTokens[ID]) for ID in badIDs]
                numTops = nextNumTops
                run += 1
    

(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.993736)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(

(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(5

(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.993736)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.9936358)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51, 0.99358445)
(51