## Topic Model v3

#### Import

We utilize the gensim library for topic modeling algorithms.

In [1]:
# processing
import csv
import itertools
import math
import operator
import os
import re
import string
from collections import Counter
from operator import methodcaller
from pprint import pprint

import numpy as np
import pandas as pd

# gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import HdpModel
from gensim.utils import simple_preprocess

# plotting tools
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

# scikit-learn
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



#### Preprocessing

Specify the path to the R&R program output, the terms to blacklist, and the term levels to consider. The accepted tokens are grouped by document; a dictionary and a bag-of-words corpus are then built from them.

In [2]:
# Terms to exclude. Only the first row of the blacklist file is read; an empty
# file yields an empty blacklist instead of raising StopIteration.
with open(os.path.join("tools", "blacklist.csv"), 'r') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# Set copy of the blacklist for fast membership tests inside the row loop.
blacklistSet = set(blacklist)

# Accepted term levels; a token's level is the number of "_" it contains.
levels = [1, 2, 3]

# format [term, orig, sentence, docID]
inPath = "raw.csv"

# Accepts strings made only of word characters that are not digits
# (letters and underscores). Compiled once instead of once per term part.
partPattern = re.compile(r'[^\W\d]*$')

# docID -> list of accepted tokens, in the order they appear in the file
docTokens = dict()

with open(inPath, 'r') as inFile:
    inReader = csv.reader(inFile)

    # ignore headers
    next(inReader, None)

    for inRow in inReader:

        # skip blank or truncated rows rather than failing with IndexError
        if len(inRow) < 4:
            continue

        term = inRow[0]
        sentence = inRow[2]
        docID = inRow[3]

        # find acceptable tokens only; empty parts (e.g. from "a::b" or "a:")
        # are dropped so they cannot produce tokens like "a__b" or "a_"
        parts = [
            t for t in term.split(":")
            if t and partPattern.match(t) and t not in blacklistSet
        ]
        token = "_".join(parts)

        # calculate new term level
        level = token.count("_")

        # if acceptable, add to dictionary
        if level in levels and token not in blacklistSet and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)

docIDs = list(docTokens.keys())
data = list(docTokens.values())


In [3]:
# Token lists, one per document, as collected by the preprocessing cell.
texts = data

# Vocabulary mapping built from every document's tokens.
dictionary = corpora.Dictionary(texts)

# Bag-of-words representation of each document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

#### LDA Recursion

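Repeatedly fit a topic model (an HDP model converted to its suggested LDA model) on the documents that are still unassigned, until every document has a strength *p* of at least `resistance` to some topic.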

In [40]:
id_topic_ratio = 0.1   # topics requested per still-unassigned document on re-runs
resistance = 0.3       # a document is kept once its best topic strength reaches this
done = False
numTops = 10           # value passed as T to HdpModel on the first run

topicPath = "topics.csv"
relationPath = "relations.csv"

# Write the header and close the handle immediately. The per-run rows are
# appended to the same path by pandas below; keeping a second, buffered handle
# open on the file would flush the header at offset 0 when it is finally closed
# and overwrite the beginning of the rows pandas already wrote.
with open(topicPath, 'w') as topicFile:
    topicOut = csv.writer(topicFile, lineterminator = '\n')
    topicOut.writerow(["", "run", "topic", "terms", "p"])

run = 1
totalTopics = 0
averageCoherence = 0
badIDs = docIDs

# Train on a separate name so the full `corpus` built earlier is not
# overwritten; re-running this cell then starts from all documents again.
runCorpus = corpus

# The context manager closes the relations file even if a run raises.
with open(relationPath, 'w') as relationFile:
    relationOut = csv.writer(relationFile, lineterminator = '\n')
    relationOut.writerow(["run", "topic", "IDs", "ID/strength"])

    while not done:

        print("run #" + str(run))

        doc2topic = dict()   # reset each run; not populated in this cell
        topic2doc = dict()   # topic -> [(docID, strength), ...] for this run

        # documents to tag this run; badIDs is rebuilt from scratch
        oldIDs = badIDs.copy()
        badIDs = list()

        totalTopics += numTops

        #perform LDA
        hdp = HdpModel(runCorpus, dictionary, T=numTops)
        lda_model = hdp.suggested_lda_model()

        coherenceModel = CoherenceModel(model=lda_model, texts=data, dictionary=dictionary, coherence='c_v')
        coherence = coherenceModel.get_coherence()

        # running mean of coherence over all runs, weighted by topics per run
        averageCoherence = ((totalTopics-numTops) * averageCoherence + numTops*coherence)/totalTopics

        # tag documents
        for ID in oldIDs:

            doc = docTokens[ID]
            vec = dictionary.doc2bow(doc)

            # pairs of (topic, strength) for this document
            store = lda_model[vec]

            bestRel = 0

            # build relations
            for pair in store:

                bestRel = max(bestRel, pair[1])
                topic2doc.setdefault(pair[0], []).append((ID, pair[1]))

            # collect bad docs
            if bestRel < resistance:
                badIDs.append(ID)

        #write terms
        top_words_per_topic = []
        for t in range(lda_model.num_topics):
            top_words_per_topic.extend([(run, t, ) + x for x in lda_model.show_topic(t, topn = 10)])

        # appended below the header written above; the DataFrame index fills
        # the unnamed first column
        pd.DataFrame(top_words_per_topic, columns=['Run', 'Topic', 'Word', 'P']).to_csv(topicPath, mode='a', header=False)

        # print relations, strongest document first
        for topic in topic2doc:
            relationOut.writerow([run, topic, len(topic2doc[topic])]+ sorted(topic2doc[topic], key=operator.itemgetter(1), reverse=True))

        # done?
        if len(badIDs) == 0:
            done = True
            print("Done!")

        # if not, build new corpus from the documents that are still unassigned
        else:
            print(len(badIDs))
            runCorpus = [dictionary.doc2bow(docTokens[docID]) for docID in badIDs]
            numTops = math.ceil(len(badIDs) * id_topic_ratio)
            run += 1


print(totalTopics)
print(averageCoherence)

run #1


  start_time = time.clock()


38
run #2


  start_time = time.clock()


Done!
14
0.8259739656285559


#### Thinning

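Use cosine similarity between the topics' term lists to identify similar topics; pairs above the similarity threshold are printed as candidates for removal.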

In [None]:
def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    Each string is turned into a term-count vector by ``get_vectors``; entry
    ``[i][j]`` of the result is the similarity between string ``i`` and
    string ``j``.
    """
    count_rows = list(get_vectors(*strs))
    return cosine_similarity(count_rows)
    
def get_vectors(*strs):
    """Return term-count vectors for the given strings.

    The vocabulary is learned from the strings themselves, so every returned
    row has one column per distinct term found across all of them.

    Returns:
        A dense 2-D array with one row per input string.
    """
    text = list(strs)
    # The documents must not be passed to the constructor: its first parameter
    # is `input` (a mode selector), and constructor arguments are keyword-only
    # in current scikit-learn, so CountVectorizer(text) raises TypeError there.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

In [None]:
# specify location of topic output
topicPath = os.path.join("data", "topics_init10_rat0.1_res_0.5.csv")

# similarity above which two topics are reported as near-duplicates
similarityThreshold = 0.5

# sequential topic number -> list of that topic's terms, in file order
topic2term = dict()

currentKey = None
topicNumber = 0

with open(topicPath, 'r') as topicFile:
    topicReader = csv.reader(topicFile)

    # skip the header line
    next(topicReader, None)

    for row in topicReader:

        # Expected columns: index, run, topic, term, p. Rows with fewer fields
        # are ignored.
        # NOTE(review): the previous version skipped the first two lines
        # unconditionally; presumably the second one was a malformed line in
        # the exported file -- confirm against the actual data.
        if len(row) < 5:
            continue

        term = row[3]

        # A new topic starts whenever the (run, topic) pair changes; the topic
        # id alone restarts in every run and would merge consecutive runs that
        # each have a single topic.
        key = (row[1], row[2])
        if key != currentKey:
            currentKey = key
            topicNumber += 1
            topic2term[topicNumber] = [term]
        else:
            topic2term[topicNumber].append(term)

topicNums = topic2term.keys()

for t1, t2 in itertools.combinations(topicNums, 2):
    # get_cosine_sim returns the full pairwise matrix; [0][1] is the similarity
    # between the two topics (comparing the whole matrix in an `if` raises
    # "truth value of an array ... is ambiguous").
    similarity = get_cosine_sim(" ".join(topic2term[t1]), " ".join(topic2term[t2]))[0][1]
    if similarity > similarityThreshold:
        print(topic2term[t1])
        print(topic2term[t2])
    
    
    