In [28]:
import sys
import io
import os.path
import re
import tarfile
from datetime import datetime
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from pprint import pprint

import smart_open

# ##################################################
#
#  Always run this cell
#
#  and either cell 2 (to create a baseline)
#      or cell 3 to load the baseline
#
# Before you run the iterations
# ##################################################


# ##################################################
#             PARAMETERS TO PLAY WITH
#
# decay = used like μ in the algorithm (see notes below)
# num_topics = number of topics to start with
# num_iterations = max number of iterations to run the ITMTF algorithm
# ##################################################


# decay from 0 to 1, .5 - 1 guarenteed to converge
# .5 is model's default
# closer to 1, like a lower μ
#      decay = 1 is like μ = 0
lda_decay = .5    

# number of topics to start with, per the article, 30 is a good start
num_topics = 30
num_buffers = 5   # how many buffers to add each iteration

# max number of iterations to run - the article used 5
num_iterations = 1

# ##################################################
#             Other parameters 
#  used to load the data
#  or default values for the LDA algorithm
# ##################################################
#input parameters
documents_path = ".\\LDA_data\\LDAreduced.csv"
vocab_path = ".\\LDA_data\\LDAwordseries.csv"
save_path = ".\\LDA_data\\"

# model parameters
num_docs = 0
num_words = 0
chunksize = 2000
passes = 100
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

#docs = []
#bow = []     # arrray of bow for doc, used to get probability

#docs_per_timeslice = []

#tokentoword = {}  # used to visualize the results of the model

# as the model's vocab list is not in the same order as our predefined counts we will create these look up tables
# how to use:
#    use the timeslicevocabcounts list to run the ITMTF iteration  
#      (index on the interation we are running)
#      (the ITMFT algorithm will use the index of the count to represent each word)
#    the ITMTF iteration will return new topic(s) using the index of the count array as a representative for the word 
#    use the index to find the model token with wordindextotoken
#vocabtowordindex = {}     # dict of the preped vocabulary words to INDEX used to create wordindextotoken
#wordindextotoken = {}     # will be used to get the model's token after each ITMFT iteration
#timeslicevocabcounts = [] # an array of timeslices, each element contains an array of word counts for that timeslice


# ##################################################
# load the cleansed data into an array of docs
# ##################################################
docs = []
with open(documents_path) as swf:
    docs_per_timeslice = []
    tempslice = []
    count = 0
    curtimeslice = "2000.7.1"
    tempslice.append(curtimeslice)
    curdocs = []
    firstime = 0
    for line in swf:
        cells = line.split(',')
        docslice = cells[0] + "." + cells[1] + "." + cells[2]
        if firstime == 0 :
            firstime = 1
            curtimeslice = docslice
        if docslice != curtimeslice :
            curtimeslice = docslice
            docs_per_timeslice.append(curdocs)
            curdocs = []
        curdocs.append(count) 
        count += 1
        
        docs.append(cells[3])
    docs_per_timeslice.append(curdocs)
swf.close
print('Number of time slices with docs: %d' % len(docs_per_timeslice))

# ##################################################
# load the cleansed vocabulary into vocabtowordindex
#      and timeslicevocabcounts
# ##################################################
# load the cleansed data into an array of docs
header = 0
timeslicevocabcounts = []
vocabtowordindex = {}
with open(vocab_path) as vwf:
    for line in vwf:
        linenumber = 1 # skip the header row
        cells = line.split(',')
        if header == 0 :
            header = 1
            i = 1 # skip header column
            while i < len(cells) - 1:  # the cleansing process adds a black cell at the end
                vocabtowordindex[cells[i]] = i-1  
                #print(cells[i])
                i += 1  
        else :
            wordcount = []
            i = 1 # skip header column
            while i < len(cells)-1:  # the cleansing process adds a black cell at the end
                wordcount.append(cells[i])  # create an array of vocab counts at this timeslice
                i += 1 
            timeslicevocabcounts.append(wordcount)
            
vwf.close
print('Number of time slices: %d' % len(timeslicevocabcounts))
print('Number of time vocab: %d' % len(vocabtowordindex))

# ##################################################
# create the dictionary
# ##################################################
doctokens = [doc.split() for doc in docs]
dictionary = Dictionary(doctokens)
bow = []
# Bag-of-words representation of the documents.
for doc in doctokens :
    bow.append(dictionary.doc2bow(doc))
    
    
# ##################################################
# create the corpus
# ##################################################
corpus = [dictionary.doc2bow(doc) for doc in doctokens]
#print (corpus)

num_docs = len(corpus)
print('Number of documents: %d' % len(corpus))

# ##################################################
# create the wordindextotoken
# a dict so we can take the word back from the iteration and find the dict index
#   to put the probabilities in the right place
# ##################################################
wordindextotoken = {}
i = 0
while i < len(dictionary):  
    wordindextotoken[vocabtowordindex[dictionary[i]]] = i
    i += 1
tokentoword = dictionary.id2token
num_words = len(tokentoword)
print('Number of unique tokens: %d' % len(tokentoword))

# create a probabiltiy array for buffer topics that are added
zeroprobs = []
bufferprob = []
i = 0
while i < len(tokentoword) :
    bufferprob.append(1/len(tokentoword))
    zeroprobs.append(0.0)
    i += 1


Number of time slices with docs: 123
Number of time slices: 123
Number of time vocab: 12517
Number of documents: 2673
Number of unique tokens: 12517


In [29]:
# ##################################################
#             Runs a baseline
# ##################################################

# Make a index to word dictionary.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

model = LdaModel(
    corpus=corpus,
    id2word=tokentoword,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

print('Model Finished')
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# ##################################################
#             Saves the baseline
# ##################################################

file_name = save_path + "baseline.sav"
print (file_name)
model.save(file_name )

Current Time = 18:35:54
Model Finished
Current Time = 18:45:21
.\LDA_data\baseline.sav


In [26]:
# ##################################################
#             Loads the baseline
# ##################################################
file_name = save_path + "baseline.sav"
model = LdaModel.load(file_name)


In [35]:
# ##################################################
#
# The ITMFT algorithm
#
#  Run cell 1
#  and either cell 2 (to create a baseline)
#      or cell 3 to load the baseline
# ##################################################

iteration = 0
oldsignificanttopics = 0
significanttopcs = 1

while iteration < num_iterations and significanttopcs > oldsignificanttopics :
    # create a topic coverage matrix preset to 0
    topiccoverage = []
    i = 0
    while i < len(docs_per_timeslice) :
        y = 0
        thistopic = []
        while y < num_topics:
            thistopic.append(0.0)
            y += 1
        topiccoverage.append(thistopic)
        i += 1
            
    # get the topic coverage per timeslice per doc
    timeslice = 0
    for timeslicedocs in docs_per_timeslice :
        # for each doc in this timeslice
        for doc in timeslicedocs :
            # get the probability matrix
            probs = model.get_document_topics(bow[doc])
            #its a sparse array, prob[0] is the topic and prob[1] is the probabiltiy
            for prob in probs :
                topiccoverage[timeslice][prob[0]] += prob[1]
        timeslice += 1
        
    # ##################################################
    # timeslicevocabcounts - we have the word coverage 
    # topiccoverage - and now we have the topic coverage
    #
    # run the iteration
    # ##################################################
    #      $$$$$$$$$$$$$$$$$
    #      ADD ALGORITHM HERE 
    #      RETURN A LIST OF newtopics = []
    #          topic word probabilities
    #      AND UPDATE THE VARIABLE significanttopcs
    #
    
    # ##################################################
    #
    # using the returned topics probabilies
    #    correct the words to the dictionary index
    # and adding buffers - num_buffers using bufferprob
    # create the prior
    # update num_topics
    # and run the model
    # ##################################################
    
    #$$$ remove this, for now just creating a topic prob list of X from old model as the return
    topics = model.get_topics()
    print(len(topics))
    newtopics = []
    for topic in topics :
        newtopics.append(topic)
    mylambda = model.state.get_lambda()
    print(len(mylambda))
    print (mylambda)
    #$$$ remove this, for now just creating a topic prob list of X from old model as the return
        
    # fix the words
    # REMOVE COMMENTS WHEN ALGORITM IS PLUGGED IN $$$$$$
    """ 
    inputtopics = []
    for onetopic in newtopics :
        index = 0
        temptopic = zeroprobs
        for wordprob in inputtopics :
            temptopic[wordindextotoken.get(count)] = wordprob
            count += 1
        inputtopics.append(zeroprobs)
    """
            
    
    # add the buffers
    z = 0
    while z < num_buffers:
        newtopics.append(bufferprob)
        z += 1
    num_topics = len(newtopics)

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)
    model = LdaModel(
        corpus=corpus,
        id2word=tokentoword,
        chunksize=chunksize,
        alpha='auto',               
        eta=newtopics,                 # preset topic/word
        iterations=iterations,
        num_topics=num_topics,       # added a topic
        passes=passes,
        decay = lda_decay,
        eval_every=eval_every
    )

    mylambda = model.state.get_lambda()
    print(len(mylambda))
    print (mylambda)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

    iteration += 1


40
40
[[5.0618539e+00 6.0041053e-08 3.7574844e+02 ... 6.0041053e-08
  6.0041053e-08 6.0041053e-08]
 [1.6895573e-09 1.6895573e-09 3.4720972e-07 ... 1.6895573e-09
  1.6895573e-09 1.6895573e-09]
 [2.1958781e-09 3.8222163e+00 2.8718840e+01 ... 2.1855671e-09
  2.1855671e-09 2.1855671e-09]
 ...
 [7.9891346e-05 7.9891346e-05 4.0658543e+01 ... 7.9891346e-05
  7.9891346e-05 7.9891346e-05]
 [7.9891346e-05 7.9891346e-05 7.9981313e-05 ... 7.9891346e-05
  7.9891346e-05 7.9891346e-05]
 [7.9891346e-05 7.9891346e-05 7.9891346e-05 ... 7.9891346e-05
  7.9891346e-05 7.9891346e-05]]
Current Time = 19:55:46


KeyboardInterrupt: 

In [9]:
probs = model.get_document_topics(bow[0])
print (probs)

[(0, 0.26613462), (1, 0.031513344), (2, 0.07535973), (8, 0.17425913), (11, 0.19474572), (12, 0.079344586), (14, 0.12846163), (23, 0.032250945)]


In [2]:



mylambda = newmodel.state.get_lambda()
print (mylambda)


topics = newmodel.get_topics()

newtopics = []
for topic in topics :
    newtopics.append(topic)

#lets add a new topic
newprob = []
i = 0
while i < 12517 :
    newprob.append(0)
    i += 1
newprob[0] = .5
newprob[1] = .5


newtopics.append(newprob)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
it1model = LdaModel(
    corpus=corpus,
    id2word=tokentoword,
    chunksize=chunksize,
    alpha='auto',               
    eta=newtopics,                 # preset topic/word
    iterations=iterations,
    num_topics=num_topics+1,       # added a topic
    passes=passes,
    decay = .05,
    eval_every=eval_every
)

mylambda = it1model.state.get_lambda()
print (mylambda)
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)


[[4.0134544e+00 9.1549730e+00 7.7883820e+02 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]
 [3.3333335e-02 3.3333335e-02 1.7366159e+02 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]
 [3.3333335e-02 3.3333335e-02 2.5686323e+01 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]
 ...
 [3.3333335e-02 3.3333335e-02 9.0869951e+00 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]
 [3.3333335e-02 3.3333335e-02 7.4259033e+01 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]
 [5.4003153e+00 3.3333335e-02 2.3440340e+02 ... 3.3333335e-02
  3.3333335e-02 3.3333335e-02]]
Current Time = 16:58:09
[[8.0471684e-05 1.8356158e-04 6.4066490e+01 ... 6.6834934e-07
  6.6834934e-07 6.6834934e-07]
 [5.6854842e-06 5.6854842e-06 2.9620504e-02 ... 5.6854842e-06
  5.6854842e-06 5.6854842e-06]
 [7.3313749e-06 7.3313749e-06 5.6494819e-03 ... 7.3313749e-06
  7.3313749e-06 7.3313749e-06]
 ...
 [2.0620668e-05 2.0620668e-05 4.5938123e-02 ... 2.0620668e-05
  2.0620668e-05 2.0620668e-05]
 [3.1441290e-04 1.9407071e-06 1.364

In [None]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [None]:
mylambda = it1model.state.get_lambda()
print (mylambda)


In [None]:
temp_file = datapath("baseline_model")
newmodel = LdaModel.load(temp_file)

mylambda = newmodel.state.get_lambda()
print (mylambda)

In [None]:
temp_file = datapath("baseline_model")
newmodel = LdaModel.load(temp_file)

topics = model.get_topics()
#print(topics[0])


count = 0
newtopics = []
for topic in topics :
    newtopics.append(topic)
    count += 1
#print(count)

#lets add a new topic
newprob = []
i = 0
while i < 12517 :
    newprob.append(0)
    i += 1
newprob[0] = .5
newprob[1] = .5
print(tokentoword[0])
print(tokentoword[1])
newtopics.append(newprob)

num_topics += 1
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
model = LdaModel(
    corpus=corpus,
    id2word=tokentoword,
    chunksize=chunksize,
    alpha='auto',               
    eta=newtopics,                 # preset topic/word
    iterations=iterations,
    num_topics=num_topics,       # added a topic
    passes=passes,
    eval_every=eval_every
)
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

In [None]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [None]:
# Save model to disk.
temp_file = datapath("it1_model")
model.save(temp_file)
model.show_topic(30, 50)



In [None]:
#temp_file = datapath("baseline_model")
#newmodel = LdaModel.load(temp_file)
#newmodel.show_topic(29, 50) # will blow up if 30

In [None]:
# create a dictionary of probs
doctopic = {}
count = 0
for onebag in bow :
    doctopic[count] = model.get_document_topics(onebag)
    count += 1
print ( len(doctopic))

# get the doc list for this iteration



In [None]:
print (doctopic[0])
    