# In [None]:
import sys
import io
import os.path
import re
import tarfile
from datetime import datetime
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from pprint import pprint

import smart_open

# ##################################################
#
#  Always run this cell
#
#  and either cell 2 (to create a baseline)
#      or cell 3 to load the baseline
#
# Before you run the iterations
# ##################################################


# ##################################################
#             PARAMETERS TO PLAY WITH
#
# decay = used like μ in the algorithm (see notes below)
# num_topics = number of topics to start with
# num_iterations = max number of iterations to run the ITMTF algorithm
# ##################################################


# Decay setting, used like μ in the algorithm.
#   - valid range is 0 to 1; values in .5 - 1 are guaranteed to converge
#   - .5 is the model's default
#   - the closer to 1, the more it behaves like a lower μ
#     (decay = 1 is like μ = 0)
lda_decay = 0.5

# Number of topics to start with; per the article, 30 is a good start.
num_topics = 30
# How many buffers to add on each iteration.
num_buffers = 5

# Upper bound on the number of iterations to run - the article used 5.
num_iterations = 1

# ##################################################
#             Other parameters
#  used to load the data
#  or default values for the LDA algorithm
# ##################################################
# Input parameters.
# The paths are assembled with os.path.join so the platform's own separator
# is used: on Windows this yields exactly ".\LDA_data\...", and the notebook
# also runs on systems where "\" is not a path separator.
data_dir = os.path.join(".", "LDA_data")
documents_path = os.path.join(data_dir, "LDAreduced.csv")
vocab_path = os.path.join(data_dir, "LDAwordseries.csv")
# Joining with an empty last component keeps the trailing separator, so a
# file name can still be appended directly to save_path.
save_path = os.path.join(data_dir, "")

# Model parameters.
num_docs = 0    # placeholder; set to the real document count once the corpus is built
num_words = 0   # placeholder; set to the real vocabulary size once the dictionary is built
chunksize = 2000
passes = 100
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

#docs = []
#bow = []     # array of bow for doc, used to get probability

#docs_per_timeslice = []

#tokentoword = {}  # used to visualize the results of the model

# as the model's vocab list is not in the same order as our predefined counts we will create these look up tables
#   to create timeslicetokencounts
#
#vocabtowordindex = {}     # dict of the prepped vocabulary words to INDEX used to create wordindextotoken
#wordindextotoken = {}     # will be used to create timeslicetokencounts
#timeslicevocabcounts = [] # an array of timeslices, each element contains an array of word counts for that timeslice
#                          # used to create timeslicetokencounts
#timeslicetokencounts = [] # an array of timeslices, each element contains an array of dictionary token counts for that timeslice
#                          # this one will be used by the iteration

# ##################################################
# load the cleansed data into an array of docs
# ##################################################
# Every row is split on ",".  The first three cells joined with "." form the
# row's time-slice key and the fourth cell is the document text.
#
#   docs               - the document text of every row, in file order
#   docs_per_timeslice - one list per run of consecutive rows sharing a
#                        time-slice key; each list holds indices into docs
docs = []
docs_per_timeslice = []
with open(documents_path) as swf:
    count = 0            # index the next document will get in docs
    curtimeslice = None  # key of the run being collected; None until the first row
    curdocs = []         # document indices collected for the current run
    for line in swf:
        if not line.strip():
            # A blank line (typically a trailing one) has no cells to read.
            continue
        cells = line.split(',')
        docslice = cells[0] + "." + cells[1] + "." + cells[2]
        if curtimeslice is None:
            curtimeslice = docslice
        elif docslice != curtimeslice:
            # The key changed: close the finished run and start a new one.
            curtimeslice = docslice
            docs_per_timeslice.append(curdocs)
            curdocs = []
        curdocs.append(count)
        count += 1

        docs.append(cells[3])
    if curdocs:
        # Flush the last run; an empty file contributes no time slice.
        docs_per_timeslice.append(curdocs)
# No explicit close is needed: the with-statement closes the file.
print('Number of time slices with docs: %d' % len(docs_per_timeslice))

# ##################################################
# load the cleansed vocabulary into vocabtowordindex
#      and timeslicevocabcounts
# ##################################################
# The first row is the header: its cells (minus the first, which is the
# header column, and the last, which is a blank cell added by the cleansing
# process) are the vocabulary words.  Every following row is one time slice
# holding one count per vocabulary word, in the same column order.
#
#   vocabtowordindex     - vocabulary word -> 0-based column position
#   timeslicevocabcounts - one list of counts per time slice
#                          NOTE: the counts are kept as the strings read from the file
timeslicevocabcounts = []
vocabtowordindex = {}
with open(vocab_path) as vwf:
    header_seen = False
    for line in vwf:
        if not line.strip():
            # A blank line would otherwise be recorded as an empty time slice.
            continue
        # Drop the header column and the trailing blank cell.
        cells = line.split(',')[1:-1]
        if not header_seen:
            header_seen = True
            vocabtowordindex = {word: index for index, word in enumerate(cells)}
        else:
            timeslicevocabcounts.append(cells)

# No explicit close is needed: the with-statement closes the file.
print('Number of time slices: %d' % len(timeslicevocabcounts))
print('Number of time vocab: %d' % len(vocabtowordindex))

# ##################################################
# create the dictionary
# ##################################################
# Tokenise every document on whitespace and let gensim assign the token ids.
doctokens = list(map(str.split, docs))
dictionary = Dictionary(doctokens)

# Bag-of-words representation of the documents.
bow = [dictionary.doc2bow(tokens) for tokens in doctokens]


# ##################################################
# create the corpus
# ##################################################
# Built independently of bow, so the two lists do not share any objects.
corpus = list(map(dictionary.doc2bow, doctokens))
#print (corpus)

num_docs = len(corpus)
print('Number of documents: %d' % num_docs)

# ##################################################
# create the wordindextotoken
# maps a vocabulary word's column index to its dictionary token id,
#   so the per-timeslice counts can be reordered into timeslicetokencounts
#   for the iteration
# ##################################################
wordindextotoken = {
    vocabtowordindex[dictionary[token_id]]: token_id
    for token_id in range(len(dictionary))
}
# NOTE(review): id2token is read only after the dictionary[...] lookups above,
# as in the original ordering; gensim appears to fill it lazily - confirm.
tokentoword = dictionary.id2token
num_words = len(tokentoword)
print('Number of unique tokens: %d' % num_words)

# ok we now have the wordindextotoken map
# let's use that to create timeslicetokencounts from timeslicevocabcounts
#
# Every time slice gets its OWN reordered row.  Binding timeslicetokencounts
# to the same list object as timeslicevocabcounts and permuting in place
# would overwrite cells that have not been read yet, which both scrambles
# the counts and destroys timeslicevocabcounts.
timeslicetokencounts = []
for vocab_row in timeslicevocabcounts:
    # Start from a copy so the row keeps its length and any position that is
    # not a mapping target keeps its value.
    token_row = list(vocab_row)
    for word_index, word_count in enumerate(vocab_row):
        # Move the count from its vocabulary column to its dictionary token id.
        token_row[wordindextotoken[word_index]] = word_count
    timeslicetokencounts.append(token_row)
        

# create the probability arrays for buffer topics that are added:
#   bufferprob - 1/len(tokentoword) for every token
#   zeroprobs  - 0.0 for every token
# The division stays inside the comprehension so an empty tokentoword simply
# produces two empty lists.
bufferprob = [1/len(tokentoword) for _ in tokentoword]
zeroprobs = [0.0 for _ in tokentoword]
