In [1]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np
import math

# loads the pre-processed variables
# NOTE(review): several names used later in this notebook (e.g. save_path, LdaModel,
#               datetime, corpus, tokentoword, chunksize, iterations, passes, eval_every)
#               are not defined in this cell -- presumably they come from this helper; confirm.
%run load_helper.ipynb
#

# ##################################################
#
# SET UP PARAMETERS
# next 2 sections 
# (3rd is optional, for running the "classic" algorithm)
#
# then hit run
#
# ##################################################


# ##################################################
#
# pick a baseline
#        baselines = ["10topics", "15topics", "20topics", "25topics", "30topics"]
#
# name your run (used to store the iterations); models will be saved as myrun0.sav, myrun1.sav, etc.
#
# ##################################################
mybaseline = "30baseline2"
runname = "Testcleanupclassic"
#runname = "D75T30Improved20"

# ##################################################
#
# set your global parameters
#
# NOTE if you want to run the algorithm from the paper
#      also set the parameters in the next section
#
# ##################################################

# below are default params that can be played with
lda_decay = 0.5      # how much the prior influences the iteration (0 - 1)
                     #      mathematically anything less than .5 is not guaranteed to converge
                     #      however we have tuned the model to work down to .001 (and possibly lower)
                     # if you do set this too low, Gensim will display warnings

num_iterations = 14  # NOTE this is in addition to baseline so if you want 15 iterations set this to 14
                     # if you are using the "improved algorithm" set this to about 14

# lag of 5 is mentioned in the paper, and seems to work with trial runs
the_lag = 5

# Default to the "improved" algorithm. Defining the flag here keeps the
# `classical == 'y'` checks further down from raising NameError when the
# optional overrides below are commented out.
classical = 'n'

# ##################################################
#
# OPTIONAL
# if you want to run an iteration with the paper's algorithm (with splitting and buffers)
# set your "classical" parameters
#
# ##################################################

# comment out the next three lines to go back to the "improved" version
classical = 'y'      # leave this line active to run the classical algorithm
num_iterations = 4   # if you are going to use the algorithm in the paper (with splitting) set to around 4
lda_decay = 0.001    # how much the prior influences the iteration (0 - 1)

num_buffers = 0             # how many buffers to add each iteration
drop_percent = 0.95         # as indicated in the paper, drop below .95 percent
low_threshold = 0.05        # threshold for the p-values .05 is pretty much expected
ignore_little_counts = 0.2  # if pos/neg words dominate, ignore the other topic



# ##################################################
#
# The ITMTF algorithm
#
# ##################################################

# Iteration counter; the loop below also uses it to number the saved model files.
iteration = 0

#
# load the baseline selected in the parameters above
#
# NOTE(review): save_path and LdaModel are not defined in this cell --
#               presumably provided by load_helper.ipynb; confirm.
file_name = save_path + mybaseline + ".sav"
model = LdaModel.load(file_name)

# A single copy is enough to detach the matrix from the model;
# the number of rows is the number of topics.
topics = model.get_topics().copy()
num_topics = len(topics)
print("Number of Topics = ", num_topics)

# Per-run statistics. They are never appended to in this cell, so they are
# presumably filled by the %run'ed ITMTF notebooks -- confirm. run_confidence
# is written to "<runname>.confidence.csv" at the end of the run.
run_purity = []
run_confidence = []

# Best-result trackers reported and saved after the run.
# NOTE(review): 2.0 looks like a "worse than any real value" starting point
#               for mostsigconf -- confirm against the ITMTF notebooks.
mostsigtopics = []
mostsigtopicwords = []
mostsigconf = 2.0

# Main loop. Each pass:
#   1. runs the selected ITMTF notebook against the current model/topics,
#   2. retrains LDA with `newtopics` as the eta prior,
#   3. saves the retrained model as <runname><iteration>.sav.
# NOTE(review): `newtopics` is never assigned in this cell, so the %run'ed notebook
#               must leave it (and possibly an updated `num_topics`) in this
#               namespace -- confirm. corpus, tokentoword, chunksize, iterations,
#               passes and eval_every are also defined outside this cell.
while iteration < num_iterations  :
    # timestamp each pass so long runs can be followed in the output
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("\nIteration start time = ", current_time)
    
    # 
    # run either the "classical" or the "improved"
    #
    if classical == 'y'  :
        %run itmtf_withsplit.ipynb 
    else :
        %run itmtf_improved.ipynb 
    
    #
    # run the model
    #
    model = LdaModel(
        corpus=corpus,
        id2word=tokentoword,
        chunksize=chunksize,
        alpha='auto',               
        eta=newtopics,                 # preset topic/word
        iterations=iterations,
        num_topics=num_topics,         # added buffer topics
        passes=passes,
        decay = lda_decay,
        eval_every=eval_every
    )

    # refresh the topic matrix and topic count from the newly trained model
    topics = model.get_topics().copy()
    num_topics = len(topics)
 
    # persist this iteration's model; the file name carries the zero-based iteration number
    file_name = runname + str(iteration) 
    path_name = save_path + file_name + ".sav"
    print("Model " + file_name + " - saved for visualization")
    model.save(path_name )
    iteration += 1

#
# after the iterations are done
# run the algorithm once more to gather stats from the last model
#
# No LDA retraining follows this call: it only lets the selected notebook
# process the model produced by the final loop pass.
if classical == 'y'  :
    %run itmtf_withsplit.ipynb 
else :
    %run itmtf_improved.ipynb 
#

# report the wall-clock time at which the whole run finished
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Run Complete = ", current_time)
 
# Console report: the list of significant topics, then for each one its
# first ten words (token ids translated through tokentoword).
print ("Significant Topics", mostsigtopics)
for topic_pos, topic_entry in enumerate(mostsigtopics):
    print("Top Topic Words: ", topic_entry)
    # element 0 of each word entry is the token id
    top_entries = [mostsigtopicwords[topic_pos][rank] for rank in range(10)]
    words = " " + "".join(tokentoword[entry[0]] + " " for entry in top_entries)
    print(words)

#
# save the stats
#
# One line per significant topic: the topic entry, then up to ten of its words,
# each tagged "B+" when the word's pearsoncorr entry is positive and "G+" otherwise.
path_name = save_path + runname + ".sigwords.csv"
# `with` guarantees the file is closed even if a lookup below raises
with open(path_name, "w") as fo:
    for ii, sigtopic in enumerate(mostsigtopics):
        fo.write(str(sigtopic))
        words = " "
        # slice instead of indexing 0..9 so a topic with fewer than ten words does not raise
        for entry in mostsigtopicwords[ii][:10]:
            token_id = entry[0]
            tag = "B+" if pearsoncorr[token_id] > 0 else "G+"
            words = words + "," + tag + ": " + tokentoword[token_id]
        fo.write(words + "\n")

# Write the per-iteration confidence values, one per line, with no trailing newline.
path_name = save_path + runname + ".confidence.csv"
# `with` guarantees the file is closed even if the write fails
with open(path_name, "w") as fo:
    fo.write("\n".join(str(num) for num in run_confidence))



Number of time slices with docs: 123
Number of time slices: 123
Number of time vocab: 12517
Number of documents: 2673
Number of unique tokens: 12517


In [None]:
# Save model to disk.
temp_file = datapath("it1_model")
model.save(temp_file)
# Topic ids are zero-based, so id 30 does not exist in a 30-topic model.
# Clamp to the last valid id so this cell works for any topic count.
model.show_topic(min(30, model.num_topics - 1), 50)



In [None]:
#temp_file = datapath("baseline_model")
#newmodel = LdaModel.load(temp_file)
#newmodel.show_topic(29, 50) # will blow up if 30

In [None]:
# Map each document's position in `bow` to the topic distribution the
# current model reports for that document.
doctopic = {
    doc_index: model.get_document_topics(doc_bag)
    for doc_index, doc_bag in enumerate(bow)
}
count = len(doctopic)
print ( len(doctopic))

# get the doc list for this iteration



In [None]:
# Spot check: show what model.get_document_topics returned for the first document (key 0).
print (doctopic[0])
    