In [None]:
# Debugging aid: IPython magic that opens a Qt console connected to this kernel.
# NOTE(review): nothing in the cells shown here depends on it — presumably it
# can be skipped on a plain "Run All"; confirm before removing.
%qtconsole

In [1]:
# Import the required Python libraries
import bnpy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import time
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook
from bokeh.core.validation import silence
from bokeh.core.validation.warnings import MISSING_RENDERERS
from bokeh.layouts import column
from IPython.core.display import display, HTML
import bokeh
# Clear Bokeh's current output state, then direct Bokeh output to the notebook.
bokeh.io.reset_output()
output_notebook()

In [2]:
# indicates to jupyer how the plots are to be displayed and sized and some other
# housekeeping particular to this notebook
display(HTML("<style>div.output_scroll { height: 600em; }</style>"))
silence(MISSING_RENDERERS, True)
%matplotlib inline
plt.rcParams['figure.figsize'] = [15, 3]

# Simulation layout: row offsets into the CSV and the sliding-window geometry.
#   rows [init_start, window_size_in_batches * batch_size) -> init_data
#   rows [data_start, data_end)                            -> simulation_data
init_start = 0
data_start = 20000
data_end = 400000
batch_size = 2000
window_size_in_batches = 5

# These values need to make sense together, so check them up front instead of
# failing later inside the training loop.
assert batch_size > 0 and window_size_in_batches > 0, \
    "batch_size and window_size_in_batches must be positive"
assert 0 <= init_start and 0 <= data_start < data_end, \
    "row offsets must be non-negative and data_start must precede data_end"
# The original note required mod(data_init_size, batch_size) == 0, but no
# variable of that name exists. It is read here as the number of initialisation
# rows — TODO confirm this is the intended quantity.
assert (window_size_in_batches * batch_size - init_start) % batch_size == 0, \
    "the initialisation slice must contain a whole number of batches"

# all_data: every row of the CSV, minus the file's first column.
all_data = pd.read_csv('../data/anomaly0245.csv')
all_data = all_data.drop(columns=all_data.columns[0])

# simulation_data: the rows replayed by the simulation.
# Deliberately kept as a plain DataFrame slice rather than an XData object:
# per the author's note, XData subsets contain metadata about the entire data
# set, and this setup tests whether that has any side effects for the algorithm.
simulation_data = all_data.iloc[data_start:data_end]

# init_data: the initialisation rows, wrapped as a bnpy XData object.
init_rows = all_data.iloc[init_start:window_size_in_batches * batch_size]
init_data = bnpy.data.XData.from_dataframe(init_rows)

# Build the sliding windows fed to the training loop: each window spans
# window_size_in_batches * batch_size rows of simulation_data, and consecutive
# windows start batch_size rows apart.
window_len = window_size_in_batches * batch_size

# Largest start offset that still leaves room for a full window. It is
# included in the range below (hence the +1): the previous strict comparison
# skipped the final full window whenever this offset was an exact multiple of
# batch_size, so the last batch of data was never processed.
last_start = len(simulation_data) - window_len

windows = [
    bnpy.data.XData.from_dataframe(simulation_data.iloc[start:start + window_len])
    for start in range(0, last_start + 1, batch_size)
]

# Graph the data for inspection if required
# p = figure(title="Streaming Data", x_axis_label='x', y_axis_label='y', plot_height=350, plot_width=1200)
# add a line renderer with legend and line thickness
# p.line(all_data.index.tolist(), all_data['anomaly'].tolist(), legend_label="Temp.", line_width=2)
# show the results
# show(p)

In [3]:
# Hyper-parameters handed to bnpy.run by the training loop.
K = 25        # initial number of components; per the author this also caps the K the model can develop
nLap = 1000   # forwarded as nLap
gamma = 1.0   # forwarded as gamma0
sF = 1.0      # forwarded as sF

In [4]:
# vars(warm_start_model.obsModel.Post)
# vars(warm_start_model.obsModel.Prior)

In [5]:
# Height and width shared by every panel.
ph = 200
pw = 800

# Re-initialise Bokeh's notebook output once. The original cell called
# output_notebook() twice in a row, which only repeated the same setup.
bokeh.io.reset_output()
output_notebook()

# NOTE(review): plot_height / plot_width were renamed to height / width in
# Bokeh 3 — switch the keywords if this notebook is moved to that version.
p1 = figure(title="Dataset", plot_height=ph, plot_width=pw)
# The remaining panels reuse p1's x_range so that they share one x axis.
p2 = figure(title="Sufficient Statistics", plot_height=ph, plot_width=pw, x_range=p1.x_range)
p3 = figure(title="Average K Resp", plot_height=ph, plot_width=pw, x_range=p1.x_range)
p6 = figure(title="K", plot_height=ph, plot_width=pw, x_range=p1.x_range)

# Stack the panels vertically and keep the notebook handle: the training loop
# passes it to push_notebook to refresh this output in place.
p = column(p1, p2, p3, p6)
target = show(p, notebook_handle=True)

In [15]:
# Streaming simulation: fit a DP mixture model on every sliding window,
# initialising each fit from the previous window's saved result (warm start),
# and push per-window summaries to the Bokeh panels as soon as they are ready.

plot_stride = 100  # only every 100th sample of the newest batch is drawn

# The first window uses bnpy's 'randexamplesbydist' initialisation; afterwards
# iname is replaced by the previous task's output path.
iname = 'randexamplesbydist'
# NOTE(review): opath is assigned but never read in this cell; it is kept in
# case a later cell relies on it.
opath = '/tmp/AsteriskK8/coldstart-K=10/b0'

for ii, window in enumerate(windows):
    start = time.time()

    # One output directory per window.
    # NOTE(review): the directory name says "K=10" although K is configured
    # separately — the path is left untouched so existing results still line up.
    output_path = f'/tmp/AsteriskK8/warmstart-K=10/b{ii}'
    warm_start_model, warm_info_dict = bnpy.run(
        window, 'DPMixtureModel', 'DiagGauss', 'memoVB',
        output_path=output_path,
        nLap=nLap, nTask=1, nBatch=window_size_in_batches, convergeThr=0.0001,
        gamma0=gamma, sF=sF, ECovMat='eye',
        K=K,
        # moves='birth,merge,delete,shuffle',  (left disabled by the author)
        initname=iname,
        ts=True, debug=False, verbose=0)
    # The next window starts from what this one just learned.
    iname = warm_info_dict['task_output_path']
    opath = f'/tmp/AsteriskK8/warmstart-K=10/b{ii + 1}'

    # Per-window summaries of the fitted model.
    LP = warm_start_model.calc_local_params(window)
    SS = warm_start_model.get_global_suff_stats(window, LP)
    K_resp = np.mean(LP["resp"], axis=0)  # mean of the "resp" array over its first axis
    n_comp = warm_start_model.obsModel.Post.K

    # Stack the sufficient statistics and drop length-1 axes. The original
    # squeezed x_window twice and never squeezed xx_window.
    x_window = np.squeeze(np.vstack(SS.x))
    xx_window = np.squeeze(np.vstack(SS.xx))

    # Position (in all_data rows) one past the end of this window.
    index = ii * batch_size + data_start + window_size_in_batches * batch_size

    # Down-sampled view of the newest batch in the window. batch_size replaces
    # the previously hard-coded 2000, and the x positions are generated directly
    # instead of listing the whole window and slicing off its tail.
    y = np.squeeze(window.X[-batch_size:])[::plot_stride]
    x = list(range(index - batch_size, index, plot_stride))

    end = time.time()
    elapsed = end - start  # duration of this iteration up to this point, in seconds

    # Add this window's glyphs to the panels and refresh the displayed output.
    # NOTE(review): every pass adds new renderers to each figure; consider
    # streaming into a shared data source if the run becomes sluggish.
    line1 = p1.line(x=x, y=y, color='blue', name='g1', line_width=1)
    line2 = p2.scatter(x=n_comp * [index], y=x_window, color='blue', name='g1', line_width=1)
    line4 = p3.scatter(x=n_comp * [index], y=K_resp, color='red', name='g4')
    # One-element sequences, so this panel is fed a column of data like the others.
    line7 = p6.scatter(x=[index], y=[n_comp], color='red', name='g4')
    push_notebook(handle=target)

  --ts
Dataset Summary:
X Data
  total size: 10000 units
  batch size: 2000 units
  num. batches: 5
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = randexamplesbydist
  K = 25 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b0/1
    0.200/1000 after      0 sec. |    309.4 MiB | K   25 | loss -5.793934430e-01 |  
    0.400/1000 after      0 sec. |    309.4 MiB | K   25 | loss -8.664658724e-01 |  
    0.600/1000 after      0 sec. |    309.4 MiB | K   25 | loss -1.002940894e+00 |  
    1.000/1000 after      0 sec. |    309.4 MiB | K   25 | loss -1.147098397e+00 |  
    2.0

KeyboardInterrupt: 