In [1]:
# Import the required Python libraries
import bnpy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook
from bokeh.core.validation import silence
from bokeh.core.validation.warnings import MISSING_RENDERERS
from bokeh.layouts import column
from IPython.core.display import display, HTML
import bokeh
bokeh.io.reset_output()
bokeh.io.output_notebook()

# indicates to jupyer how the plots are to be displayed and sized and some other
# housekeeping particular to this notebook
display(HTML("<style>div.output_scroll { height: 600em; }</style>"))
silence(MISSING_RENDERERS, True)
%matplotlib inline
plt.rcParams['figure.figsize'] = [15, 3]

data_start = 0
data_init_size = 20000
batch_size = 2000
batchnum = int(data_init_size/batch_size)

all_data = pd.read_csv('../data/anomaly0245.csv')
all_data.drop(all_data.columns[0], inplace=True, axis=1)

init_data = all_data.head(data_init_size)
init_data = bnpy.data.XData.from_dataframe(init_data)

data_set = bnpy.data.XData.from_dataframe(all_data)

batches = []
i = 300000
while i < len(all_data)- batch_size:
    df = all_data.iloc[i:i + batch_size]
    batches.append(bnpy.data.XData.from_dataframe(df))
    i += batch_size

# p = figure(title="Streaming Data", x_axis_label='x', y_axis_label='y', plot_height=350, plot_width=1200)

# add a line renderer with legend and line thickness
# p.line(all_data.index.tolist(), all_data['anomaly'].tolist(), legend_label="Temp.", line_width=2)

# show the results
# show(p)

anomalies = [78, 98, 99, 104, 105, 106, 122, 124, 127, 128, 129, 130, 131, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166]
cleans = [11, 12, 18, 19, 30, 33, 40, 52, 58, 59, 60, 61, 74, 79, 80, 81, 186, 187, 188, 189, 190, 191, 192, 193]



In [2]:
# Bookkeeping containers and counters used while streaming.
chain = 0
x = []
eng_val = []
states = []
num_states = []
step = 5000
i = 300000
print(i)

# Hyperparameters for the initial bnpy fit
# (DP mixture model with a *DiagGauss* observation model).
gamma, sF = 1.0, 1.0
K = 3      # number of components to initialise
nLap = 10  # laps through the data

# NOTE(review): the output directory is labelled "K=10" although K is 3 here.
cold_start_options = dict(
    output_path='/tmp/AsteriskK8/coldstart-K=10/',
    nLap=nLap, nTask=1, nBatch=batchnum, convergeThr=0.0001,
    gamma0=gamma, sF=sF, ECovMat='eye',
    K=K, initname='randexamplesbydist', ts=True, debug=False,
)
cold_start_model, cold_info_dict = bnpy.run(
    init_data, 'DPMixtureModel', 'DiagGauss', 'memoVB',
    **cold_start_options)

# Initial plotting state: the cold-start data against its row positions, plus
# empty placeholders that the streaming loop fills in batch by batch.
y = np.squeeze(init_data.X)
x = list(range(len(init_data.X)))
x_batches, x_batch_post, x_batch_pre = [], [], []
K_model, K_states, index = [], [], []

# The warm-start chain begins from the cold-start result.
warm_start_model, warm_info_dict = cold_start_model, cold_info_dict

300000
  --ts
Dataset Summary:
X Data
  total size: 20000 units
  batch size: 2000 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = randexamplesbydist
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/coldstart-K=10/1
    0.100/10 after      0 sec. |    207.6 MiB | K    3 | loss -1.250621183e+00 |  
    0.200/10 after      0 sec. |    207.6 MiB | K    3 | loss -1.356998690e+00 |  
    0.300/10 after      0 sec. |    207.6 MiB | K    3 | loss -1.406078532e+00 |  
    1.000/10 after      0 sec. |    207.6 MiB | K    3 | loss -1.469587789e+00 |  
    2.000/1

In [3]:
# Show the attribute dictionary of the observation model's posterior.
# (Use `.Prior` instead of `.Post` to inspect the prior.)
obs_posterior = warm_start_model.obsModel.Post
vars(obs_posterior)

{'K': 3,
 'D': 1,
 '_FieldDims': {'nu': 'K', 'kappa': 'K', 'm': ('K', 'D'), 'beta': ('K', 'D')},
 'doCollapseK1': False,
 'nu': array([6669.67853958, 6669.66963444, 6669.65182598]),
 'kappa': array([6666.67863958, 6666.66973444, 6666.65192598]),
 'm': array([[25.54308883],
        [25.54308883],
        [25.54308883]]),
 'beta': array([[20.55681059],
        [20.55679775],
        [20.55677204]])}

In [4]:
# Reset Bokeh's output state and direct plots to the notebook. The original
# cell called output_notebook() twice in a row; one call is sufficient.
bokeh.io.reset_output()
bokeh.io.output_notebook()

# One empty figure per panel, stacked vertically. Glyphs are attached later and
# pushed to the rendered layout through the notebook handle `target`.
panel_titles = ["Dataset", "Sufficient Statistics", "K1", "K2", "K3"]
p1, p2, p3, p4, p5 = (
    figure(title=panel_title, plot_height=250, plot_width=1200)
    for panel_title in panel_titles
)
p = column(p1, p2, p3, p4, p5)
target = show(p, notebook_handle=True)


In [5]:
# Not used by the loop below; kept in case other cells read them.
st_idx = int(data_init_size/batch_size)
ed_idx = int(data_init_size/batch_size) + 200

# Row of `all_data` where `batches` begins; must match the offset used when
# the batches were built.
stream_offset = 300000

# Stream the batches: for each one, plot the state carried over from the
# previous pass, summarise the batch under the current model, warm-start a new
# fit on the batch, then summarise it again under the updated model.
for ii, abatch in enumerate(batches):
    # Responsibilities of the incoming batch under the model learned so far;
    # one column per panel K1..K3.
    LP = warm_start_model.calc_local_params(abatch)
    K_resp = [LP["resp"][:, k] for k in range(3)]

    # Add this pass's glyphs and push them to the rendered layout.
    # x, y, index, x_batch_pre and x_batch_post still hold the values set at
    # the end of the previous pass (or the cold-start cell on the first pass).
    # NOTE(review): `index` is an empty list on the first pass and a scalar
    # afterwards, so the K1..K3 points are not spread along x, and K_resp
    # belongs to the current batch while `index` belongs to the previous one
    # -- confirm this is intended.
    line1 = p1.line(x=x, y=y, color='blue', name='g1', line_width=1)
    line2 = p2.scatter(x=index, y=x_batch_post, color='red', name='g2')
    line3 = p2.scatter(x=index, y=x_batch_pre, color='blue', name='g3')
    # Each renderer gets its own name (previously all three were 'g4').
    line4 = p3.scatter(x=index, y=K_resp[0], color='red', name='g4')
    line5 = p4.scatter(x=index, y=K_resp[1], color='red', name='g5')
    line6 = p5.scatter(x=index, y=K_resp[2], color='red', name='g6')
    push_notebook(handle=target)

    # Rows of the full data set that correspond to this batch.
    start_idx = stream_offset + ii * batch_size
    end_idx = start_idx + batch_size
    new_dataset = data_set.make_subset(example_id_list=list(range(start_idx, end_idx)))

    # Sufficient statistics of the batch under the previously learned model.
    SS_pre = warm_start_model.get_global_suff_stats(abatch, LP)
    x_batch_pre = np.squeeze(SS_pre.x)
    xx_batch_pre = np.vstack([SS_pre.xx])

    # Warm start: the first batch initialises from the cold-start run, every
    # later batch from the previous warm-start run.
    # NOTE(review): the directory is labelled "K=10" although K is set elsewhere.
    output_path = f'/tmp/AsteriskK8/warmstart-K=10/b{ii}'  # one directory per batch
    if ii == 0:
        warm_init_path = cold_info_dict['task_output_path']
    else:
        warm_init_path = warm_info_dict['task_output_path']
    warm_start_model, warm_info_dict = bnpy.run(
        new_dataset, 'DPMixtureModel', 'DiagGauss', 'memoVB',
        output_path=output_path,
        nLap=nLap, nTask=1, nBatch=batchnum, convergeThr=0.0001,
        gamma0=gamma, sF=sF, ECovMat='eye',
        K=K, initname=warm_init_path, ts=True, debug=True)

    # Sufficient statistics of the same batch under the newly learned model.
    LP = warm_start_model.calc_local_params(abatch)
    SS_post = warm_start_model.get_global_suff_stats(abatch, LP)
    # One-element lists kept under their original names for any later cells.
    LPanomaly = [LP]
    SSanomaly = [SS_post]
    x_batch_post = np.squeeze(SS_post.x)
    xx_batch_post = np.vstack([SS_post.xx])
    # Number of entries of the post-update `x` statistic that exceed 1.
    K_model = np.sum(x_batch_post > 1)

    # State carried into the next pass for plotting.
    index = int(ii * batch_size)
    y = np.squeeze(abatch.X)
    n_rows = len(abatch.X)
    x = list(range(ii * n_rows, (ii + 1) * n_rows))

  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = /tmp/AsteriskK8/coldstart-K=10/1
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b0/1
    0.100/10 after      0 sec. |    209.7 MiB | K    3 | loss -5.195510989e-01 |  
    0.200/10 after      0 sec. |    209.7 MiB | K    3 | loss -8.511151063e-01 |  
    0.300/10 after      0 sec. |    209.7 MiB | K    3 | loss -1.017697797e+00 |  
    1.000/10 after      0 sec. |    209.7 MiB | K    3 | loss -1.407265868e+00 |  
 



    3.000/10 after      0 sec. |    209.7 MiB | K    3 | loss -1.407265861e+00 | Ndiff    0.000 
    4.000/10 after      0 sec. |    209.7 MiB | K    3 | loss -1.407265861e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    209.7 MiB | K    3 | loss -1.407265861e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = /tmp/AsteriskK8/warmstart-K=10/b0/1
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b1/1
    0.100/10 after

  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b6/1
    0.100/10 after      0 sec. |    213.3 MiB | K    3 | loss -5.065147104e-01 |  
    0.200/10 after      0 sec. |    213.3 MiB | K    3 | loss -8.405914711e-01 |  
    0.300/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.012893536e+00 |  
    1.000/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.167422651e+00 |  
    2.000/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.167422642e+00 | Ndiff    0.074 
    3.000/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.167422641e+00 | Ndiff    0.001 
    4.000/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.167422641e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    213.3 MiB | K    3 | loss -1.167422641e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units

    1.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801679763e-01 |  
    2.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801678848e-01 | Ndiff    0.160 
    3.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801678646e-01 | Ndiff    0.007 
    4.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801678634e-01 | Ndiff    0.000 
    5.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801678633e-01 | Ndiff    0.000 
    6.000/10 after      0 sec. |    211.0 MiB | K    3 | loss -4.801678633e-01 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initiali

    5.000/10 after      0 sec. |    213.2 MiB | K    3 | loss -5.858193999e-01 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = /tmp/AsteriskK8/warmstart-K=10/b16/1
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b17/1
    0.100/10 after      0 sec. |    213.7 MiB | K    3 | loss -2.530177559e-01 |  
    0.200/10 after      0 sec. |    213.7 MiB | K    3 | loss -2.933978701e-01 |  
    0.300/10 after      0 sec. |    213.7 Mi

    0.100/10 after      0 sec. |    221.0 MiB | K    3 | loss -5.166732874e-01 |  
    0.200/10 after      0 sec. |    221.0 MiB | K    3 | loss -8.538593867e-01 |  
    0.300/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.028553359e+00 |  
    1.000/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.418903015e+00 |  
    2.000/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.418903012e+00 | Ndiff    0.096 
    3.000/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.418903011e+00 | Ndiff    0.000 
    4.000/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.418903011e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    221.0 MiB | K    3 | loss -1.418903011e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-

    4.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.201346060e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.201346060e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = /tmp/AsteriskK8/warmstart-K=10/b27/1
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b28/1
    0.100/10 after      0 sec. |    224.6 MiB | K    3 | loss -4.822519003e-01 |  
    0.200/10 after      0 sec.

  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b33/1
    0.100/10 after      0 sec. |    220.2 MiB | K    3 | loss -4.924925271e-01 |  
    0.200/10 after      0 sec. |    220.2 MiB | K    3 | loss -7.957047959e-01 |  
    0.300/10 after      0 sec. |    220.2 MiB | K    3 | loss -9.342893474e-01 |  
    1.000/10 after      0 sec. |    220.2 MiB | K    3 | loss -1.209415895e+00 |  
    2.000/10 after      0 sec. |    220.2 MiB | K    3 | loss -1.209415881e+00 | Ndiff    0.075 
    3.000/10 after      0 sec. |    220.2 MiB | K    3 | loss -1.209415881e+00 | Ndiff    0.000 
    4.000/10 after      0 sec. |    220.2 MiB | K    3 | loss -1.209415881e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    220.2 MiB | K    3 | loss -1.209415881e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 unit

    5.000/10 after      0 sec. |    222.2 MiB | K    3 | loss -1.213912085e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-Wishart prior on each dimension
  Wishart params 
    nu = 3 
  beta = [ 1] 
  Expectations
  E[  mean[k]] = 
  [ 0]
  E[ covar[k]] = 
  [[1.]]
Initialization:
  initname = /tmp/AsteriskK8/warmstart-K=10/b38/1
  K = 3 (number of clusters)
  seed = 1607680
  elapsed_time: 0.0 sec
Learn Alg: memoVB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/AsteriskK8/warmstart-K=10/b39/1
    0.100/10 after      0 sec. |    222.6 MiB | K    3 | loss -4.507380456e-01 |  
    0.200/10 after      0 sec. |    222.6 MiB | K    3 | loss -7.694941775e-01 |  
    0.300/10 after      0 sec. |    222.6 Mi

    0.100/10 after      0 sec. |    224.1 MiB | K    3 | loss -4.667668909e-01 |  
    0.200/10 after      0 sec. |    224.1 MiB | K    3 | loss -7.712602144e-01 |  
    0.300/10 after      0 sec. |    224.1 MiB | K    3 | loss -9.260003383e-01 |  
    1.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.225356103e+00 |  
    2.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.225356104e+00 | Ndiff    0.081 
    3.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.225356103e+00 | Ndiff    0.001 
    4.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.225356103e+00 | Ndiff    0.000 
    5.000/10 after      0 sec. |    224.1 MiB | K    3 | loss -1.225356103e+00 | Ndiff    0.000 
... done. converged.
  --ts
Dataset Summary:
X Data
  total size: 400000 units
  batch size: 200 units
  num. batches: 10
Allocation Model:  DP mixture with K=0. Concentration gamma0= 1.00
Obs. Data  Model:  Gaussian with diagonal covariance.
Obs. Data  Prior:  independent Gauss-