In [3]:
# To allow changes in .py files to be reflected in Notebook
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generate Drift Using MOA

Randomly select streams to use for drift stream generation. Check summary statistics to determine value for n_drift during drift generation. 

In [4]:
from util.generate_moa_stream import GenMOAStream
import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'util.plot_stream'

In [18]:
source_dir = '/home/zengt5/AnomalyDriftDetection/data/benchmark/IOPS'
drift_dir = '/home/zengt5/AnomalyDriftDetection/data/synthetic'
moa_path = '/home/zengt5/moa-release-2023.04.0/lib'
num_streams = 6
g = GenMOAStream(source_dir, drift_dir, moa_path, num_streams)

In [19]:
df = g.get_source_summary()
df

Unnamed: 0,filename,len,num_anomalies,total_anom_cover,avg_anomaly_len,percent_anomalies
22,KPI-6a757df4-95e5-3357-8406-165e2bd49360.test....,110876,14,2638,188.428571,0.023792
26,KPI-6efa3a07-4544-34a0-b921-a155bd1a05e8.test....,149148,119,5283,44.394958,0.035421
36,KPI-a07ac296-de40-3a7c-8df3-91f642cc14d0.test....,111307,68,2259,33.220588,0.020295
38,KPI-a8c06b47-cc41-3738-9110-12df0ee4c721.test....,7578,10,127,12.7,0.016759
48,KPI-c69a50cf-ee03-3bd7-831e-407d36c7ee91.test....,149159,59,702,11.898305,0.004706
52,KPI-e0747cad-8dc8-38a9-a9ab-855b61f5551d.test....,8784,10,116,11.6,0.013206


In [20]:
df.describe()

Unnamed: 0,len,num_anomalies,total_anom_cover,avg_anomaly_len,percent_anomalies
count,6.0,6.0,6.0,6.0,6.0
mean,89475.333333,46.666667,1854.166667,50.373737,0.01903
std,65231.651672,43.807153,1993.338849,68.981426,0.010365
min,7578.0,10.0,116.0,11.6,0.004706
25%,34307.0,11.0,270.75,12.098729,0.014094
50%,111091.5,36.5,1480.5,22.960294,0.018527
75%,139687.75,65.75,2543.25,41.601366,0.022918
max,149159.0,119.0,5283.0,188.428571,0.035421


In [21]:
mean_num_anom = 43
percent = np.arange(0.1,0.8,0.1)
percent_anom = mean_num_anom * percent
pd.DataFrame([percent_anom], columns=percent)

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7
0,4.3,8.6,12.9,17.2,21.5,25.8,30.1


The following parameters can be assigned in the next cell to guide characteristics of the generated drift stream:

- `length`: int, total length of new stream
- `p_drift`: float, target percent of data points classified as drift
- `n_drift`: int, target number of drift sequences
- `p_before`: float, target percent of drift coming before anomaly
- `sub_dir`: string, name of subdirectory to export drift stream
- `dataset`: string, descriptor (name) or source dataset for identification
- `mode`: int, indicator for drift assembly method, options {0,1}, default 0
    - Mode 0: variable drift widths and positions
    - Mode 1: uniform drift widths and positions (helpful for high p_drift)
- Returns `output_path, drift_label, positions, streams, seq_before`

Note: Make sure that `sub_dir` exists in data/synthetic prior to generating the code.

Note: If you encounter trouble with generating large percentage drift, you can use `mode=1`.

In [24]:
length = 110633
n_drift = 10
p_drift = 0.35
p_drift_before = 0.5
sub_dir = 'test'
dataset = 'IOPS'

In [29]:
output_path, drift_label, streams, positions, seq_before, w_drift = g.run_generate_grad_stream_moa(
    length, 
    p_drift, 
    n_drift,
    p_drift_before, 
    sub_dir,
    dataset,
    mode=0 # 0 for variable drift widths and positions, 1 for uniform
)

Generating splits...
Done!
Getting stream file cuts...	Done!
Creating intermediate files...	Done!
Recursively generating MOA command...	Done!
Drift filename:  IOPS_grad_p24_n7_b57
Running terminal command...	


{M}assive {O}nline {A}nalysis
Version:  23.04 April 2023
Copyright: (C) 2007-2023 University of Waikato, Hamilton, New Zealand
Web: http://moa.cms.waikato.ac.nz/

                                                                               
Task completed in 0.45s (CPU time)



Stream written to ARFF file /home/zengt5/AnomalyDriftDetection/data/synthetic/test/IOPS_grad_p24_n7_b57.arff
Done!
Generating drift labels...	Done!


Check out [`view_drift_generation.ipynb`](../view_drift.ipynb) for plotting methods can be used to view generated streams

## Update an Existing Data Stream

#### Read data and set up object to generate new data stream

In [43]:
# Read existing data file
from util.plot_stream import PlotStream

source_dir = "data/benchmark/IOPS" # directory for source streams
drift_dir = "data/synthetic/n_drift/n_a50" # directory for drift stream
filename = "IOPS_grad_p34_n22_b50_bu"
d1 = PlotStream(source_dir, drift_dir, filename)

In [44]:
# Create object to generate new data stream
g2 = GenMOAStream(source_dir, drift_dir, moa_path, selected_streams=d1.source_streams)
positions = d1.positions # this variable returns positions from above with [0] appended to the front
streams = d1.streams
seq_before = d1.seq_before
w_drift = d1.w_drift

#### The following values can be modified individually to make changes to the data stream

In [52]:
# View index values
k = 4
w = 3
index = [i for i in range(k-w,k+w,1)]
pd.DataFrame({'positions': positions[k-w:k+w], 'streams (after)': streams[k-w:k+w], 'seq_before': seq_before[k-w-1:k+w-1], 'w_drift': w_drift[k-w-1:k+w-1]}, index=index)

Unnamed: 0,positions,streams (after),seq_before,w_drift
1,4961,0,False,1154
2,10945,3,False,987
3,16412,2,False,2066
4,21576,3,False,455
5,30348,4,True,1916
6,36282,3,False,1461


In [None]:
streams[60]

In [None]:
positions[2] -= 1200

In [46]:
w_drift[53] = 250

In [31]:
seq_before[84] = True

In [None]:
# Remove a drift by selecting the index to remove (r_i)
r_i = 3
positions = positions[:r_i] + positions[r_i+1:]
streams = streams[:r_i] + streams[r_i+1:]
w_drift[r_i] = w_drift[r_i-1] + w_drift[r_i]
w_drift = w_drift[:r_i-1] + w_drift[r_i:]
seq_before = seq_before[:r_i-1] + seq_before[r_i:]

In [390]:
index = [i for i in range(k-w,k+w,1)]
pd.DataFrame({'positions': positions[k-w:k+w], 'streams (after)': streams[k-w:k+w], 'w_drift': w_drift[k-w-1:k+w-1]}, index=index)

Unnamed: 0,positions,streams (after),w_drift
30,98398,5,344
31,108581,4,5097
32,110528,0,24
33,116615,4,1675


#### Generate new data stream based on updated drift characteristics

In [51]:
output_path, drift_label =  g2.assemble_drift_stream(
    positions[1:-1],
    streams,
    w_drift,
    seq_before,
    sub_dir='n_drift/n_a50',
    length=d1.length,
    dataset='IOPS'
)

Getting stream file cuts...	Done!
Creating intermediate files...	

Done!
Recursively generating MOA command...	Done!
Drift filename:  IOPS_grad_p35_n21_b50
Running terminal command...	


{M}assive {O}nline {A}nalysis
Version:  23.04 April 2023
Copyright: (C) 2007-2023 University of Waikato, Hamilton, New Zealand
Web: http://moa.cms.waikato.ac.nz/

                                                                               
Task completed in 0.51s (CPU time)



Stream written to ARFF file /home/zengt5/AnomalyDriftDetection/data/synthetic/n_drift/n_a50/IOPS_grad_p35_n21_b50.arff
Done!
Generating drift labels...	Done!
