In [2]:
# Reload imported .py modules automatically before each cell runs, so edits to
# the project's source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Generate Drift Using MOA

Randomly select source streams to use for drift stream generation, then check their summary statistics to choose a value for `n_drift`.

In [13]:
from util.drift_generator import DriftGenerator
import numpy as np
import pandas as pd

In [31]:
# Absolute, machine-specific locations used by the drift generator.
# NOTE(review): source_dir points at the IOPS benchmark, but the summary table
# displayed below lists MBA_ECG files and later cells use dataset = 'ECG' and an
# ECG source directory — confirm which benchmark is intended here.
source_dir = '/home/zengt5/AnomalyDriftDetection/data/benchmark/IOPS'
# Root directory under which generated drift streams are written (in a sub_dir).
drift_dir = '/home/zengt5/AnomalyDriftDetection/data/synthetic'
# lib directory of the local MOA release.
moa_path = '/home/zengt5/moa-release-2023.04.0/lib'
# Presumably the number of source streams to select; the summary below has 6 rows.
num_streams = 6
g = DriftGenerator(source_dir, drift_dir, moa_path, num_streams)

In [33]:
# Per-stream summary of the selected source files (length, anomaly count,
# anomaly coverage, average anomaly length, fraction of anomalous points).
df = g.get_source_summary()
df

Unnamed: 0,filename,len,num_anomalies,total_anom_cover,avg_anomaly_len,percent_anomalies
1,MBA_ECG14046_data_1.arff,229900,447,36478,81.606264,0.158669
8,MBA_ECG14046_data_16.arff,229900,154,12238,79.467532,0.053232
15,MBA_ECG14046_data_22.arff,229900,124,9675,78.024194,0.042084
16,MBA_ECG14046_data_23.arff,229900,124,9300,75.0,0.040452
26,MBA_ECG14046_data_32.arff,229900,132,10288,77.939394,0.04475
32,MBA_ECG14046_data_38.arff,229900,360,27841,77.336111,0.1211


In [34]:
# Aggregate statistics across the selected streams; the mean of num_anomalies
# is the reference value used when choosing n_drift.
df.describe()

Unnamed: 0,len,num_anomalies,total_anom_cover,avg_anomaly_len,percent_anomalies
count,6.0,6.0,6.0,6.0,6.0
mean,229900.0,223.5,17636.666667,78.228916,0.076715
std,0.0,142.539468,11620.33486,2.204059,0.050545
min,229900.0,124.0,9300.0,75.0,0.040452
25%,229900.0,126.0,9828.25,77.486932,0.04275
50%,229900.0,143.0,11263.0,77.981794,0.048991
75%,229900.0,308.5,23940.25,79.106698,0.104133
max,229900.0,447.0,36478.0,81.606264,0.158669


In [36]:
# Mean number of anomalies per stream, copied by hand from the df.describe()
# output above (223.5 there, truncated to an integer).
# NOTE(review): update this value whenever a different set of streams is selected.
mean_num_anom = 223
# Fractions 0.1 .. 0.7. They are built from integers so that every value is the
# exact decimal it displays as; np.arange(0.1, 0.8, 0.1) accumulates rounding
# error and yields labels such as 0.30000000000000004, which breaks lookups
# like frame[0.3].
percent = np.arange(1, 8) / 10
# Number of anomalies corresponding to each fraction of the mean anomaly count.
percent_anom = mean_num_anom * percent
# One-row table: column label = fraction, value = fraction * mean anomaly count.
pd.DataFrame([percent_anom], columns=percent)

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7
0,22.3,44.6,66.9,89.2,111.5,133.8,156.1


The following parameters can be assigned in the next cell to guide characteristics of the generated drift stream:

- `length`: int, total length of new stream
- `p_drift`: float, target percent of data points classified as drift
- `n_drift`: int, target number of drift sequences
- `p_before`: float, target percent of drift coming before anomaly
- `sub_dir`: string, name of subdirectory to export drift stream
- `dataset`: string, descriptor (name) of source dataset for identification
- `mode`: int, indicator for drift assembly method, options {0,1}, default 0
    - Mode 0: variable drift widths and positions
    - Mode 1: uniform drift widths and positions (helpful for high p_drift)
- Returns `output_path, drift_label, streams, positions, seq_before, w_drift`

Note: Make sure that `sub_dir` exists in data/synthetic prior to generating the stream.

Note: If you encounter trouble with generating large percentage drift, you can use `mode=1`.

In [70]:
# Settings for the drift-stream generation call in the next cell.
length = 137609  # total length of the new stream
n_drift = 66  # target number of drift sequences
p_drift = 0.35  # target fraction of data points classified as drift
p_drift_before = 0.5  # target fraction of drift coming before an anomaly
sub_dir = 'test'  # subdirectory of data/synthetic to export to; must already exist
dataset = 'ECG'  # source-dataset name; appears as the prefix of the output filename

In [71]:
# Generate the drift stream using the settings from the previous cell.
# Six values are unpacked: the path of the written ARFF file, the drift labels,
# and the streams / positions / seq_before / w_drift values that are inspected
# and edited in the "Update an Existing Data Stream" section.
output_path, drift_label, streams, positions, seq_before, w_drift = g.run_generate_grad_stream_moa(
    length, 
    p_drift, 
    n_drift,
    p_drift_before, 
    sub_dir,
    dataset,
    mode=0 # 0 for variable drift widths and positions, 1 for uniform
)

Generating splits...
	Getting partitions...
	Getting order of drifts coming before anomaly...
	Getting drift center positions...
	Getting stream file cuts...
Done!
Getting stream file cuts...	Done!
Creating intermediate files...	Done!
Recursively generating MOA command...	Done!
Drift filename:  ECG_grad_p34_n66_b50
Running terminal command...	


{M}assive {O}nline {A}nalysis
Version:  23.04 April 2023
Copyright: (C) 2007-2023 University of Waikato, Hamilton, New Zealand
Web: http://moa.cms.waikato.ac.nz/

                                                                               
Task completed in 1.72s (CPU time)



Stream written to ARFF file /Users/tammyz/Desktop/AnomalyDriftDetection/data/synthetic/test/ECG_grad_p34_n66_b50.arff
Done!
Generating drift labels...	Done!


Check out [`view_drift_generation.ipynb`](../view_drift.ipynb) for plotting methods that can be used to view generated streams.

## Update an Existing Data Stream

#### Read data and set up object to generate new data stream

In [73]:
# Read an existing generated drift stream back in so it can be edited.
from util.stream import DriftStream

# source_dir = "data/benchmark/IOPS" # directory for source streams
# Rebinds source_dir and output_path from the generation section above.
# NOTE(review): these absolute paths are under /Users/tammyz, while the paths in
# the generation section are under /home/zengt5 — confirm they match the machine
# the notebook runs on.
source_dir = '/Users/tammyz/Desktop/AnomalyDriftDetection/data/benchmark/ECG'
output_path = '/Users/tammyz/Desktop/AnomalyDriftDetection/data/synthetic/test/ECG_grad_p34_n66_b50.arff'
d1 = DriftStream(output_path, source_dir)

In [75]:
# Create a generator restricted to the source streams of the loaded drift stream,
# and pull the loaded stream's layout into plain variables for editing.
# drift_dir and moa_path are reused from the generation section above.
g2 = DriftGenerator(source_dir, drift_dir, moa_path, selected_streams=d1.source_streams)
positions = d1.positions # same positions as returned by the generator above, with a 0 prepended
streams = d1.streams
seq_before = d1.seq_before
w_drift = d1.w_drift

#### The following values can be modified individually to make changes to the data stream

In [52]:
# Show a window of 2*w entries, labelled k-w .. k+w-1.
# seq_before and w_drift are read one index earlier than positions/streams,
# so k - w must be at least 1 for the shifted slices to stay in range.
k = 4
w = 3
index = list(range(k - w, k + w))
pd.DataFrame(
    {
        'positions': positions[k - w:k + w],
        'streams (after)': streams[k - w:k + w],
        'seq_before': seq_before[k - w - 1:k + w - 1],
        'w_drift': w_drift[k - w - 1:k + w - 1],
    },
    index=index,
)

Unnamed: 0,positions,streams (after),seq_before,w_drift
1,4961,0,False,1154
2,10945,3,False,987
3,16412,2,False,2066
4,21576,3,False,455
5,30348,4,True,1916
6,36282,3,False,1461


In [None]:
# Inspect a single entry of streams (the column shown as 'streams (after)' in the table above).
streams[60]

In [None]:
# Shift one position earlier by 1200. This modifies the element in place, so
# re-running the cell subtracts another 1200 each time.
positions[2] -= 1200

In [46]:
# Overwrite a single drift width.
w_drift[53] = 250

In [31]:
# Set a single seq_before flag to True.
# NOTE(review): index 84 is larger than n_drift = 66 used for the stream loaded
# above — confirm this index is in range for the current stream.
seq_before[84] = True

In [None]:
# Remove a drift by selecting the index to remove (r_i)
# r_i indexes positions/streams; the matching w_drift/seq_before entry is at
# r_i - 1, so r_i must be at least 1.
# The `+` concatenations assume plain Python lists — TODO confirm the types.
# Re-running this cell removes another entry each time.
r_i = 3
positions = positions[:r_i] + positions[r_i+1:]
streams = streams[:r_i] + streams[r_i+1:]
# Fold the removed drift's width into the following entry. This assignment
# mutates the existing w_drift object in place before it is re-sliced below.
w_drift[r_i] = w_drift[r_i-1] + w_drift[r_i]
# Drop entry r_i - 1; the merged width that was at r_i shifts down into its place.
w_drift = w_drift[:r_i-1] + w_drift[r_i:]
seq_before = seq_before[:r_i-1] + seq_before[r_i:]

In [390]:
# Re-display the window around k after editing, using k and w from the earlier
# "View index values" cell. w_drift is read one index earlier than positions/streams.
index = list(range(k - w, k + w))
pd.DataFrame(
    {
        'positions': positions[k - w:k + w],
        'streams (after)': streams[k - w:k + w],
        'w_drift': w_drift[k - w - 1:k + w - 1],
    },
    index=index,
)

Unnamed: 0,positions,streams (after),w_drift
30,98398,5,344
31,108581,4,5097
32,110528,0,24
33,116615,4,1675


#### Generate new data stream based on updated drift characteristics

In [51]:
# Rebuild the drift stream from the (possibly edited) layout variables.
# positions[1:-1] passes the positions without their first and last entries; the
# first is the leading 0 noted where positions is read from d1.
# NOTE(review): confirm what the final entry is and that it should be dropped.
# NOTE(review): dataset='IOPS' and sub_dir='n_drift/n_a50' differ from the ECG
# stream loaded into d1 above — confirm these labels are intended.
# The sub_dir must already exist under drift_dir.
output_path, drift_label =  g2.assemble_drift_stream(
    positions[1:-1],
    streams,
    w_drift,
    seq_before,
    sub_dir='n_drift/n_a50',
    length=d1.length,
    dataset='IOPS'
)

Getting stream file cuts...	Done!
Creating intermediate files...	

Done!
Recursively generating MOA command...	Done!
Drift filename:  IOPS_grad_p35_n21_b50
Running terminal command...	


{M}assive {O}nline {A}nalysis
Version:  23.04 April 2023
Copyright: (C) 2007-2023 University of Waikato, Hamilton, New Zealand
Web: http://moa.cms.waikato.ac.nz/

                                                                               
Task completed in 0.51s (CPU time)



Stream written to ARFF file /home/zengt5/AnomalyDriftDetection/data/synthetic/n_drift/n_a50/IOPS_grad_p35_n21_b50.arff
Done!
Generating drift labels...	Done!
