In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from itertools import combinations

In [2]:
# Build (or reload) the full time-series dataset for one capture directory.
#
# Produces, next to each other in the "_ts" output tree:
#   pyts_dataset.npy             - array with axes (samples, features, timestamps)
#   pyts_dataset_sample.txt      - sample labels, one per line, in row order
#   pyts_dataset_feature.txt     - feature names, one per line, in column order
#   timeseries_dictionary.json   - the nested {feature: {sample: [values]}} dict
INTERVAL = 30

features = ["packets", "bytes", "flows", "packets/flows", "bytes/flows", "bytes/packets", "flows/(bytes/packets)", "nodes"]

# Every file two directory levels below the capture directory is read as a
# comma-separated table: one header line followed by numeric rows.
filenames = glob(str(Path(f'interval{INTERVAL}_reconstructed_STL_trend/200702111400/*/*')))
if not filenames:
    # Fail with a clear message instead of an IndexError on filenames[0].
    raise FileNotFoundError(
        f"No input files match interval{INTERVAL}_reconstructed_STL_trend/200702111400/*/*"
    )

# Mirror the input path into the "_ts" tree, then drop the last directory
# component so that all outputs land in a single directory.
targetFilename = filenames[0].replace(f'interval{INTERVAL}_reconstructed_STL_trend', f'interval{INTERVAL}_reconstructed_STL_trend_ts')
dirname = os.path.join(*os.path.dirname(targetFilename).split(os.path.sep)[:-1])
targetFilename = os.path.join(dirname, "pyts_dataset.npy")
sampleFilename = os.path.splitext(targetFilename)[0] + "_sample.txt"
featureFilename = os.path.splitext(targetFilename)[0] + "_feature.txt"

if os.path.exists(targetFilename):
    # Cache hit: reload the saved array rather than calling exit(), which
    # would shut down the notebook kernel and break every later cell.
    # `timeseries` is intentionally not rebuilt here; the feature-pair cell
    # reloads it from timeseries_dictionary.json when it is not in memory.
    pyts_dataset = np.load(targetFilename)
else:
    # Read each file exactly once and fan its columns out to all features.
    timeseries = {feature: {} for feature in features}
    for filename in filenames:
        with open(filename) as f:
            header = f.readline().strip().split(',')
            # Skip blank lines (e.g. a trailing newline) so float() never sees one.
            rows = [line.split(',') for line in f if line.strip()]

        # Sample label is the second '-'-separated token of the file stem.
        column = os.path.splitext(os.path.basename(filename))[0].split('-')[1]
        for feature in features:
            index = header.index(feature)
            timeseries[feature][column] = [float(row[index]) for row in rows]

    os.makedirs(os.path.dirname(targetFilename), exist_ok=True)
    # The DataFrame has one row per sample and one column per feature, each
    # cell holding a list of values; the round trip through nested lists
    # turns that into a 3-D array.
    pyts_dataset = np.array(np.array(pd.DataFrame.from_dict(timeseries)).tolist())
    np.save(targetFilename, pyts_dataset)

    with open(os.path.join(f'interval{INTERVAL}_reconstructed_STL_trend_ts/200702111400', 'timeseries_dictionary.json'), 'w') as f:
        json.dump(timeseries, f, indent=4)

    with open(sampleFilename, "w") as f:
        f.write('\n'.join(list(timeseries[features[0]].keys())))

    with open(featureFilename, "w") as f:
        f.write('\n'.join(features))

In [3]:
# Shape of `pyts_dataset`; its axes are (samples, features, timestamps).
pyts_dataset.shape

(102925, 8, 31)

In [4]:
# Write one dataset per unordered pair of features.
#
# For every pair (a, b) this creates, under the "_ts_feature" output tree, a
# directory named "<a>-<b>" (with '/' in feature names replaced by '_')
# containing pyts_dataset.npy and pyts_dataset_feature.txt. Pairs whose .npy
# file already exists are skipped.
INTERVAL = 30

# Reuse the dictionary if it is already in this kernel session; otherwise
# reload the JSON copy from disk.
if 'timeseries' not in locals():
    with open(os.path.join(f'interval{INTERVAL}_reconstructed_STL_trend_ts/200702111400', 'timeseries_dictionary.json')) as f:
        timeseries = json.load(f)

features = ["packets", "bytes", "flows", "packets/flows", "bytes/flows", "bytes/packets", "flows/(bytes/packets)", "nodes"]
filenames = glob(str(Path(f'interval{INTERVAL}_reconstructed_STL_trend/200702111400/*/*')))
if not filenames:
    # Fail with a clear message instead of an IndexError on filenames[0].
    raise FileNotFoundError(
        f"No input files match interval{INTERVAL}_reconstructed_STL_trend/200702111400/*/*"
    )

targetFilename = filenames[0].replace(f'interval{INTERVAL}_reconstructed_STL_trend', f'interval{INTERVAL}_reconstructed_STL_trend_ts_feature')
# Loop-invariant output root, computed once so that each iteration no longer
# derives its directory from the path written by the previous iteration.
outputRoot = os.path.join(*os.path.dirname(targetFilename).split(os.path.sep)[:-1])

for feature in combinations(features, 2):
    dirname = os.path.join(outputRoot, f"{feature[0].replace('/', '_')}-{feature[1].replace('/', '_')}")
    targetFilename = os.path.join(dirname, "pyts_dataset.npy")
    if os.path.exists(targetFilename):
        continue

    # Build the sub-dictionary in `feature` order so that the array's feature
    # axis matches the names written to the _feature.txt file below.
    partial_timeseries = {key: timeseries[key] for key in feature}

    os.makedirs(os.path.dirname(targetFilename), exist_ok=True)
    # Use a dedicated name so the notebook-level `pyts_dataset` is not
    # overwritten and then deleted by this cell.
    pair_dataset = np.array(np.array(pd.DataFrame.from_dict(partial_timeseries)).tolist())
    np.save(targetFilename, pair_dataset)

    featureFilename = os.path.splitext(targetFilename)[0] + "_feature.txt"

    with open(featureFilename, "w") as f:
        f.write('\n'.join(feature))

    # Release the per-pair intermediates before the next iteration.
    del partial_timeseries, pair_dataset

In [5]:
# np.load("reconstructed_STL_trend_ts/200702111400/pyts_dataset.npy").shape

In [6]:
# # Axes are (samples, features, timestamps), e.g. 12 samples, 8 features, 901 timestamps
# np.array(np.array(pd.DataFrame.from_dict(timeseries)).tolist()).shape

In [7]:
# from tslearn.utils import from_pyts_dataset
# from_pyts_dataset(np.array(np.array(pd.DataFrame.from_dict(timeseries)).tolist())).shape