## Preprocessing Data Notebook

In [73]:
import os, sys
import pandas as pd
import numpy as np
from multiprocessing import Process, Manager
import numba
import shutil
from datetime import datetime
import glob
from sklearn.model_selection import train_test_split
import h5py

sys.path.insert(0, 'src')
from fs_utils import remove_and_create_dir
from dqn_src.config import BaseConfig
from dqn_src.traffic_indicator import TrafficIndicatorFeature
from dqn_src import builtseq

In [74]:
config = BaseConfig()
target = ['time_in_sys']
np.random.seed(config.seed)

# CHANGE THIS IF NEEDED
data_root = os.path.join('/scratch', 'achen', 'COS561', 'deepqueuenet','data')

In [75]:
def multi_process(func, FILES, args=None):
    print("in multi_process()")
    print("Length of FILES: {}".format(len(FILES)))
    it = 0
    while True:
        files = FILES[it * config.no_process:(it + 1) *
                      config.no_process]
        print("[{}] Itr {} processing {} files".format(datetime.now().strftime(r'%m%d_%H%M%S'), it+1, len(files)))
        if len(files) > 0:
            threads = []
            for file in files:
                ARGS = list(args)
                ARGS.append(file)
                t = Process(target=func, args=tuple(ARGS))
                threads.append(t)
                t.start()
            for thr in threads:
                thr.join()
            it += 1
        else:
            break

In [76]:


def feature_extraction(config: BaseConfig,
                       target: list[str],
                       data_root: str):
    """feature extraction module"""

    # @numba.jit
    def gettraffic(dst_folder, my_fet, file):
        df = pd.read_csv(file).fillna(config.sp_wgt)

        #traffic load features
        ins = TrafficIndicatorFeature(df, config.no_of_port, config.no_of_buffer,
                      config.window, config.ser_rate)
        C_dst_SET, LOAD = ins.getCount()
        for i in range(config.no_of_port):
            df['TI%i' % i] = LOAD[i]
        for i in range(config.no_of_port):
            for j in range(config.no_of_buffer):
                df['load_dst{}_{}'.format(i, j)] = C_dst_SET[(i, j)]

        #arrival patterns
        df['inter_arr_sys'] = df['timestamp (sec)'].diff()
        if config.no_of_buffer > 1:
            for i in range(config.no_of_buffer):
                t = df[df['priority'] ==
                       i]['timestamp (sec)'].diff().rename(
                           'inter_arr{}'.format(i)).to_frame()
                df = df.join(t)

        #save
        filename = file.split('/')[-1]
        drop_cols = ['timestamp (sec)'] + target
        if config.no_of_buffer == 1: drop_cols += ['priority']
        fet_cols = list(df.columns.drop(drop_cols))
        my_fet['fet_cols'] = fet_cols
        df[fet_cols + target].fillna(method='ffill').dropna().to_csv(
            '{}/{}'.format(dst_folder, filename), index=False)

    with Manager() as MG:
        my_fet = MG.dict()
        for mode in ['train', 'test']:
            file_dir = os.path.join(data_root, '/{}/_traces/_{}'.format(
                config.modelname, mode))
            FILES = []
            for dirpath, dirnames, filenames in os.walk(file_dir):
                for file in filenames:
                    if (os.path.splitext(file)[1]
                            == '.csv') and 'checkpoint' not in file:
                        FILES.append(os.path.join(dirpath, file))
            
            dst_folder = os.path.join(data_root, '{}/_traces/{}'.format(
                config.modelname, mode))
            if os.path.exists(dst_folder):
                shutil.rmtree(dst_folder)
            os.makedirs(dst_folder)
            multi_process(gettraffic,
                               FILES,
                               args=(dst_folder, my_fet))

        fet_cols = my_fet['fet_cols']

In [78]:
train_features_path = os.path.join(data_root, config.modelname, '_traces', 'train')
test_features_path = os.path.join(data_root, config.modelname, '_traces', 'test')
if os.path.exists(train_features_path) and os.path.exists(test_features_path):
    print("Extracted features already exist at {} and {}".format(train_features_path, test_features_path))
else:
    feature_extraction(
        config=config,
        target=target,
        data_root=data_root)

Extracted features already exist at /scratch/achen/COS561/deepqueuenet/data/4-port switch/FIFO/_traces/train and /scratch/achen/COS561/deepqueuenet/data/4-port switch/FIFO/_traces/test


### Convert CSV -> .h5 files

In [70]:
def load_hdf(file):
    with h5py.File(file, 'r') as hdf:
        x = hdf['x'][:]
        y = hdf['y'][:]
    return x, y

def write_hdf(file, x, y):
    with h5py.File(file, 'w') as hdf:
        hdf['x'] = x
        hdf['y'] = y

def write_hdf2(h, key, x, y):
    h['{}_x'.format(key)] = x
    h['{}_y'.format(key)] = y

def csv_2hdf(data_root: str,
             config: BaseConfig):
    """
    Build timeseries batches for bLSTM and save them in .hdf files
      - split train mode files for train and in-sample testing (test1);
      - save test mode files for out-of-sample testing (test2).
    """

    # @numba.jit
    def split_2hdf(mode, file):
        key = file.split('/')[-1].split('.csv')[0]
        t = pd.read_csv(file)
        # os.remove(file)

        ins = builtseq.build_timeseries(t.values, target_col_index=[-1])
        x, y = ins.timeseries(config.TIME_STEPS)
        "randomly selected part of the them. to represent the config"
        loc = np.random.choice(
            np.arange(len(y)),
            np.max([np.min([14000, len(y)]),
                    int(len(y) * 0.15)]),
            replace=False)
        x = x[loc]
        y = y[loc]
        if mode == 'train':
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=config.test_size, shuffle=True)
            write_hdf(
                os.path.join(data_root, '{}/_hdf/train/{}.h5'.format(
                    config.modelname, key)), x_train, y_train)
            write_hdf(
                os.path.join(data_root, '{}/_hdf/test1/{}.h5'.format(
                    config.modelname, key)), x_test, y_test)
        else:
            write_hdf(
                os.path.join(data_root, '{}/_hdf/test2/{}.h5'.format(
                    config.modelname, key)), x, y)

    for mode in ['train', 'test']:
        folder = os.path.join(data_root, '{}/_traces/{}'.format(config.modelname, mode))
        # Create directories for each split
        hdf_folder = os.path.join(data_root, config.modelname, '_hdf')
        splits = ['train', 'test1', 'test2']
        for split in splits:
            split_folder = os.path.join(hdf_folder, split)
            if os.path.exists(split_folder):
                shutil.rmtree(split_folder)
            os.makedirs(split_folder)
            print("Created directory for {}".format(split_folder))
        print("FILES folder: {}".format(folder))
        FILES = glob.glob('{}/*.csv'.format(folder))
        multi_process(split_2hdf, FILES, args=(mode, ))
        # shutil.rmtree(folder)

In [80]:
train_hdf_path = os.path.join(data_root, config.modelname, '_hdf', 'train')
test1_hdf_path = os.path.join(data_root, config.modelname, '_hdf', 'test1')
test2_hdf_path = os.path.join(data_root, config.modelname, '_hdf', 'test2')
if os.path.exists(train_features_path) and os.path.exists(test1_hdf_path) and os.path.exists(test2_hdf_path):
    print("HD5 already exist at {}, {}, and {}".format(train_features_path, test1_hdf_path, test2_hdf_path))
else:
    csv_2hdf(
        config=config,
        data_root=data_root)

HD5 already exist at /scratch/achen/COS561/deepqueuenet/data/4-port switch/FIFO/_traces/train, /scratch/achen/COS561/deepqueuenet/data/4-port switch/FIFO/_hdf/test1, and /scratch/achen/COS561/deepqueuenet/data/4-port switch/FIFO/_hdf/test2
