In [97]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import os, sys

# sys.path.insert(0, '/n/fs/ac-project/COS561/COS561_final_project/src')
sys.path.insert(0, 'src')
from fs_utils import remove_and_create_dir

In [2]:
# Show the kernel's working directory: the relative 'src' and 'data' paths used
# in this notebook are resolved against it.
print(os.getcwd())

/n/fs/ac-project/COS561/COS561_final_project


In [105]:
def _count_overlapping_rows(ingress_times, egress_times):
    """Count, for every row, the rows of the same group that overlap its time window.

    Row j is counted towards row i when at least one of these holds:
      * j's ingress time lies in [ingress_i, egress_i)
      * j's egress time lies in [ingress_i, egress_i)
      * j starts strictly before ingress_i and ends strictly after egress_i

    A row normally counts itself (its own ingress time lies in its window).

    NOTE(review): because the windows are half-open, a zero-duration row does
    not count itself, and a row that starts earlier but ends exactly at
    egress_i is not counted. Confirm this boundary behaviour is intended.

    Arguments:
        ingress_times: 1-D array-like of ingress timestamps for one hub/port group
        egress_times: 1-D array-like of egress timestamps, same length and order

    Returns:
        np.ndarray of int64, one load value per row, in input order
    """
    ingress_times = np.asarray(ingress_times)
    egress_times = np.asarray(egress_times)
    loads = np.zeros(len(ingress_times), dtype=np.int64)

    # Still O(n^2) per group, but each row is a handful of vectorised NumPy
    # comparisons instead of a DataFrame boolean-index per iterrows() step.
    for row_pos, (ingress_time, egress_time) in enumerate(zip(ingress_times, egress_times)):
        starts_inside = (ingress_times >= ingress_time) & (ingress_times < egress_time)
        ends_inside = (egress_times >= ingress_time) & (egress_times < egress_time)
        spans_window = (ingress_times < ingress_time) & (egress_times > egress_time)
        loads[row_pos] = np.count_nonzero(starts_inside | ends_inside | spans_window)
    return loads


def preprocess_csvs(csv_paths: list,
                    verbose: bool,
                    csv_save_dir: str=None):
    """Add per-packet delay and port-load columns to trace CSVs and save the result.

    For every CSV in csv_paths the rows are sorted by
    ('cur_hub', 'cur_port', 'etime') and the following columns are added:
      * 'time_diff': 'etime' - 'timestamp (sec)'
      * 'load': number of rows on the same hub/port whose time window overlaps
        the row's own window (see _count_overlapping_rows)
      * 'mean_load_port_<port>': mean of 'load' over all rows with that
        'cur_port' value (across hubs), repeated on every row

    The result is written to '<csv name>_processed.csv'.

    Arguments:
        csv_paths: list of paths to input CSVs; each must contain the columns
            'cur_hub', 'cur_port', 'etime' and 'timestamp (sec)'
        verbose: if True, print progress, per hub/port row counts and mean loads
        csv_save_dir: directory for the processed CSVs. It is created if it
            does not exist. If None, each processed CSV is saved next to its
            input file.

    Raises:
        ValueError: if any path in csv_paths does not exist
    """
    non_existent_paths = [csv_path for csv_path in csv_paths if not os.path.exists(csv_path)]
    if len(non_existent_paths) > 0:
        raise ValueError("{} paths in csv_paths do not exist: {}".format(len(non_existent_paths), non_existent_paths))

    # Make sure the output directory exists before doing any expensive work.
    if csv_save_dir is not None:
        os.makedirs(csv_save_dir, exist_ok=True)

    n_csvs = len(csv_paths)
    for csv_idx, csv_path in enumerate(csv_paths):
        if verbose:
            print("Processing {}/{} csv: {}".format(csv_idx + 1, n_csvs, csv_path))

        df = pd.read_csv(csv_path)
        df = df.sort_values(['cur_hub', 'cur_port', 'etime'])
        df['time_diff'] = df['etime'] - df['timestamp (sec)']

        unique_ports = df['cur_port'].unique()
        unique_hubs = df['cur_hub'].unique()

        new_dfs = []
        for hub in unique_hubs:
            # Filter by hub once instead of re-scanning the full frame per port
            hub_df = df.loc[df['cur_hub'] == hub]
            for port in unique_ports:
                cur_hub_port_df = hub_df.loc[hub_df['cur_port'] == port].copy()
                len_data = len(cur_hub_port_df)
                if verbose:
                    print("hub {} port {} has {} rows".format(hub, port, len_data))
                if len_data == 0:
                    # This hub has no traffic on this port. Skipping the empty
                    # frame keeps the 'load' dtype integer after concatenation.
                    continue

                # Calculate the load at the current port for each row
                cur_hub_port_df['load'] = _count_overlapping_rows(
                    cur_hub_port_df['timestamp (sec)'].to_numpy(),
                    cur_hub_port_df['etime'].to_numpy())
                new_dfs.append(cur_hub_port_df)

        # Concatenate data frames for each device/port combination
        if new_dfs:
            new_df = pd.concat(new_dfs)
        else:
            # CSV without data rows: pd.concat([]) would raise, so keep the
            # (empty) frame and just add the expected column.
            new_df = df.assign(load=np.zeros(0, dtype=np.int64))

        # Calculate average load for each port
        for port in unique_ports:
            avg_load = np.mean(new_df[new_df['cur_port'] == port]['load'].to_numpy())
            new_df['mean_load_port_{}'.format(port)] = avg_load
            if verbose:
                print("average load for port {}: {}".format(port, avg_load))

        # Save new CSV (next to the input file when no save directory is given)
        save_dir = csv_save_dir if csv_save_dir is not None else os.path.dirname(csv_path)
        csv_save_name = os.path.splitext(os.path.basename(csv_path))[0] + '_processed.csv'
        csv_save_path = os.path.join(save_dir, csv_save_name)

        new_df.to_csv(csv_save_path)
        # Printed regardless of `verbose` so the output location is always visible
        print("Saved processed csv to {}\n".format(csv_save_path))
                

In [106]:
# Input locations. All paths are relative, so they resolve against the
# kernel's current working directory.
data_root_dir = os.path.join('data', 'dqn_data')  # TODO: replace with your root if necessary
csv_path = os.path.join('data', 'rsim1.csv')
# The dummy trace lives in the same directory as the full trace.
csv_dummy_path = os.path.join(os.path.dirname(csv_path), 'rsim1_dummy.csv')

# Output location. remove_and_create_dir comes from fs_utils; presumably it
# deletes any existing directory and recreates it empty - confirm before
# pointing it at a directory whose contents matter.
csv_save_dir = os.path.join('data', 'processed_data')
remove_and_create_dir(csv_save_dir)

# Only the dummy trace is preprocessed here.
preprocess_csvs(
    csv_paths=[csv_dummy_path],
    csv_save_dir=csv_save_dir,
    verbose=True)


Processing 1/1 csv: data/rsim1_dummy.csv
hub 12 port 0 has 18 rows
hub 12 port 1 has 28 rows
hub 13 port 0 has 12 rows
hub 13 port 1 has 20 rows
hub 14 port 0 has 17 rows
hub 14 port 1 has 31 rows
hub 15 port 0 has 23 rows
hub 15 port 1 has 24 rows
hub 16 port 0 has 18 rows
hub 16 port 1 has 28 rows
hub 17 port 0 has 32 rows
hub 17 port 1 has 27 rows
hub 18 port 0 has 17 rows
hub 18 port 1 has 36 rows
hub 19 port 0 has 33 rows
hub 19 port 1 has 35 rows
average load for port 0: 8.329411764705883
average load for port 1: 10.240174672489083
Saved processed csv to data/processed_data/rsim1_dummy_processed.csv



In [33]:
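# NOTE: TracesDataset is currently disabled (the whole class below is commented out).
# Cells that instantiate it raise NameError on a fresh kernel until it is restored.
# class TracesDataset(Dataset):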
#     def __init__(self,
#                  csv_paths: list,
#                  n_timesteps: int,
#                  y_label: str):

#         self.n_timesteps = n_timesteps
#         self.indices = [] # Tuples of (csv_idx, device_idx, row_idx)
#         self.xs = []
#         self.ys = []
#         for csv_idx, csv_path in enumerate(csv_paths):
#             # load CSV, create empty list
#             # separate by device and store each df as numpy array in a list
#             # for each device, get range of valid start idxs to have complete timeseries data
#             # make into tuples with csv_idx and device_idx and append to self.indices
#             df = pd.read_csv(csv_path)
#             unique_devices = df['cur_hub'].unique()
#             xs_csv = []
#             ys_csv = []
#             for device_idx, unique_device in enumerate(unique_devices):
#                 device_data = df.loc[df['cur_hub'] == unique_device]
#                 print(device_data.columns)
#                 # Separate x and y values for this device
#                 xs_device = device_data.drop(y_label, axis=1).to_numpy()
#                 ys_device = device_data[y_label].to_numpy()
                
#                 # Append to list of data for each csv
#                 xs_csv.append(xs_device)
#                 ys_csv.append(ys_device)
                
#                 # Calculate indices for timeseries
#                 n_rows = device_data.shape[0]
#                 n_timeseries_data = n_rows - self.n_timesteps + 1
#                 # Create tuples using CSV index, device index, and rows
#                 timeseries_idxs = [(csv_idx, device_idx, row_idx) for row_idx in range(n_timeseries_data)]
#                 self.indices += timeseries_idxs

#             self.xs.append(xs_csv)
#             self.ys.append(ys_csv)
            
#     def __getitem__(self, index):
#         csv_idx, device_idx, row_start_idx = self.indices[index]
#         xs = self.xs[csv_idx][device_idx][row_start_idx:row_start_idx + self.n_timesteps]
#         ys = self.ys[csv_idx][device_idx][row_start_idx:row_start_idx + self.n_timesteps]
#         return xs, ys

#     def __len__(self):
#         return len(self.indices)

In [35]:
# Smoke test for TracesDataset: build the dataset and print the shapes of the
# first few (x, y) items.
# NOTE(review): the TracesDataset definition is commented out in this notebook,
# so on a fresh kernel this cell fails with NameError until the class is restored.
# NOTE(review): data_root_dir is assigned but not used in this cell.
data_root_dir =  os.path.join('data', 'dqn_data') # TODO: replace with your root if necessary
csv_path = os.path.join('data', 'rsim1.csv')

# Small trace in the same directory as the full one; defined here but not passed below.
csv_dummy_path = os.path.join(os.path.dirname(csv_path), 'rsim1_dummy.csv')
dataset = TracesDataset(
    csv_paths=[csv_path],  # full trace: takes a very long time to load! Swap in csv_dummy_path for a quick run.
    n_timesteps=15,
    y_label='etime')

# Number of items the dataset reports via len()
print(len(dataset))

# Inspect only the first 11 items (idx 0..10), then stop.
for idx, (x, y) in enumerate(dataset):
    print(x.shape, y.shape)
    
    if idx == 10:
        break

Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      dtype='object')
Index(['index', 'timestamp (sec)', 'pkt len (byte)', 'priority', 'src_pc',
       'cur_hub', 'cur_port', 'path', 'etime'],
      d