In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import os, sys
from tqdm import tqdm

# sys.path.insert(0, '/n/fs/ac-project/COS561/COS561_final_project/src')
sys.path.insert(0, 'src')
from fs_utils import remove_and_create_dir
from preprocess_data import preprocess_csvs

In [8]:
# Input CSVs to preprocess, resolved relative to the data root.
data_root_dir = 'data'  # TODO: replace with your root if necessary
csv_file_names = ['rsim1.csv']
csv_paths = [os.path.join(data_root_dir, name) for name in csv_file_names]

# Output directory for the processed CSVs.
# NOTE(review): remove_and_create_dir presumably wipes any existing directory
# before recreating it -- confirm in fs_utils before pointing this elsewhere.
save_dir = os.path.join('data', 'processed_data')
remove_and_create_dir(save_dir)

preprocess_csvs(csv_paths, verbose=True, csv_save_dir=save_dir)

Processing 1/1 csv: data/rsim1.csv
Processing device 12 (1/8)
	Processing port 0 (1/2)
	device 12 port 0 has 16926 rows


16926it [00:19, 886.89it/s]


	Processing port 1 (2/2)
	device 12 port 1 has 28418 rows


28418it [00:33, 846.02it/s]



Processing device 13 (2/8)
	Processing port 0 (1/2)
	device 13 port 0 has 10825 rows


10825it [00:11, 902.83it/s]


	Processing port 1 (2/2)
	device 13 port 1 has 19092 rows


19092it [00:21, 881.41it/s]



Processing device 14 (3/8)
	Processing port 0 (1/2)
	device 14 port 0 has 18155 rows


18155it [00:20, 885.75it/s]


	Processing port 1 (2/2)
	device 14 port 1 has 15528 rows


15528it [00:17, 890.63it/s]



Processing device 15 (4/8)
	Processing port 0 (1/2)
	device 15 port 0 has 17538 rows


17538it [00:19, 883.05it/s]


	Processing port 1 (2/2)
	device 15 port 1 has 19364 rows


19364it [00:22, 873.29it/s]



Processing device 16 (5/8)
	Processing port 0 (1/2)
	device 16 port 0 has 17877 rows


17877it [00:20, 875.71it/s]


	Processing port 1 (2/2)
	device 16 port 1 has 15423 rows


15423it [00:17, 883.21it/s]



Processing device 17 (6/8)
	Processing port 0 (1/2)
	device 17 port 0 has 16932 rows


16932it [00:19, 890.97it/s]


	Processing port 1 (2/2)
	device 17 port 1 has 16891 rows


16891it [00:18, 892.11it/s]



Processing device 18 (7/8)
	Processing port 0 (1/2)
	device 18 port 0 has 12989 rows


12989it [00:14, 920.72it/s]


	Processing port 1 (2/2)
	device 18 port 1 has 21757 rows


21757it [00:24, 899.77it/s]



Processing device 19 (8/8)
	Processing port 0 (1/2)
	device 19 port 0 has 21540 rows


21540it [00:23, 902.30it/s]


	Processing port 1 (2/2)
	device 19 port 1 has 18937 rows


18937it [00:20, 911.75it/s]



average load for port 0: 8.47515476495308
average load for port 1: 9.660343607232482
Saved processed csv to data/processed_data/rsim1_processed.csv



In [121]:
def _count_concurrent_rows(start_times, end_times, show_progress=False):
    """Count, for every row, how many rows of the same group overlap it.

    Row ``j`` is counted towards row ``i`` (with ingress ``s_i`` and egress
    ``e_i``) when at least one of the following holds:

    * ``s_i <= s_j < e_i``  -- row ``j`` starts while row ``i`` is active
    * ``s_i <= e_j < e_i``  -- row ``j`` ends while row ``i`` is active
    * ``s_j < s_i`` and ``e_j > e_i``  -- row ``j`` strictly spans row ``i``

    A row normally counts itself through the first clause; a row whose
    ingress equals its egress matches none of the clauses and therefore does
    not count itself.

    Args:
        start_times: 1-D array-like of ingress times.
        end_times: 1-D array-like of egress times, aligned with ``start_times``.
        show_progress: when True, wrap the row loop in a tqdm progress bar.

    Returns:
        ``np.ndarray`` of dtype int64 with one count per input row.
    """
    starts = np.asarray(start_times)
    ends = np.asarray(end_times)
    n_rows = len(starts)
    loads = np.zeros(n_rows, dtype=np.int64)

    row_indices = range(n_rows)
    if show_progress:
        row_indices = tqdm(row_indices, total=n_rows)

    # Still one pass over the group per row, but on raw NumPy arrays rather
    # than building a filtered DataFrame for every row.
    for row_idx in row_indices:
        ingress_time = starts[row_idx]
        egress_time = ends[row_idx]
        overlapping = (
            # other row starts in the middle of current row
            ((starts >= ingress_time) & (starts < egress_time))
            # other row ends in the middle of current row
            | ((ends >= ingress_time) & (ends < egress_time))
            # other row starts before current row and ends after current row
            | ((starts < ingress_time) & (ends > egress_time))
        )
        loads[row_idx] = np.count_nonzero(overlapping)
    return loads


def preprocess_csvs(csv_paths: list,
                    verbose: bool,
                    csv_save_dir: str=None):
    """Add per-row load and per-port mean-load columns to each CSV and save it.

    Each input CSV must contain the columns ``cur_hub``, ``cur_port``,
    ``timestamp (sec)`` and ``etime``. For every CSV the function:

    1. sorts rows by ``cur_hub``, ``cur_port``, ``timestamp (sec)``;
    2. adds ``time_diff = etime - timestamp (sec)``;
    3. adds ``load``: the number of rows on the same (``cur_hub``,
       ``cur_port``) pair that overlap the row's ``[timestamp, etime)``
       interval (see ``_count_concurrent_rows`` for the exact rule);
    4. adds one constant column ``mean_load_port_<port>`` per distinct port,
       holding the mean ``load`` over all rows of that port (all devices);
    5. writes the result, including the DataFrame index, to
       ``<csv_save_dir>/<input basename>_processed.csv``.

    Args:
        csv_paths: paths of the CSV files to process.
        verbose: when True, print progress messages and show a tqdm bar for
            each device/port group; when False the function prints nothing.
        csv_save_dir: directory for the processed CSVs. It is created if it
            does not exist. When None, each processed CSV is written next to
            its input file.

    Returns:
        None.

    Raises:
        ValueError: if any entry of ``csv_paths`` does not exist.
    """
    non_existent_paths = [path for path in csv_paths if not os.path.exists(path)]
    if len(non_existent_paths) > 0:
        raise ValueError("{} paths in csv_paths do not exist: {}".format(len(non_existent_paths), non_existent_paths))

    n_csvs = len(csv_paths)
    for csv_idx, csv_path in enumerate(csv_paths):
        if verbose:
            print("Processing {}/{} csv: {}".format(csv_idx + 1, n_csvs, csv_path))

        df = pd.read_csv(csv_path)
        df = df.sort_values(['cur_hub', 'cur_port', 'timestamp (sec)'])
        df['time_diff'] = df['etime'] - df['timestamp (sec)']

        unique_ports = df['cur_port'].unique()
        n_ports = len(unique_ports)
        unique_devices = df['cur_hub'].unique()
        n_devices = len(unique_devices)

        # One DataFrame per device/port combination, each with its load column
        new_dfs = []
        for device_idx, device in enumerate(unique_devices):
            if verbose:
                print("Processing device {} ({}/{})".format(device, device_idx + 1, n_devices))
            for port_idx, port in enumerate(unique_ports):
                if verbose:
                    print("\tProcessing port {} ({}/{})".format(port, port_idx + 1, n_ports))
                cur_device_port_df = df.loc[(df['cur_port'] == port) & (df['cur_hub'] == device)].copy()
                if verbose:
                    print("\tdevice {} port {} has {} rows".format(device, port, len(cur_device_port_df)))
                # Load is computed only against rows of the same device/port
                cur_device_port_df['load'] = _count_concurrent_rows(
                    cur_device_port_df['timestamp (sec)'].to_numpy(),
                    cur_device_port_df['etime'].to_numpy(),
                    show_progress=verbose)
                new_dfs.append(cur_device_port_df)
            if verbose:
                print("")

        # Concatenate data frames for each device/port combination.
        # pd.concat rejects an empty list, which happens for a CSV with no rows.
        if new_dfs:
            new_df = pd.concat(new_dfs)
        else:
            new_df = df.copy()
            new_df['load'] = np.zeros(len(new_df), dtype=np.int64)

        # Calculate average load for each port (across all devices)
        for port in unique_ports:
            avg_load = np.mean(new_df[new_df['cur_port'] == port]['load'].to_numpy())
            new_df['mean_load_port_{}'.format(port)] = avg_load
            if verbose:
                print("average load for port {}: {}".format(port, avg_load))

        # Save new CSV; fall back to the input's directory when no save
        # directory was given instead of failing inside os.path.join(None, ...)
        out_dir = csv_save_dir if csv_save_dir is not None else os.path.dirname(csv_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        csv_save_name = os.path.splitext(os.path.basename(csv_path))[0] + '_processed.csv'
        csv_save_path = os.path.join(out_dir, csv_save_name)

        new_df.to_csv(csv_save_path)
        if verbose:
            print("Saved processed csv to {}\n".format(csv_save_path))
                

In [123]:
# NOTE(review): data_root_dir and csv_dummy_path are not read in this cell;
# they are kept because cells outside this view may rely on them.
data_root_dir = os.path.join('data', 'dqn_data')  # TODO: replace with your root if necessary

# Raw input CSV and a sibling "dummy" path in the same directory.
csv_path = os.path.join('data', 'rsim1.csv')
csv_dummy_path = os.path.join(os.path.dirname(csv_path), 'rsim1_dummy.csv')

# Output directory for the processed CSV.
csv_save_dir = os.path.join('data', 'processed_data')
remove_and_create_dir(csv_save_dir)

preprocess_csvs([csv_path], verbose=True, csv_save_dir=csv_save_dir)


Processing 1/1 csv: data/rsim1.csv
Processing device 12 (1/8)
	Processing port 0 (1/2)
	device 12 port 0 has 16926 rows


16926it [00:18, 896.82it/s]


	Processing port 1 (2/2)
	device 12 port 1 has 28418 rows


28418it [00:32, 862.93it/s]



Processing device 13 (2/8)
	Processing port 0 (1/2)
	device 13 port 0 has 10825 rows


10825it [00:11, 913.74it/s]


	Processing port 1 (2/2)
	device 13 port 1 has 19092 rows


19092it [00:21, 883.44it/s]



Processing device 14 (3/8)
	Processing port 0 (1/2)
	device 14 port 0 has 18155 rows


18155it [00:20, 891.32it/s]


	Processing port 1 (2/2)
	device 14 port 1 has 15528 rows


15528it [00:17, 899.08it/s]



Processing device 15 (4/8)
	Processing port 0 (1/2)
	device 15 port 0 has 17538 rows


17538it [00:19, 894.55it/s]


	Processing port 1 (2/2)
	device 15 port 1 has 19364 rows


19364it [00:22, 867.28it/s]



Processing device 16 (5/8)
	Processing port 0 (1/2)
	device 16 port 0 has 17877 rows


17877it [00:20, 858.94it/s]


	Processing port 1 (2/2)
	device 16 port 1 has 15423 rows


15423it [00:17, 867.28it/s]



Processing device 17 (6/8)
	Processing port 0 (1/2)
	device 17 port 0 has 16932 rows


16932it [00:19, 866.44it/s]


	Processing port 1 (2/2)
	device 17 port 1 has 16891 rows


16891it [00:19, 869.09it/s]



Processing device 18 (7/8)
	Processing port 0 (1/2)
	device 18 port 0 has 12989 rows


12989it [00:14, 886.59it/s]


	Processing port 1 (2/2)
	device 18 port 1 has 21757 rows


21757it [00:25, 860.69it/s]



Processing device 19 (8/8)
	Processing port 0 (1/2)
	device 19 port 0 has 21540 rows


21540it [00:25, 860.46it/s]


	Processing port 1 (2/2)
	device 19 port 1 has 18937 rows


18937it [00:21, 867.84it/s]



average load for port 0: 8.47515476495308
average load for port 1: 9.660343607232482
Saved processed csv to data/processed_data/rsim1_processed.csv

