In [None]:
import pandas as pd
import numpy as np
from numpy import nan
import os

## Environment Data
Data describing the specifications (specs) of each environment.

In [None]:
# Load the environment table from CSV, using the first CSV column as the index.
env_data = pd.read_csv("data/env_data.csv", index_col=0)
# Bare expression: it is the last statement of the cell, so the notebook renders the table.
env_data

# Disabled save step: uncomment to write env_data back to the same CSV path.
# env_data.to_csv("data/env_data.csv")

## Runtime Data
CPU and GPU runtimes for each environment.

In [None]:
# This cell contains the functions that extract data from the profiling folder

# Model sub-folder names looked up inside each trial's cpu/ and gpu/ folder.
models = [
    "CNN",
    "DNN",
    "LSTM",
    "transformer",
]

# Column layout of the runtime table: environment name, trial number, then
# the four CPU ("c_") timings followed by the four GPU ("g_") timings.
runtime_data_columns = (
    ['Environment', 'trial']
    + ['c_CNN_time', 'c_DNN_time', 'c_LSTM_time', 'c_T_time']
    + ['g_CNN_time', 'g_DNN_time', 'g_LSTM_time', 'g_T_time']
)

# Parse datafile to get runtime
def get_time(fp):
    """Return the runtime recorded on the last non-blank line of a timing file.

    The line is expected to look like ``<label>: <value>``. The text between
    the first and second ``:`` is returned with spaces and newlines removed.

    Args:
        fp: Path of the timing file to read.

    Returns:
        The runtime as a string (no numeric conversion is done here).

    Raises:
        ValueError: If the file contains no non-blank line.
        IndexError: If the last non-blank line contains no ``:``.
    """
    last_line = None
    with open(fp, "r") as f:
        for line in f:
            # Only remember lines with content, so a trailing blank line
            # cannot hide the timing line that precedes it.
            if line.strip():
                last_line = line

    if last_line is None:
        # Previously an empty file failed with an unbound-variable error.
        raise ValueError("no timing line found in {!r}".format(fp))

    return last_line.split(":")[1].replace("\n", "").replace(" ", "")

# Given an environment name, return a DataFrame with one row per trial.
# Each row has the environment name, the trial number, and the cpu & gpu
# runtimes for all 4 models, so each row has 10 values.
def _collect_device_times(device_fp):
    """Return one runtime per entry of ``models`` for a single cpu/gpu folder.

    Args:
        device_fp: Path of a trial's ``cpu`` or ``gpu`` folder.

    Returns:
        A list with one item per model, in ``models`` order: the value from
        ``get_time`` when ``<device_fp>/<model>/time.txt`` exists, otherwise
        ``nan``. If ``device_fp`` is not a directory, every item is ``nan``.
    """
    if not os.path.isdir(device_fp):
        return [nan] * len(models)

    times = []
    for model in models:
        data_fp = os.path.join(device_fp, model, "time.txt")
        # A model without a time.txt simply has no measurement for this trial.
        times.append(get_time(data_fp) if os.path.isfile(data_fp) else nan)
    return times


def _trial_number(trial):
    """Return the integer after the first ``_`` in a folder name, or None if there is none."""
    try:
        return int(trial.split("_")[1])
    except (IndexError, ValueError):
        return None


def collect_runtime_data(env):
    """Collect the cpu and gpu runtimes of every trial of one environment.

    Trials are the entries of ``profiling/<env>`` whose names have the form
    ``<prefix>_<number>``. Entries whose names do not contain such a number
    (e.g. stray hidden files) are skipped instead of raising.

    Args:
        env: Environment name, i.e. the folder name under ``profiling``.

    Returns:
        A DataFrame with columns ``runtime_data_columns`` and one row per
        trial, ordered by trial number. Missing measurements are ``nan``.
    """
    e_fp = os.path.join("profiling", env)

    # os.listdir returns entries in arbitrary order; sort by trial number so
    # the resulting table is the same on every run and platform.
    numbered_trials = []
    for trial in os.listdir(e_fp):
        number = _trial_number(trial)
        if number is not None:
            numbered_trials.append((number, trial))
    numbered_trials.sort()

    out = []
    for number, trial in numbered_trials:
        trial_fp = os.path.join(e_fp, trial)
        cpu_out = _collect_device_times(os.path.join(trial_fp, "cpu"))
        gpu_out = _collect_device_times(os.path.join(trial_fp, "gpu"))
        out.append([env, number] + cpu_out + gpu_out)

    return pd.DataFrame(out, columns=runtime_data_columns)

# V   Test line   V
# collect_runtime_data("env7")

In [None]:
# This cell collects data for all envs into one dataframe

envs = ["env2","env3","env4","env5","env6","env7","env8","env9",
       "datahub1","datahub2","datahub3","datahub4","datahub5","datahub6",
       "datahub7","datahub8","datahub9","datahub10"]

# "env1" is not listed in `envs`, so it is prepended here to keep the same
# row order as before: env1 first, then every entry of `envs`.
runtime_frames = []
for env in ["env1"] + envs:
    runtime_frames.append(collect_runtime_data(env))

# Concatenate once instead of inside the loop: repeated pd.concat re-copies
# the accumulated frame on every iteration. ignore_index=True yields the same
# fresh 0..n-1 index that reset_index(drop=True) produced.
runtime_data = pd.concat(runtime_frames, ignore_index=True)

# V   Save Data   V
# runtime_data.to_csv("data/runtime_data.csv")

runtime_data

## Large Sample Data

In [None]:
# Parse large_sample_time.txt to get a list of runtimes
def get_large_times(fp):
    """Return the runtime found on every non-blank line of a timing file.

    Each non-blank line is expected to look like ``<label>: <value>``. The
    text between the first and second ``:`` is kept, with spaces and newlines
    removed. Blank lines (such as a trailing empty line) are skipped.

    Args:
        fp: Path of the timing file to read.

    Returns:
        A list of runtime strings in file order; empty if the file has no
        non-blank lines.

    Raises:
        IndexError: If a non-blank line contains no ``:``.
    """
    out = []
    with open(fp, "r") as f:
        for line in f:
            # A blank line has no ':' and would otherwise raise IndexError.
            if not line.strip():
                continue
            out.append(line.split(":")[1].replace("\n", "").replace(" ", ""))
    return out