In [1]:
import pandas as pd
import numpy as np
from numpy import nan
import os
from collections import defaultdict

## Environment Data
Hardware specification of each environment: instance type, vCPU count, CPU memory and CPU model, plus GPU count, GPU memory and GPU model.

In [2]:
# Load the per-environment spec table; the first CSV column is used as the index.
env_data = pd.read_csv("data/env_data.csv", index_col=0)
env_data

# Export of the table back to the same CSV, left disabled
# (presumably so a re-run does not overwrite the file -- confirm before enabling).
# env_data.to_csv("data/env_data.csv")

Unnamed: 0,Environment,Instance,vcpu,cpu_mem (GiB),cpu_type,gpu,gpu_mem (GiB),gpu_type
0,env1,c7a.medium,1,2.0,AMD EPYC Gen4,0,,
1,env2,c7a.large,2,4.0,AMD EPYC Gen4,0,,
2,env3,c7a.xlarge,4,8.0,AMD EPYC Gen4,0,,
3,env4,c7a.2xlarge,8,16.0,AMD EPYC Gen4,0,,
4,env5,c6i.xlarge,4,8.0,Intel Xeon 8375C (Ice Lake),0,,
5,env6,m6i.xlarge,4,16.0,Intel Xeon 8375C (Ice Lake),0,,
6,env7,r6i.xlarge,4,32.0,Intel Xeon 8375C (Ice Lake),0,,
7,env8,r6i.large,2,16.0,Intel Xeon 8375C (Ice Lake),0,,
8,env9,m6i.xlarge,4,16.0,Intel Xeon 8375C (Ice Lake),0,,
9,env10,c6i.2xlarge,8,16.0,Intel Xeon 8375C (Ice Lake),0,,


## Runtime Data
CPU and GPU runtimes measured in each environment.

In [3]:
# Parse datafile to get runtime
def get_time(fp):
    """Extract the profiler runtime(s) recorded in a ``time.txt`` file.

    The file is treated as a GPU profile when the substring ``"gpu"`` occurs
    anywhere in ``fp`` (directory names included), otherwise as a CPU profile.

    Parameters
    ----------
    fp : str
        Path of the text file to parse.

    Returns
    -------
    str or None
        CPU profile: the text after the first ``":"`` of the *last* line
        containing ``"Self CPU time"``, with all spaces removed, or ``None``
        when the file has no such line.
    list of str
        GPU profile: the stripped text after the last ``":"`` of every line
        containing ``"Self CPU time"`` or ``"Self CUDA time"``, in file order
        (empty list when nothing matches).

    Raises
    ------
    OSError
        If ``fp`` cannot be opened.
    """
    is_gpu = "gpu" in fp

    # Start from None so a CPU file without a matching line yields a missing
    # value instead of raising UnboundLocalError at the return statement.
    cpu_time = None
    gpu_times = []

    with open(fp, "r") as f:
        for line in f:
            if "Self CPU time" in line:
                if is_gpu:
                    gpu_times.append(line.split(":")[-1].strip())
                else:
                    # Later matches overwrite earlier ones: the last one wins.
                    cpu_time = line.split(":")[1].replace("\n", "").replace(" ", "")
            elif is_gpu and "Self CUDA time" in line:
                gpu_times.append(line.split(":")[-1].strip())

    return gpu_times if is_gpu else cpu_time
        
# Spot-check the parser on one GPU profile: because the path contains "gpu",
# the result is the list of matched "Self CPU time" / "Self CUDA time" values.
get_time("profiling/datahub10/trial_1/gpu/DNN/time.txt")

['11.884ms', '2.466ms']

In [4]:
# Build the runtime table: one row per (environment, trial) execution and one
# column per hardware/model combination.
#
# Expected path layout: profiling/<environment>/<trial>/<hardware>/<model>/<file>
grouped = defaultdict(list)

# os.walk yields entries in arbitrary, filesystem-dependent order. Sorting the
# directory list in place steers the top-down traversal, and sorting the file
# names keeps row and column order reproducible from run to run.
for path, subdirs, files in os.walk("profiling/"):
    subdirs.sort()

    # Group all data files with their execution
    for name in sorted(files):
        fp = os.path.join(path, name).replace("\\", "/")
        # large_sample_time.txt also contains "time.txt", hence the extra test
        if "time.txt" in fp and "large" not in fp:
            att = fp.split("/")
            grouped[(att[1], att[2])].append(fp)

# Plain dict, so that a later lookup of an unknown key still raises KeyError.
executions = dict(grouped)

records = []
for (environment, trial_dir), paths in executions.items():
    sample = {
        "Environment": environment,
        "Trial": trial_dir.split("_")[-1],
    }

    for fp in paths:
        att = fp.split("/")
        # "c_" marks CPU runs; every other hardware folder is labelled "g_"
        prefix = "c_" if att[3] == "cpu" else "g_"
        sample[prefix + att[4] + "_time"] = get_time(fp)

    records.append(sample)

# Drop the two test environments from the table.
data = pd.DataFrame(records)
data = data[~data["Environment"].isin(["datahubtest", "test_env"])]
data

Unnamed: 0,Environment,Trial,c_CNN_time,c_DNN_time,c_LSTM_time,c_transformer_time,g_CNN_time,g_DNN_time,g_transformer_time,g_LSTM_time
0,datahub1,1,10.059s,13.676s,87.395s,68.193s,"[1.451s, 1.159ms]","[22.576ms, 2.880ms]","[81.142ms, 9.617ms]",
1,datahub10,1,22.238ms,50.162ms,274.854ms,261.713ms,"[1.309s, 930.000us]","[11.884ms, 2.466ms]","[32.986ms, 6.056ms]",
2,datahub2,1,4.267s,4.488s,62.788s,15.780s,"[1.384s, 1.154ms]","[17.723ms, 2.864ms]","[92.822ms, 9.504ms]",
3,datahub3,1,675.796ms,1.191s,7.188s,4.690s,"[1.573s, 1.138ms]","[18.172ms, 2.858ms]","[60.954ms, 9.515ms]",
4,datahub4,1,26.471ms,235.421ms,789.155ms,1.127s,"[1.549s, 1.174ms]","[20.413ms, 2.864ms]","[59.692ms, 9.808ms]",
5,datahub5,1,119.312ms,147.005ms,813.041ms,495.764ms,"[1.469s, 1.139ms]","[16.187ms, 2.862ms]","[57.146ms, 9.494ms]",
6,datahub6,1,19.016s,55.792ms,1971.799s,166.497s,"[1.115s, 935.000us]","[12.194ms, 2.468ms]","[41.017ms, 6.058ms]",
7,datahub7,1,8.634s,46.449ms,903.916s,67.301s,"[1.240s, 935.000us]","[12.700ms, 2.475ms]","[33.814ms, 6.028ms]",
8,datahub8,1,291.756ms,56.872ms,909.861ms,6.987s,"[1.254s, 932.000us]","[20.241ms, 2.464ms]","[34.279ms, 6.034ms]",
9,datahub9,1,83.435ms,49.028ms,380.283ms,701.389ms,"[1.491s, 936.000us]","[11.830ms, 2.481ms]","[33.039ms, 6.075ms]",


In [5]:
# data.to_csv("data/runtime_data.csv")

## Large Sample Data

In [6]:
# Parse large_sample_time.txt to get a list of runtimes
def get_large_times(fp):
    """Parse a ``large_sample_time.txt`` file into ``[label, runtime]`` pairs.

    For every non-blank line, the label is the text before the first space and
    the runtime is the text between the first and second ``":"`` with spaces
    and the trailing newline removed. Both are returned as strings.

    Parameters
    ----------
    fp : str
        Path of the file to parse.

    Returns
    -------
    list of [str, str]
        One ``[label, runtime]`` pair per non-blank line, in file order.

    Raises
    ------
    OSError
        If ``fp`` cannot be opened.
    IndexError
        If a non-blank line contains no ``":"``.
    """
    out = []
    with open(fp, "r") as f:
        for line in f:
            # Blank or whitespace-only lines carry no measurement; skip them
            # rather than fail on the missing ":" separator.
            if not line.strip():
                continue
            label = line.split(" ")[0]
            runtime = line.split(":")[1].replace("\n", "").replace(" ", "")
            out.append([label, runtime])
    return out

# get_large_times("profiling/datahub6/trial_1/cpu/LSTM/large_sample_time.txt")

In [7]:
# Build the large-sample table: one row per large_sample_time.txt file, with
# one "<N>_samples" column per sample size found in that file.
#
# Expected path layout: profiling/<environment>/<trial>/<hardware>/<model>/<file>
# NOTE: `data` is rebound to a list of row dicts here, replacing whatever it
# referred to before this cell ran.
data = []

# Sort the traversal so the row order is reproducible; os.walk order is
# otherwise filesystem-dependent.
for path, subdirs, files in os.walk("profiling/"):
    subdirs.sort()

    # Process large_sample_time.txt files
    for name in sorted(files):
        fp = os.path.join(path, name).replace("\\", "/")
        if "large_sample_time.txt" not in fp:
            continue

        # Collect data for this sample
        att = fp.split("/")
        DataSample = {
            "Environment": att[1],
            # last "_" field, the same rule the runtime table uses for Trial
            "Trial": att[2].split("_")[-1],
            "Hardware": att[3],
            "Model": att[4],
        }

        for n_samples, runtime in get_large_times(fp):
            DataSample[n_samples + "_samples"] = runtime

        data.append(DataSample)

ls_data = pd.DataFrame(data)

# Put the "<N>_samples" columns in ascending numeric order. Files do not all
# contain the same sample sizes, so first-seen order would otherwise leave
# e.g. 5000 ahead of 2000. Columns without a numeric prefix keep their
# relative order at the end.
meta_cols = ["Environment", "Trial", "Hardware", "Model"]
sample_cols = sorted(
    (col for col in ls_data.columns if col not in meta_cols),
    key=lambda col: (0, int(col.split("_")[0])) if col.split("_")[0].isdecimal() else (1, 0),
)
ls_data = ls_data[meta_cols + sample_cols]
ls_data

Unnamed: 0,Environment,Trial,Hardware,Model,50_samples,100_samples,500_samples,1000_samples,2000_samples,5000_samples
0,datahub1,1,cpu,CNN,3.0022855649585836,6.584249189007096,6.470418524986599,9.318261566979345,,14.207533341948874
1,datahub1,1,cpu,DNN,5.385303889983334,4.988413676037453,10.940336151979864,13.659167839039583,,72.59668333100853
2,datahub1,1,cpu,LSTM,60.165907881979365,91.9948421830195,219.34383639501175,347.4516330069746,528.6369186760276,
3,datahub1,1,cpu,transformer,42.72831858799327,53.8845977029996,129.26617205003276,226.27289840998128,,
4,datahub1,1,gpu,CNN,0.002213948988355696,0.002042289008386433,0.0035906920093111694,0.003689524019137025,,0.004648194997571409
...,...,...,...,...,...,...,...,...,...,...
86,test_env,1,cpu,transformer,0.6687056000000027,1.4039049999999982,6.1737246,13.480422500000003,,
87,test_env,1,gpu,CNN,0.012622800000002599,0.004626399999999364,0.0071554999999960955,0.004742200000002583,,0.008606799999995474
88,test_env,1,gpu,DNN,0.0010876999999993586,0.0010305000000059295,0.0009809999999959018,0.0008336999999940531,,0.017260699999994245
89,test_env,1,gpu,LSTM,0.015427700000003597,0.015543999999998448,0.018407000000003393,0.020451100000002498,0.02110169999999556,


In [8]:
# ls_data.to_csv("data/large_sample_data.csv")