In [1]:
import pandas as pd
import numpy as np
import datetime
import json
import os
import itertools
import glob
import copy
import warnings


# Relative path "./log/"; presumably an output/log directory for later cells —
# it is not referenced by any of the cells visible here (TODO confirm usage).
out_dir="./log/"


def replace_str(target):
    """Return ``target`` with every newline and comma character removed.

    Args:
        target: String to clean.

    Returns:
        A copy of ``target`` without any ``'\\n'`` or ``','`` characters.
    """
    for unwanted in ('\n', ','):
        target = target.replace(unwanted, '')
    return target

def find_value(arr, target, jumpto=1):
    """Return the cleaned element found ``jumpto`` positions after ``target``.

    The element at ``arr.index(target) + jumpto`` is passed through
    ``replace_str`` before being returned.

    Args:
        arr: Sequence supporting ``.index()`` and integer indexing.
        target: Element to locate in ``arr``.
        jumpto: Offset from the position of ``target`` (default 1, i.e. the
            element immediately following it).

    Returns:
        The selected element after ``replace_str`` has been applied.

    Raises:
        ValueError: If the lookup or the clean-up fails for any ordinary
            reason (``target`` missing, offset out of range, ...). The
            message contains ``arr`` and ``target``; the underlying error is
            attached as ``__cause__``.
    """
    try:
        raw = arr[arr.index(target) + jumpto]
        num = replace_str(raw)
    # `Exception` (not a bare `except`) so that KeyboardInterrupt/SystemExit
    # still propagate instead of being reported as a ValueError.
    except Exception as err:
        raise ValueError(
            f'{arr}, {target}') from err
    return num

# Read the simulator settings; "FILE" and "IOFILE" name the two traces to load.
with open('./sim_conf.json') as json_file:
    conf = json.load(json_file)


# TODO: fix directory finding as input
# Trace names that used to be hard-coded here instead of read from the config:
#   fsNprepNloadNtrain_Imagenet_default_resnet50_epoch5_b2048_worker12_thread0
#   fsNprepNloadNtrain_Imagenet_randaugment_resnet50_epoch5_b2048_worker12_thread0
#   fsNprepNloadNtrain_openimage_default_resnet50_epoch1_b2048_worker12_thread0
origin_filename = conf["FILE"]
io_filename = conf["IOFILE"]

# Path template shared by every parsed CSV loaded below.
parse_dir = "../dsanalyzer_parsed/DDP4GPUFULLTRACE/{suffix}/{filename}.csv"

# File stems derived from the trace named by conf["FILE"].
simp_filename = f'{origin_filename}_simp'
fetch_filename = f"{simp_filename}_fetchdifftime"
startdiff_filename = f"{simp_filename}_fetchstartdifftime"

# File stems derived from the trace named by conf["IOFILE"].
io_simp_filename = f'{io_filename}_simp'
io_fetch_filename = f"{io_simp_filename}_fetchdifftime"
io_startdiff_filename = f"{io_simp_filename}_fetchstartdifftime"

# Resolve each stem to its CSV path under the "simp" sub-directory.
simp_datafile, fetch_datafile, startdiff_datafile = (
    parse_dir.format(suffix="simp", filename=stem)
    for stem in (simp_filename, fetch_filename, startdiff_filename)
)
io_simp_datafile, io_fetch_datafile, io_startdiff_datafile = (
    parse_dir.format(suffix="simp", filename=stem)
    for stem in (io_simp_filename, io_fetch_filename, io_startdiff_filename)
)

# Load every CSV, using its first column as the row index.
simp_df, fetch_df, start_df = (
    pd.read_csv(path, index_col=0)
    for path in (simp_datafile, fetch_datafile, startdiff_datafile)
)
io_simp_df, io_fetch_df, io_start_df = (
    pd.read_csv(path, index_col=0)
    for path in (io_simp_datafile, io_fetch_datafile, io_startdiff_datafile)
)

In [2]:
# Train type and setup
# The path component that directly follows "dsanalyzer_parsed" in the data
# file path is used as the training-setup label.
info = simp_datafile.split('/')

trainType = find_value(info, "dsanalyzer_parsed")

# Derive the GPU count from the label. This must be ONE if/elif chain: with a
# detached leading `if`, a "2GPU" match was always overwritten by the
# following chain's `elif`/`else` branch (8 or 1).
if "2GPU" in trainType:
    gpu_num = 2
elif "4GPU" in trainType:
    gpu_num = 4
elif "DDP" in trainType:
    # DDP label without an explicit 2GPU/4GPU marker.
    gpu_num = 8
else:
    gpu_num = 1


# Fetch time align

In [3]:
# Make gpu columns for simulation
# One column name per GPU for every per-GPU quantity; only the start/fetch/
# fetch-done lists are used in this cell and the next part of it, the others
# are kept because cells outside this view may rely on them.
gpu_start_col = [f"Start time_gpu{i}" for i in range(gpu_num)]
gpu_fetch_col = [f"Fetch time (sec)_gpu{i}" for i in range(gpu_num)]
gpu_fetch_done_col = [f"Fetch done time (sec)_gpu{i}" for i in range(gpu_num)]
gpu_training_stall_col = [f"Training Stall time (sec)_gpu{i}" for i in range(gpu_num)]
gpu_training_col = [f"Training start time_gpu{i}" for i in range(gpu_num)]
gpu_training_time_col = [f"Iteration time (sec)_gpu{i}" for i in range(gpu_num)]
gpu_pure_training_col = [f"Pure training_gpu{i}" for i in range(gpu_num)]

single_gpu_prefetch_count = []
train_col = ["Epoch", "Index number"]

# Parse the per-GPU start timestamps of both traces into datetimes.
# `errors='ignore'` was dropped: it is deprecated in pandas and, on a parse
# failure, silently left the column as strings, which then broke the
# "start time + fetch duration" arithmetic performed later. Failing here gives
# a clearer error at the real cause.
for i in range(gpu_num):
    start_df[gpu_start_col[i]] = pd.to_datetime(
        start_df[gpu_start_col[i]], format='%Y-%m-%d %H:%M:%S.%f')
    io_start_df[gpu_start_col[i]] = pd.to_datetime(
        io_start_df[gpu_start_col[i]], format='%Y-%m-%d %H:%M:%S.%f')

def _build_fetch_timeline(fetch, start):
    """Combine per-GPU fetch durations and start times into one Epoch-1 frame.

    Args:
        fetch: DataFrame holding the ``train_col`` key columns and the
            ``gpu_fetch_col`` columns (fetch durations in seconds).
        start: DataFrame holding the ``train_col`` key columns and the
            ``gpu_start_col`` columns (already parsed to datetimes).

    Returns:
        A new DataFrame restricted to rows with ``Epoch == 1``, sorted by
        ``train_col``, with columns in this order: key columns, per-GPU fetch
        times, per-GPU start times, "Min fetch time (sec)",
        "Avg fetch time (sec)", "Min start time", per-GPU fetch-done times.
        The row index is the position in the sorted (pre-filter) frame.
    """
    frame = fetch[train_col + gpu_fetch_col].merge(
        right=start[train_col + gpu_start_col], on=train_col)

    # Row-wise summaries are taken from the merged frame itself. Assigning
    # them from the source frames relies on index-label alignment, which is
    # only correct when the CSV index happens to equal the merge's fresh
    # 0..n-1 index.
    frame["Min fetch time (sec)"] = frame[gpu_fetch_col].min(axis=1)
    frame["Avg fetch time (sec)"] = frame[gpu_fetch_col].mean(axis=1)
    frame["Min start time"] = frame[gpu_start_col].min(axis=1)

    frame = frame.sort_values(by=train_col).reset_index(drop=True)
    # .copy() so the column assignments below write to an independent frame
    # rather than a slice (avoids SettingWithCopyWarning).
    frame = frame[frame["Epoch"] == 1].copy()

    # Fetch-done timestamp = start timestamp + fetch duration (seconds).
    for start_name, fetch_name, done_name in zip(
            gpu_start_col, gpu_fetch_col, gpu_fetch_done_col):
        frame[done_name] = frame[start_name] + pd.to_timedelta(frame[fetch_name], 's')
    return frame


# Init simulation informations
simulation_df = _build_fetch_timeline(fetch_df, start_df)

# Same construction for the trace named by conf["IOFILE"].
io_df = _build_fetch_timeline(io_fetch_df, io_start_df)

# NOTE(review): the original cell first set this from simulation_df and then
# immediately overwrote it with the io_df value; only the final value is kept.
# Confirm which frame later cells actually expect.
max_index_number = io_df["Index number"].max()


In [4]:
# Display the frame built from the conf["FILE"] trace (rows with Epoch == 1 only).
simulation_df

Unnamed: 0,Epoch,Index number,Fetch time (sec)_gpu0,Fetch time (sec)_gpu1,Fetch time (sec)_gpu2,Fetch time (sec)_gpu3,Start time_gpu0,Start time_gpu1,Start time_gpu2,Start time_gpu3,Min fetch time (sec),Avg fetch time (sec),Min start time,Fetch done time (sec)_gpu0,Fetch done time (sec)_gpu1,Fetch done time (sec)_gpu2,Fetch done time (sec)_gpu3
0,1,0,8.112918,8.386826,8.087636,8.168932,2021-09-01 19:48:46.728082,2021-09-01 19:48:46.722174,2021-09-01 19:48:46.727364,2021-09-01 19:48:46.727068,8.087636,8.189078,2021-09-01 19:48:46.722174,2021-09-01 19:48:54.841000319,2021-09-01 19:48:55.108999757,2021-09-01 19:48:54.814999759,2021-09-01 19:48:54.895999750
1,1,1,8.426491,8.199841,8.215275,8.266806,2021-09-01 19:48:46.727509,2021-09-01 19:48:46.722159,2021-09-01 19:48:46.727725,2021-09-01 19:48:46.727194,8.199841,8.277103,2021-09-01 19:48:46.722159,2021-09-01 19:48:55.154000052,2021-09-01 19:48:54.922000312,2021-09-01 19:48:54.942999923,2021-09-01 19:48:54.994000042
2,1,2,8.220429,8.060581,11.568502,8.332884,2021-09-01 19:48:46.727571,2021-09-01 19:48:46.722419,2021-09-01 19:48:46.727498,2021-09-01 19:48:46.727116,8.060581,9.045599,2021-09-01 19:48:46.722419,2021-09-01 19:48:54.948000301,2021-09-01 19:48:54.783000278,2021-09-01 19:48:58.295999747,2021-09-01 19:48:55.060000074
3,1,3,8.567206,8.384353,8.628927,8.750336,2021-09-01 19:48:55.044794,2021-09-01 19:48:55.326647,2021-09-01 19:48:55.015073,2021-09-01 19:48:55.105664,8.384353,8.582705,2021-09-01 19:48:55.015073,2021-09-01 19:49:03.612000137,2021-09-01 19:49:03.710999975,2021-09-01 19:49:03.643999721,2021-09-01 19:49:03.855999751
4,1,4,8.976840,9.103020,8.669292,8.500594,2021-09-01 19:48:55.374160,2021-09-01 19:48:55.129980,2021-09-01 19:48:55.147708,2021-09-01 19:48:55.201406,8.500594,8.812437,2021-09-01 19:48:55.129980,2021-09-01 19:49:04.351000264,2021-09-01 19:49:04.233000375,2021-09-01 19:49:03.817000187,2021-09-01 19:49:03.702000102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,1,846,8.700867,8.742307,8.309392,8.876083,2021-09-01 20:32:38.071133,2021-09-01 20:32:35.288693,2021-09-01 20:32:42.396608,2021-09-01 20:32:35.873917,8.309392,8.657162,2021-09-01 20:32:35.288693,2021-09-01 20:32:46.772000408,2021-09-01 20:32:44.031000186,2021-09-01 20:32:50.705999790,2021-09-01 20:32:44.749999610
847,1,847,8.788954,8.310897,8.926671,8.654566,2021-09-01 20:32:36.299046,2021-09-01 20:32:42.242103,2021-09-01 20:32:36.710329,2021-09-01 20:32:36.001434,8.310897,8.670272,2021-09-01 20:32:36.001434,2021-09-01 20:32:45.088000452,2021-09-01 20:32:50.552999554,2021-09-01 20:32:45.636999536,2021-09-01 20:32:44.655999743
848,1,848,8.836856,7.349450,8.761064,8.692516,2021-09-01 20:32:39.316144,2021-09-01 20:32:52.088550,2021-09-01 20:32:37.389936,2021-09-01 20:32:36.898484,7.349450,8.409971,2021-09-01 20:32:36.898484,2021-09-01 20:32:48.152999778,2021-09-01 20:32:59.438000491,2021-09-01 20:32:46.151000104,2021-09-01 20:32:45.590999536
849,1,849,8.056848,8.511557,7.986414,8.299552,2021-09-01 20:32:46.992152,2021-09-01 20:32:44.249443,2021-09-01 20:32:50.900586,2021-09-01 20:32:44.983448,7.986414,8.213593,2021-09-01 20:32:44.249443,2021-09-01 20:32:55.048999765,2021-09-01 20:32:52.760999578,2021-09-01 20:32:58.886999697,2021-09-01 20:32:53.283000161


In [5]:
# Display the frame built from the conf["IOFILE"] trace (rows with Epoch == 1 only).
io_df

Unnamed: 0,Epoch,Index number,Fetch time (sec)_gpu0,Fetch time (sec)_gpu1,Fetch time (sec)_gpu2,Fetch time (sec)_gpu3,Start time_gpu0,Start time_gpu1,Start time_gpu2,Start time_gpu3,Min fetch time (sec),Avg fetch time (sec),Min start time,Fetch done time (sec)_gpu0,Fetch done time (sec)_gpu1,Fetch done time (sec)_gpu2,Fetch done time (sec)_gpu3
0,1,0,8.089684,8.223212,8.183165,8.042392,2021-09-01 14:42:08.165316,2021-09-01 14:42:08.169788,2021-09-01 14:42:08.159835,2021-09-01 14:42:08.155608,8.042392,8.134613,2021-09-01 14:42:08.155608,2021-09-01 14:42:16.254999627,2021-09-01 14:42:16.393000288,2021-09-01 14:42:16.343000344,2021-09-01 14:42:16.198000490
1,1,1,8.375083,8.106987,8.160603,8.327214,2021-09-01 14:42:08.141917,2021-09-01 14:42:08.173013,2021-09-01 14:42:08.178397,2021-09-01 14:42:08.170786,8.106987,8.242472,2021-09-01 14:42:08.141917,2021-09-01 14:42:16.517000471,2021-09-01 14:42:16.279999851,2021-09-01 14:42:16.338999871,2021-09-01 14:42:16.498000497
2,1,2,8.047440,8.003416,14.615137,8.182197,2021-09-01 14:42:08.168560,2021-09-01 14:42:08.170584,2021-09-01 14:42:08.174863,2021-09-01 14:42:08.172803,8.003416,9.712048,2021-09-01 14:42:08.168560,2021-09-01 14:42:16.216000244,2021-09-01 14:42:16.174000318,2021-09-01 14:42:22.790000077,2021-09-01 14:42:16.355000047
3,1,3,8.471639,8.223656,8.489826,8.667492,2021-09-01 14:42:16.640361,2021-09-01 14:42:16.752344,2021-09-01 14:42:16.841174,2021-09-01 14:42:16.581508,8.223656,8.463153,2021-09-01 14:42:16.581508,2021-09-01 14:42:25.112000328,2021-09-01 14:42:24.975999889,2021-09-01 14:42:25.330999745,2021-09-01 14:42:25.249000479
4,1,4,8.848523,14.920232,8.553010,8.565763,2021-09-01 14:42:17.161477,2021-09-01 14:42:16.658768,2021-09-01 14:42:16.739990,2021-09-01 14:42:16.878237,8.553010,10.221882,2021-09-01 14:42:16.658768,2021-09-01 14:42:26.010000103,2021-09-01 14:42:31.579000203,2021-09-01 14:42:25.292999593,2021-09-01 14:42:25.444000253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,1,846,8.587554,8.515278,7.870457,8.724831,2021-09-01 15:27:21.866446,2021-09-01 15:27:27.484722,2021-09-01 15:27:35.759543,2021-09-01 15:27:28.758169,7.870457,8.424530,2021-09-01 15:27:21.866446,2021-09-01 15:27:30.454000083,2021-09-01 15:27:35.999999687,2021-09-01 15:27:43.630000231,2021-09-01 15:27:37.483000373
847,1,847,8.637323,8.450207,8.766960,8.591708,2021-09-01 15:27:29.308677,2021-09-01 15:27:28.725793,2021-09-01 15:27:29.879040,2021-09-01 15:27:28.479292,8.450207,8.611550,2021-09-01 15:27:28.479292,2021-09-01 15:27:37.946000259,2021-09-01 15:27:37.176000454,2021-09-01 15:27:38.646000333,2021-09-01 15:27:37.070999518
848,1,848,8.610526,6.996800,8.629963,8.533362,2021-09-01 15:27:30.260474,2021-09-01 15:27:44.105200,2021-09-01 15:27:29.848037,2021-09-01 15:27:29.523638,6.996800,8.192663,2021-09-01 15:27:29.523638,2021-09-01 15:27:38.870999732,2021-09-01 15:27:51.102000427,2021-09-01 15:27:38.477999693,2021-09-01 15:27:38.057000248
849,1,849,8.532738,8.069496,7.442202,7.868213,2021-09-01 15:27:30.839262,2021-09-01 15:27:36.475504,2021-09-01 15:27:44.019798,2021-09-01 15:27:37.919787,7.442202,7.978162,2021-09-01 15:27:30.839262,2021-09-01 15:27:39.372000332,2021-09-01 15:27:44.545000222,2021-09-01 15:27:51.461999995,2021-09-01 15:27:45.788000277


In [6]:
# Per-GPU fetch-time difference: conf["FILE"] trace minus conf["IOFILE"] trace.
# pandas aligns both frames on their row index before subtracting.
out = simulation_df[gpu_fetch_col].sub(io_df[gpu_fetch_col])
out

Unnamed: 0,Fetch time (sec)_gpu0,Fetch time (sec)_gpu1,Fetch time (sec)_gpu2,Fetch time (sec)_gpu3
0,0.023235,0.163613,-0.095530,0.126539
1,0.051408,0.092854,0.054672,-0.060408
2,0.172989,0.057165,-3.046635,0.150687
3,0.095567,0.160697,0.139101,0.082843
4,0.128317,-5.817212,0.116283,-0.065169
...,...,...,...,...
846,0.113313,0.227029,0.438935,0.151251
847,0.151631,-0.139311,0.159710,0.062858
848,0.226330,0.352650,0.131101,0.159153
849,-0.475891,0.442060,0.544212,0.431339


In [7]:
# Per GPU column: how many rows have a negative difference. Non-matching
# cells are masked to NaN and therefore not counted.
out.where(out[gpu_fetch_col] < 0).count(numeric_only=True)

Fetch time (sec)_gpu0    191
Fetch time (sec)_gpu1    198
Fetch time (sec)_gpu2    233
Fetch time (sec)_gpu3    197
dtype: int64

In [8]:
# Per GPU column: number of non-NaN difference values.
out.count(numeric_only=True)

Fetch time (sec)_gpu0    851
Fetch time (sec)_gpu1    851
Fetch time (sec)_gpu2    851
Fetch time (sec)_gpu3    851
dtype: int64

In [9]:
# Per GPU column: mean of the strictly positive differences only
# (all other cells are masked to NaN and skipped by mean()).
out.where(out[gpu_fetch_col] > 0).mean()

Fetch time (sec)_gpu0    0.255947
Fetch time (sec)_gpu1    0.287583
Fetch time (sec)_gpu2    0.224345
Fetch time (sec)_gpu3    0.234986
dtype: float64

In [10]:
# Mean over GPUs of the per-GPU mean positive difference.
out.where(out[gpu_fetch_col] > 0).mean().mean()

0.2507152839105789

In [11]:
# Mean fetch time per GPU column for the conf["FILE"] trace.
simulation_df.loc[:, gpu_fetch_col].mean()

Fetch time (sec)_gpu0    8.993216
Fetch time (sec)_gpu1    8.992541
Fetch time (sec)_gpu2    8.954758
Fetch time (sec)_gpu3    8.942960
dtype: float64

In [12]:
# Mean over GPUs of the per-GPU mean fetch time for the conf["FILE"] trace.
simulation_df.loc[:, gpu_fetch_col].mean().mean()

8.970868762255867