In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [2]:
logs_path = Path("logs")

# Standalone

In [3]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_global_epochs_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_global_epochs_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_global_epochs_df = pd.concat(workers_global_epochs_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_global_epochs_df)
print(workers_events_pairs)
workers_global_epochs_df = compute_time_elapsed(workers_global_epochs_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_global_epochs_df)

[('start.epoch', 'end.epoch'), ('start.train', 'end.train'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:05:42.738041162,2024-05-14 22:05:43.281092167,epoch,0.543051,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),0
1,2024-05-14 22:05:43.281520844,2024-05-14 22:05:46.908773184,epoch,3.627252,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),1
2,2024-05-14 22:05:46.909579039,2024-05-14 22:05:46.991590977,epoch,0.082012,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),2
3,2024-05-14 22:05:46.994435072,2024-05-14 22:05:47.062954903,epoch,0.068520,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),3
4,2024-05-14 22:05:47.063235044,2024-05-14 22:05:47.138731003,epoch,0.075496,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),4
...,...,...,...,...,...,...,...
995,2024-05-14 22:08:11.006348133,2024-05-14 22:08:11.013747931,send,0.007400,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),995
996,2024-05-14 22:08:11.086536884,2024-05-14 22:08:11.096235037,send,0.009698,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),996
997,2024-05-14 22:08:11.161188126,2024-05-14 22:08:11.168251991,send,0.007064,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),997
998,2024-05-14 22:08:11.237879038,2024-05-14 22:08:11.245826006,send,0.007947,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),998


Unnamed: 0,epoch,start.epoch,end.epoch,start.train,end.train,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv,...,dataset,worker,log,world_size,time_elapsed.epoch,time_elapsed.train,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-14 22:05:42.738041162,2024-05-14 22:05:43.281092167,2024-05-14 22:05:42.842543125,2024-05-14 22:05:43.266789198,2024-05-14 22:05:42.744397163,2024-05-14 22:05:42.842543125,2024-05-14 22:05:43.266789198,2024-05-14 22:05:43.281076908,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.543051,0.424246,0.098146,0.014288,,
1,1,2024-05-14 22:05:43.281520844,2024-05-14 22:05:46.908773184,2024-05-14 22:05:46.865521908,2024-05-14 22:05:46.902816057,2024-05-14 22:05:43.284696102,2024-05-14 22:05:46.865521908,2024-05-14 22:05:46.902817011,2024-05-14 22:05:46.908612013,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,3.627252,0.037294,3.580826,0.005795,,
2,2,2024-05-14 22:05:46.909579039,2024-05-14 22:05:46.991590977,2024-05-14 22:05:46.964815140,2024-05-14 22:05:46.982262135,2024-05-14 22:05:46.912746906,2024-05-14 22:05:46.964814186,2024-05-14 22:05:46.982262135,2024-05-14 22:05:46.991578102,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.082012,0.017447,0.052067,0.009316,,
3,3,2024-05-14 22:05:46.994435072,2024-05-14 22:05:47.062954903,2024-05-14 22:05:47.039069891,2024-05-14 22:05:47.056173086,2024-05-14 22:05:46.996181965,2024-05-14 22:05:47.039068937,2024-05-14 22:05:47.056173086,2024-05-14 22:05:47.062943935,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.068520,0.017103,0.042887,0.006771,,
4,4,2024-05-14 22:05:47.063235044,2024-05-14 22:05:47.138731003,2024-05-14 22:05:47.112349987,2024-05-14 22:05:47.129127979,2024-05-14 22:05:47.064936876,2024-05-14 22:05:47.112349987,2024-05-14 22:05:47.129127979,2024-05-14 22:05:47.138672113,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.075496,0.016778,0.047413,0.009544,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:08:10.954139948,2024-05-14 22:08:11.013767958,2024-05-14 22:08:10.991689920,2024-05-14 22:08:11.006346941,2024-05-14 22:08:10.956226110,2024-05-14 22:08:10.991688013,2024-05-14 22:08:11.006348133,2024-05-14 22:08:11.013747931,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.059628,0.014657,0.035462,0.007400,,
996,996,2024-05-14 22:08:11.030403137,2024-05-14 22:08:11.096248150,2024-05-14 22:08:11.070521116,2024-05-14 22:08:11.086536884,2024-05-14 22:08:11.032202959,2024-05-14 22:08:11.070520163,2024-05-14 22:08:11.086536884,2024-05-14 22:08:11.096235037,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.065845,0.016016,0.038317,0.009698,,
997,997,2024-05-14 22:08:11.111308098,2024-05-14 22:08:11.168266058,2024-05-14 22:08:11.145148993,2024-05-14 22:08:11.161187172,2024-05-14 22:08:11.112967968,2024-05-14 22:08:11.145148993,2024-05-14 22:08:11.161188126,2024-05-14 22:08:11.168251991,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.056958,0.016038,0.032181,0.007064,,
998,998,2024-05-14 22:08:11.183435917,2024-05-14 22:08:11.245840788,2024-05-14 22:08:11.221864939,2024-05-14 22:08:11.237879038,2024-05-14 22:08:11.186115026,2024-05-14 22:08:11.221864939,2024-05-14 22:08:11.237879038,2024-05-14 22:08:11.245826006,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.062405,0.016014,0.035750,0.007947,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.send_data', 'end.send_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:05:42.724406004,2024-05-14 22:05:46.832384109,epoch,4.107978,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),0
1,2024-05-14 22:05:46.832906008,2024-05-14 22:05:46.939491034,epoch,0.106585,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),1
2,2024-05-14 22:05:46.940001011,2024-05-14 22:05:47.004346848,epoch,0.064346,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),2
3,2024-05-14 22:05:47.004744053,2024-05-14 22:05:47.076805830,epoch,0.072062,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),3
4,2024-05-14 22:05:47.077163935,2024-05-14 22:05:47.151941776,epoch,0.074778,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),4
...,...,...,...,...,...,...,...
800,2024-05-14 22:07:41.188825130,2024-05-14 22:07:41.645806074,is,0.456981,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),800
850,2024-05-14 22:07:48.711030960,2024-05-14 22:07:49.201648235,is,0.490617,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),850
900,2024-05-14 22:07:55.993383884,2024-05-14 22:07:56.472887993,is,0.479504,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),900
950,2024-05-14 22:08:03.734187126,2024-05-14 22:08:04.175956964,is,0.441770,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),950


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,...,dataset,world_size,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 22:05:42.724406004,2024-05-14 22:05:46.832384109,2024-05-14 22:05:42.724406004,2024-05-14 22:05:43.558881044,2024-05-14 22:05:42.826583147,2024-05-14 22:05:43.281902075,2024-05-14 22:05:43.281903028,2024-05-14 22:05:43.408430099,2024-05-14 22:05:43.408604860,...,server,4,logs/mdgan.4.cifar.server.logs.json,4.107978,0.834475,0.455319,0.126527,0.150275,2.820525,0.418615
1,1,2024-05-14 22:05:46.832906008,2024-05-14 22:05:46.939491034,2024-05-14 22:05:46.832906008,2024-05-14 22:05:46.939490080,2024-05-14 22:05:46.836228132,2024-05-14 22:05:46.914565802,2024-05-14 22:05:46.914566994,2024-05-14 22:05:46.926650763,2024-05-14 22:05:46.926774979,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.106585,0.106584,0.078338,0.012084,0.012714,,
2,2,2024-05-14 22:05:46.940001011,2024-05-14 22:05:47.004346848,2024-05-14 22:05:46.940001011,2024-05-14 22:05:47.004346848,2024-05-14 22:05:46.941838026,2024-05-14 22:05:46.991953850,2024-05-14 22:05:46.991955042,2024-05-14 22:05:47.001419067,2024-05-14 22:05:47.001538038,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.064346,0.064346,0.050116,0.009464,0.002808,,
3,3,2024-05-14 22:05:47.004744053,2024-05-14 22:05:47.076805830,2024-05-14 22:05:47.004744053,2024-05-14 22:05:47.076805115,2024-05-14 22:05:47.006289959,2024-05-14 22:05:47.065474987,2024-05-14 22:05:47.065476178,2024-05-14 22:05:47.073930025,2024-05-14 22:05:47.074024916,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.072062,0.072061,0.059185,0.008454,0.002780,,
4,4,2024-05-14 22:05:47.077163935,2024-05-14 22:05:47.151941776,2024-05-14 22:05:47.077163935,2024-05-14 22:05:47.151941061,2024-05-14 22:05:47.078397989,2024-05-14 22:05:47.139221907,2024-05-14 22:05:47.139223099,2024-05-14 22:05:47.149017096,2024-05-14 22:05:47.149132013,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.074778,0.074777,0.060824,0.009794,0.002808,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:08:10.970428944,2024-05-14 22:08:11.033056974,2024-05-14 22:08:10.970428944,2024-05-14 22:08:11.033056020,2024-05-14 22:08:10.971832037,2024-05-14 22:08:11.019113064,2024-05-14 22:08:11.019114017,2024-05-14 22:08:11.030361176,2024-05-14 22:08:11.030448914,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.062628,0.062627,0.047281,0.011247,0.002607,,
996,996,2024-05-14 22:08:11.048902988,2024-05-14 22:08:11.109341860,2024-05-14 22:08:11.048902988,2024-05-14 22:08:11.109338999,2024-05-14 22:08:11.050730944,2024-05-14 22:08:11.095983028,2024-05-14 22:08:11.095984221,2024-05-14 22:08:11.105937004,2024-05-14 22:08:11.106181860,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.060439,0.060436,0.045252,0.009953,0.003156,,
997,997,2024-05-14 22:08:11.126417875,2024-05-14 22:08:11.185451031,2024-05-14 22:08:11.126417875,2024-05-14 22:08:11.185450077,2024-05-14 22:08:11.127881050,2024-05-14 22:08:11.170811892,2024-05-14 22:08:11.170814037,2024-05-14 22:08:11.182792902,2024-05-14 22:08:11.182883978,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.059033,0.059032,0.042931,0.011979,0.002565,,
998,998,2024-05-14 22:08:11.201513052,2024-05-14 22:08:11.260336876,2024-05-14 22:08:11.201513052,2024-05-14 22:08:11.260336161,2024-05-14 22:08:11.203176022,2024-05-14 22:08:11.247613907,2024-05-14 22:08:11.247614861,2024-05-14 22:08:11.257567883,2024-05-14 22:08:11.257663012,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.058824,0.058823,0.044438,0.009953,0.002673,,


In [6]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))
logs_standalones

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: -1 days +23:56:47.360464811
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.is,fid,is,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 22:05:42.724405193,2024-05-14 22:05:46.912676954,2024-05-14 22:05:42.724405193,2024-05-14 22:05:43.144715928,0,1.472082,2.691529,2024-05-14 22:05:43.144715928,NaT,...,2024-05-14 22:05:46.912641906,427.379852,1.097491,cifar,cifar.standalone.logs.json,4.188272,0.420311,,3.187721,0.563966
1,1,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976365947,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976363801,1,1.784279,2.096194,2024-05-14 22:05:46.976363801,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.032219,0.032217,,,
2,2,2024-05-14 22:05:47.002261781,2024-05-14 22:05:47.023092174,2024-05-14 22:05:47.002262973,2024-05-14 22:05:47.023089074,2,1.396755,2.365873,2024-05-14 22:05:47.023090028,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.020830,0.020826,,,
3,3,2024-05-14 22:05:47.051220083,2024-05-14 22:05:47.073914909,2024-05-14 22:05:47.051221036,2024-05-14 22:05:47.073912048,3,1.701418,2.404415,2024-05-14 22:05:47.073913001,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.022695,0.022691,,,
4,4,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127109908,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127107047,4,1.705129,2.500046,2024-05-14 22:05:47.127107047,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.024832,0.024829,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:08:06.839388036,2024-05-14 22:08:06.869122886,2024-05-14 22:08:06.839388036,2024-05-14 22:08:06.869119070,995,0.500290,3.374500,2024-05-14 22:08:06.869120024,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.029735,0.029731,,,
996,996,2024-05-14 22:08:06.918587112,2024-05-14 22:08:06.955421113,2024-05-14 22:08:06.918588065,2024-05-14 22:08:06.955417060,996,0.727101,2.680925,2024-05-14 22:08:06.955418014,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.036834,0.036829,,,
997,997,2024-05-14 22:08:06.995272064,2024-05-14 22:08:07.016181850,2024-05-14 22:08:06.995272064,2024-05-14 22:08:07.016180181,997,1.054406,1.746657,2024-05-14 22:08:07.016180181,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.020910,0.020908,,,
998,998,2024-05-14 22:08:07.055657052,2024-05-14 22:08:07.082235002,2024-05-14 22:08:07.055658006,2024-05-14 22:08:07.082231902,998,1.092340,1.281015,2024-05-14 22:08:07.082231902,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.026578,0.026574,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:05:42.724405193,2024-05-14 22:05:46.912676954,epoch,4.188272,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976365947,epoch,0.032219,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 22:05:47.002261781,2024-05-14 22:05:47.023092174,epoch,0.020830,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 22:05:47.051220083,2024-05-14 22:05:47.073914909,epoch,0.022695,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127109908,epoch,0.024832,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
750,2024-05-14 22:07:32.163448953,2024-05-14 22:07:32.726762199,is,0.563313,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),750
800,2024-05-14 22:07:39.834799909,2024-05-14 22:07:40.376342916,is,0.541543,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),800
850,2024-05-14 22:07:47.708835029,2024-05-14 22:07:48.244465970,is,0.535631,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),850
900,2024-05-14 22:07:55.577649020,2024-05-14 22:07:56.103894853,is,0.526246,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),900


In [7]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_global_epochs_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:05:42.724405193,2024-05-14 22:05:46.912676954,epoch,4.188272,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976365947,epoch,0.032219,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 22:05:47.002261781,2024-05-14 22:05:47.023092174,epoch,0.020830,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 22:05:47.051220083,2024-05-14 22:05:47.073914909,epoch,0.022695,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127109908,epoch,0.024832,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
800,2024-05-14 22:07:41.188825130,2024-05-14 22:07:41.645806074,is,0.456981,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),800
850,2024-05-14 22:07:48.711030960,2024-05-14 22:07:49.201648235,is,0.490617,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),850
900,2024-05-14 22:07:55.993383884,2024-05-14 22:07:56.472887993,is,0.479504,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),900
950,2024-05-14 22:08:03.734187126,2024-05-14 22:08:04.175956964,is,0.441770,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),950


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,time_elapsed.swap_send,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,end.apply_gradients,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients
0,0,2024-05-14 22:05:42.724405193,2024-05-14 22:05:46.912676954,2024-05-14 22:05:42.724405193,2024-05-14 22:05:43.144715928,0.0,1.472082,2.691529,2024-05-14 22:05:43.144715928,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
1,1,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976365947,2024-05-14 22:05:46.944147014,2024-05-14 22:05:46.976363801,1.0,1.784279,2.096194,2024-05-14 22:05:46.976363801,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
2,2,2024-05-14 22:05:47.002261781,2024-05-14 22:05:47.023092174,2024-05-14 22:05:47.002262973,2024-05-14 22:05:47.023089074,2.0,1.396755,2.365873,2024-05-14 22:05:47.023090028,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,3,2024-05-14 22:05:47.051220083,2024-05-14 22:05:47.073914909,2024-05-14 22:05:47.051221036,2024-05-14 22:05:47.073912048,3.0,1.701418,2.404415,2024-05-14 22:05:47.073913001,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,4,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127109908,2024-05-14 22:05:47.102277898,2024-05-14 22:05:47.127107047,4.0,1.705129,2.500046,2024-05-14 22:05:47.127107047,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:08:10.970428944,2024-05-14 22:08:11.033056974,2024-05-14 22:08:10.970428944,2024-05-14 22:08:11.033056020,,,,NaT,NaT,...,,2024-05-14 22:08:10.971832037,2024-05-14 22:08:11.019113064,2024-05-14 22:08:11.019114017,2024-05-14 22:08:11.030361176,2024-05-14 22:08:11.030448914,2024-05-14 22:08:11.033056020,0.047281,0.011247,0.002607
996,996,2024-05-14 22:08:11.048902988,2024-05-14 22:08:11.109341860,2024-05-14 22:08:11.048902988,2024-05-14 22:08:11.109338999,,,,NaT,NaT,...,,2024-05-14 22:08:11.050730944,2024-05-14 22:08:11.095983028,2024-05-14 22:08:11.095984221,2024-05-14 22:08:11.105937004,2024-05-14 22:08:11.106181860,2024-05-14 22:08:11.109338045,0.045252,0.009953,0.003156
997,997,2024-05-14 22:08:11.126417875,2024-05-14 22:08:11.185451031,2024-05-14 22:08:11.126417875,2024-05-14 22:08:11.185450077,,,,NaT,NaT,...,,2024-05-14 22:08:11.127881050,2024-05-14 22:08:11.170811892,2024-05-14 22:08:11.170814037,2024-05-14 22:08:11.182792902,2024-05-14 22:08:11.182883978,2024-05-14 22:08:11.185448885,0.042931,0.011979,0.002565
998,998,2024-05-14 22:08:11.201513052,2024-05-14 22:08:11.260336876,2024-05-14 22:08:11.201513052,2024-05-14 22:08:11.260336161,,,,NaT,NaT,...,,2024-05-14 22:08:11.203176022,2024-05-14 22:08:11.247613907,2024-05-14 22:08:11.247614861,2024-05-14 22:08:11.257567883,2024-05-14 22:08:11.257663012,2024-05-14 22:08:11.260336161,0.044438,0.009953,0.002673


In [8]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [9]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [16]:
mean_time_elapsed = workers_events_df[["event", "time_elapsed"]].groupby("event").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="event", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="event", title="Mean time elapsed", template="plotly_white").show()

In [10]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
    template="plotly_white",
)

timeline.show()