In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [2]:
logs_path = Path("logs")

# Standalone

In [3]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_global_epochs_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_global_epochs_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_global_epochs_df = pd.concat(workers_global_epochs_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_global_epochs_df)
print(workers_events_pairs)
workers_global_epochs_df = compute_time_elapsed(workers_global_epochs_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_global_epochs_df)

[('start.epoch', 'end.epoch'), ('start.train', 'end.train'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 16:43:29.831662178,2024-05-14 16:43:30.405434132,epoch,0.573772,mdgan.5.cifar.worker.1.logs.json,epoch (logs/mdgan.5.cifar.worker.1.logs.json),0
1,2024-05-14 16:43:30.408802986,2024-05-14 16:43:33.759947062,epoch,3.351144,mdgan.5.cifar.worker.1.logs.json,epoch (logs/mdgan.5.cifar.worker.1.logs.json),1
2,2024-05-14 16:43:33.760489941,2024-05-14 16:43:33.869878054,epoch,0.109388,mdgan.5.cifar.worker.1.logs.json,epoch (logs/mdgan.5.cifar.worker.1.logs.json),2
3,2024-05-14 16:43:33.870329142,2024-05-14 16:43:33.956831932,epoch,0.086503,mdgan.5.cifar.worker.1.logs.json,epoch (logs/mdgan.5.cifar.worker.1.logs.json),3
4,2024-05-14 16:43:33.957221985,2024-05-14 16:43:34.048259020,epoch,0.091037,mdgan.5.cifar.worker.1.logs.json,epoch (logs/mdgan.5.cifar.worker.1.logs.json),4
...,...,...,...,...,...,...,...
22,2024-05-14 16:43:35.660019875,2024-05-14 16:43:35.668193102,send,0.008173,mdgan.5.cifar.worker.5.logs.json,send (logs/mdgan.5.cifar.worker.5.logs.json),22
23,2024-05-14 16:43:35.750016928,2024-05-14 16:43:35.759333134,send,0.009316,mdgan.5.cifar.worker.5.logs.json,send (logs/mdgan.5.cifar.worker.5.logs.json),23
24,2024-05-14 16:43:35.842810154,2024-05-14 16:43:35.850170851,send,0.007361,mdgan.5.cifar.worker.5.logs.json,send (logs/mdgan.5.cifar.worker.5.logs.json),24
25,2024-05-14 16:43:35.932924986,2024-05-14 16:43:35.938212156,send,0.005287,mdgan.5.cifar.worker.5.logs.json,send (logs/mdgan.5.cifar.worker.5.logs.json),25


Unnamed: 0,epoch,start.epoch,end.epoch,start.train,end.train,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv,...,dataset,worker,log,world_size,time_elapsed.epoch,time_elapsed.train,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-14 16:43:29.831662178,2024-05-14 16:43:30.405434132,2024-05-14 16:43:29.961426020,2024-05-14 16:43:30.385761976,2024-05-14 16:43:29.839945078,2024-05-14 16:43:29.961426020,2024-05-14 16:43:30.385761976,2024-05-14 16:43:30.405089140,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.1.logs.json,5,0.573772,0.424336,0.121481,0.019327,,
1,1,2024-05-14 16:43:30.408802986,2024-05-14 16:43:33.759947062,2024-05-14 16:43:33.702067852,2024-05-14 16:43:33.747936964,2024-05-14 16:43:30.411409855,2024-05-14 16:43:33.702066898,2024-05-14 16:43:33.747938156,2024-05-14 16:43:33.759933949,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.1.logs.json,5,3.351144,0.045869,3.290657,0.011996,,
2,2,2024-05-14 16:43:33.760489941,2024-05-14 16:43:33.869878054,2024-05-14 16:43:33.837297916,2024-05-14 16:43:33.860669851,2024-05-14 16:43:33.762871027,2024-05-14 16:43:33.837296963,2024-05-14 16:43:33.860669851,2024-05-14 16:43:33.869861841,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.1.logs.json,5,0.109388,0.023372,0.074426,0.009192,,
3,3,2024-05-14 16:43:33.870329142,2024-05-14 16:43:33.956831932,2024-05-14 16:43:33.925778866,2024-05-14 16:43:33.949967146,2024-05-14 16:43:33.872330904,2024-05-14 16:43:33.925778866,2024-05-14 16:43:33.949967146,2024-05-14 16:43:33.956817865,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.1.logs.json,5,0.086503,0.024188,0.053448,0.006851,,
4,4,2024-05-14 16:43:33.957221985,2024-05-14 16:43:34.048259020,2024-05-14 16:43:34.014600039,2024-05-14 16:43:34.039219856,2024-05-14 16:43:33.958865881,2024-05-14 16:43:34.014600039,2024-05-14 16:43:34.039221048,2024-05-14 16:43:34.048248053,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.1.logs.json,5,0.091037,0.024620,0.055734,0.009027,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,22,2024-05-14 16:43:35.582704782,2024-05-14 16:43:35.668207169,2024-05-14 16:43:35.640265942,2024-05-14 16:43:35.660019875,2024-05-14 16:43:35.584672928,2024-05-14 16:43:35.640264988,2024-05-14 16:43:35.660019875,2024-05-14 16:43:35.668193102,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.5.logs.json,5,0.085502,0.019754,0.055592,0.008173,,
23,23,2024-05-14 16:43:35.669106007,2024-05-14 16:43:35.759344816,2024-05-14 16:43:35.728389025,2024-05-14 16:43:35.750016928,2024-05-14 16:43:35.670753956,2024-05-14 16:43:35.728388071,2024-05-14 16:43:35.750016928,2024-05-14 16:43:35.759333134,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.5.logs.json,5,0.090239,0.021628,0.057634,0.009316,,
24,24,2024-05-14 16:43:35.759938955,2024-05-14 16:43:35.850188017,2024-05-14 16:43:35.818846941,2024-05-14 16:43:35.842809200,2024-05-14 16:43:35.761593103,2024-05-14 16:43:35.818845987,2024-05-14 16:43:35.842810154,2024-05-14 16:43:35.850170851,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.5.logs.json,5,0.090249,0.023962,0.057253,0.007361,,
25,25,2024-05-14 16:43:35.851063967,2024-05-14 16:43:35.938224792,2024-05-14 16:43:35.909811974,2024-05-14 16:43:35.932924986,2024-05-14 16:43:35.852933168,2024-05-14 16:43:35.909811020,2024-05-14 16:43:35.932924986,2024-05-14 16:43:35.938212156,NaT,...,worker,logs,logs/mdgan.5.cifar.worker.5.logs.json,5,0.087161,0.023113,0.056878,0.005287,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.logging', 'end.logging'), ('start.send_data', 'end.send_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 16:43:29.834114075,2024-05-14 16:43:33.670010090,epoch,3.835896,mdgan.5.cifar.server.logs.json,epoch (logs/mdgan.5.cifar.server.logs.json),0
1,2024-05-14 16:43:33.670328140,2024-05-14 16:43:33.791720867,epoch,0.121393,mdgan.5.cifar.server.logs.json,epoch (logs/mdgan.5.cifar.server.logs.json),1
2,2024-05-14 16:43:33.792194128,2024-05-14 16:43:33.883865118,epoch,0.091671,mdgan.5.cifar.server.logs.json,epoch (logs/mdgan.5.cifar.server.logs.json),2
3,2024-05-14 16:43:33.884233952,2024-05-14 16:43:33.973150969,epoch,0.088917,mdgan.5.cifar.server.logs.json,epoch (logs/mdgan.5.cifar.server.logs.json),3
4,2024-05-14 16:43:33.973498106,2024-05-14 16:43:34.062836884,epoch,0.089339,mdgan.5.cifar.server.logs.json,epoch (logs/mdgan.5.cifar.server.logs.json),4
...,...,...,...,...,...,...,...
23,2024-05-14 16:43:35.770938158,2024-05-14 16:43:35.773657084,apply_gradients,0.002719,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,23
24,2024-05-14 16:43:35.863605976,2024-05-14 16:43:35.866931915,apply_gradients,0.003326,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,24
25,2024-05-14 16:43:35.952594995,2024-05-14 16:43:35.956067085,apply_gradients,0.003472,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,25
0,2024-05-14 16:43:31.250539063,2024-05-14 16:43:33.650402069,fid,2.399863,mdgan.5.cifar.server.logs.json,fid (logs/mdgan.5.cifar.server.logs.json),0


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.logging,end.logging,start.send_data,end.send_data,start.calc_gradients,...,world_size,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.logging,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 16:43:29.834114075,2024-05-14 16:43:33.670010090,2024-05-14 16:43:29.834114075,2024-05-14 16:43:30.717369080,2024-05-14 16:43:30.717370033,2024-05-14 16:43:33.670009136,2024-05-14 16:43:29.940656900,2024-05-14 16:43:30.467525005,2024-05-14 16:43:30.467533112,...,5,logs/mdgan.5.cifar.server.logs.json,3.835896,0.883255,2.952639,0.526868,0.139373,0.110038,2.399863,0.524318
1,1,2024-05-14 16:43:33.670328140,2024-05-14 16:43:33.791720867,2024-05-14 16:43:33.670328140,2024-05-14 16:43:33.791719913,NaT,NaT,2024-05-14 16:43:33.673645020,2024-05-14 16:43:33.768641949,2024-05-14 16:43:33.768644094,...,5,logs/mdgan.5.cifar.server.logs.json,0.121393,0.121392,,0.094997,0.011996,0.01098,,
2,2,2024-05-14 16:43:33.792194128,2024-05-14 16:43:33.883865118,2024-05-14 16:43:33.792194128,2024-05-14 16:43:33.883864164,NaT,NaT,2024-05-14 16:43:33.793858051,2024-05-14 16:43:33.870413065,2024-05-14 16:43:33.870414019,...,5,logs/mdgan.5.cifar.server.logs.json,0.091671,0.09167,,0.076555,0.010483,0.002842,,
3,3,2024-05-14 16:43:33.884233952,2024-05-14 16:43:33.973150969,2024-05-14 16:43:33.884234905,2024-05-14 16:43:33.973150969,NaT,NaT,2024-05-14 16:43:33.885493040,2024-05-14 16:43:33.959959984,2024-05-14 16:43:33.959961176,...,5,logs/mdgan.5.cifar.server.logs.json,0.088917,0.088916,,0.074467,0.010445,0.002628,,
4,4,2024-05-14 16:43:33.973498106,2024-05-14 16:43:34.062836884,2024-05-14 16:43:33.973498106,2024-05-14 16:43:34.062835932,NaT,NaT,2024-05-14 16:43:33.974755049,2024-05-14 16:43:34.048677206,2024-05-14 16:43:34.048677921,...,5,logs/mdgan.5.cifar.server.logs.json,0.089339,0.089338,,0.073922,0.011235,0.002824,,
5,5,2024-05-14 16:43:34.064641953,2024-05-14 16:43:34.149254799,2024-05-14 16:43:34.064641953,2024-05-14 16:43:34.149254799,NaT,NaT,2024-05-14 16:43:34.065973042,2024-05-14 16:43:34.136707067,2024-05-14 16:43:34.136708021,...,5,logs/mdgan.5.cifar.server.logs.json,0.084613,0.084613,,0.070734,0.0098,0.002653,,
6,6,2024-05-14 16:43:34.149707079,2024-05-14 16:43:34.238437891,2024-05-14 16:43:34.149707079,2024-05-14 16:43:34.238437176,NaT,NaT,2024-05-14 16:43:34.150986910,2024-05-14 16:43:34.225231886,2024-05-14 16:43:34.225234032,...,5,logs/mdgan.5.cifar.server.logs.json,0.088731,0.08873,,0.074245,0.010394,0.002703,,
7,7,2024-05-14 16:43:34.238864899,2024-05-14 16:43:34.350847960,2024-05-14 16:43:34.238864899,2024-05-14 16:43:34.350847960,NaT,NaT,2024-05-14 16:43:34.240105867,2024-05-14 16:43:34.334059954,2024-05-14 16:43:34.334062099,...,5,logs/mdgan.5.cifar.server.logs.json,0.111983,0.111983,,0.093954,0.013758,0.002897,,
8,8,2024-05-14 16:43:34.351454973,2024-05-14 16:43:34.448472977,2024-05-14 16:43:34.351454973,2024-05-14 16:43:34.448472023,NaT,NaT,2024-05-14 16:43:34.352846146,2024-05-14 16:43:34.432783127,2024-05-14 16:43:34.432783842,...,5,logs/mdgan.5.cifar.server.logs.json,0.097018,0.097017,,0.079937,0.012486,0.003111,,
9,9,2024-05-14 16:43:34.449048996,2024-05-14 16:43:34.539127111,2024-05-14 16:43:34.449050188,2024-05-14 16:43:34.539127111,NaT,NaT,2024-05-14 16:43:34.450953007,2024-05-14 16:43:34.526329041,2024-05-14 16:43:34.526330948,...,5,logs/mdgan.5.cifar.server.logs.json,0.090078,0.090077,,0.075376,0.010002,0.002701,,


In [6]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))
logs_standalones

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: 0 days 00:54:13.300756932
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is'), ('start.logging', 'end.logging')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,start.logging,end.logging,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is,time_elapsed.logging
0,0,2024-05-14 16:43:29.834113143,2024-05-14 16:43:34.106158040,2024-05-14 16:43:29.834114097,2024-05-14 16:43:30.294969104,0,1.451478,20.996281,2024-05-14 16:43:30.294969104,NaT,...,2024-05-14 16:43:30.314437889,2024-05-14 16:43:34.106157087,cifar,cifar.standalone.logs.json,4.272045,0.460855,,3.247869,0.535638,3.791719
1,1,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182240985,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182239078,1,6.4014,25.901831,2024-05-14 16:43:34.182239078,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.04108,0.041078,,,,
2,2,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245211862,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245209955,2,2.631938,19.150515,2024-05-14 16:43:34.245210908,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.031666,0.031664,,,,
3,3,2024-05-14 16:43:34.282121919,2024-05-14 16:43:34.328813098,2024-05-14 16:43:34.282123111,2024-05-14 16:43:34.328810953,3,0.904639,13.880597,2024-05-14 16:43:34.328810953,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.046691,0.046688,,,,
4,4,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397956155,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397954009,4,0.113801,9.887267,2024-05-14 16:43:34.397955202,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.038423,0.038421,,,,
5,5,2024-05-14 16:43:34.426778816,2024-05-14 16:43:34.457152151,2024-05-14 16:43:34.426780008,2024-05-14 16:43:34.457150005,5,22.038605,16.248434,2024-05-14 16:43:34.457150958,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.030373,0.03037,,,,
6,6,2024-05-14 16:43:34.486900114,2024-05-14 16:43:34.520264886,2024-05-14 16:43:34.486900114,2024-05-14 16:43:34.520262979,6,0.621897,15.243013,2024-05-14 16:43:34.520262979,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.033365,0.033363,,,,
7,7,2024-05-14 16:43:34.553753160,2024-05-14 16:43:34.599918865,2024-05-14 16:43:34.553753160,2024-05-14 16:43:34.599916004,7,0.541735,5.127599,2024-05-14 16:43:34.599917196,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.046166,0.046163,,,,
8,8,2024-05-14 16:43:34.630193971,2024-05-14 16:43:34.666256927,2024-05-14 16:43:34.630193971,2024-05-14 16:43:34.666254781,8,49.022472,0.22374,2024-05-14 16:43:34.666254781,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.036063,0.036061,,,,
9,9,2024-05-14 16:43:34.698748849,2024-05-14 16:43:34.732837938,2024-05-14 16:43:34.698750041,2024-05-14 16:43:34.732835077,9,6.181591,12.467178,2024-05-14 16:43:34.732835792,NaT,...,NaT,NaT,cifar,cifar.standalone.logs.json,0.034089,0.034085,,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 16:43:29.834113143,2024-05-14 16:43:34.106158040,epoch,4.272045,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182240985,epoch,0.041080,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245211862,epoch,0.031666,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 16:43:34.282121919,2024-05-14 16:43:34.328813098,epoch,0.046691,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397956155,epoch,0.038423,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
30,2024-05-14 16:43:47.226526998,2024-05-14 16:43:47.766216062,is,0.539689,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),30
0,2024-05-14 16:43:30.314437889,2024-05-14 16:43:34.106157087,logging,3.791719,cifar.standalone.logs.json,logging (logs/cifar.standalone.logs.json),0
10,2024-05-14 16:43:34.809028886,2024-05-14 16:43:38.952338956,logging,4.143310,cifar.standalone.logs.json,logging (logs/cifar.standalone.logs.json),10
20,2024-05-14 16:43:39.595707916,2024-05-14 16:43:43.314371131,logging,3.718663,cifar.standalone.logs.json,logging (logs/cifar.standalone.logs.json),20


In [7]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_global_epochs_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 16:43:29.834113143,2024-05-14 16:43:34.106158040,epoch,4.272045,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182240985,epoch,0.041080,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245211862,epoch,0.031666,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 16:43:34.282121919,2024-05-14 16:43:34.328813098,epoch,0.046691,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397956155,epoch,0.038423,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
23,2024-05-14 16:43:35.770938158,2024-05-14 16:43:35.773657084,apply_gradients,0.002719,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,23
24,2024-05-14 16:43:35.863605976,2024-05-14 16:43:35.866931915,apply_gradients,0.003326,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,24
25,2024-05-14 16:43:35.952594995,2024-05-14 16:43:35.956067085,apply_gradients,0.003472,mdgan.5.cifar.server.logs.json,apply_gradients (logs/mdgan.5.cifar.server.log...,25
0,2024-05-14 16:43:31.250539063,2024-05-14 16:43:33.650402069,fid,2.399863,mdgan.5.cifar.server.logs.json,fid (logs/mdgan.5.cifar.server.logs.json),0


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,time_elapsed.swap_send,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,end.apply_gradients,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients
0,0,2024-05-14 16:43:29.834113143,2024-05-14 16:43:34.106158040,2024-05-14 16:43:29.834114097,2024-05-14 16:43:30.294969104,0.0,1.451478,20.996281,2024-05-14 16:43:30.294969104,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
1,1,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182240985,2024-05-14 16:43:34.141160987,2024-05-14 16:43:34.182239078,1.0,6.401400,25.901831,2024-05-14 16:43:34.182239078,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
2,2,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245211862,2024-05-14 16:43:34.213546060,2024-05-14 16:43:34.245209955,2.0,2.631938,19.150515,2024-05-14 16:43:34.245210908,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,3,2024-05-14 16:43:34.282121919,2024-05-14 16:43:34.328813098,2024-05-14 16:43:34.282123111,2024-05-14 16:43:34.328810953,3.0,0.904639,13.880597,2024-05-14 16:43:34.328810953,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,4,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397956155,2024-05-14 16:43:34.359533094,2024-05-14 16:43:34.397954009,4.0,0.113801,9.887267,2024-05-14 16:43:34.397955202,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,21,2024-05-14 16:43:35.507329940,2024-05-14 16:43:35.595337868,2024-05-14 16:43:35.507329940,2024-05-14 16:43:35.595336914,,,,NaT,NaT,...,,2024-05-14 16:43:35.508547068,2024-05-14 16:43:35.582589865,2024-05-14 16:43:35.582591057,2024-05-14 16:43:35.592504978,2024-05-14 16:43:35.592607021,2024-05-14 16:43:35.595336914,0.074043,0.009914,0.002730
22,22,2024-05-14 16:43:35.596124887,2024-05-14 16:43:35.684433937,2024-05-14 16:43:35.596124887,2024-05-14 16:43:35.684432983,,,,NaT,NaT,...,,2024-05-14 16:43:35.597498894,2024-05-14 16:43:35.671704054,2024-05-14 16:43:35.671705008,2024-05-14 16:43:35.681652069,2024-05-14 16:43:35.681797028,2024-05-14 16:43:35.684432983,0.074205,0.009947,0.002636
23,23,2024-05-14 16:43:35.685142994,2024-05-14 16:43:35.773658037,2024-05-14 16:43:35.685142994,2024-05-14 16:43:35.773657084,,,,NaT,NaT,...,,2024-05-14 16:43:35.686351061,2024-05-14 16:43:35.759729862,2024-05-14 16:43:35.759731054,2024-05-14 16:43:35.770834923,2024-05-14 16:43:35.770938158,2024-05-14 16:43:35.773657084,0.073379,0.011104,0.002719
24,24,2024-05-14 16:43:35.774384022,2024-05-14 16:43:35.866933107,2024-05-14 16:43:35.774384022,2024-05-14 16:43:35.866931915,,,,NaT,NaT,...,,2024-05-14 16:43:35.775619030,2024-05-14 16:43:35.850618124,2024-05-14 16:43:35.850618839,2024-05-14 16:43:35.863472939,2024-05-14 16:43:35.863605976,2024-05-14 16:43:35.866931915,0.074999,0.012854,0.003326


In [8]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [9]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [10]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
)

timeline.show()