In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [2]:
logs_path = Path("logs")

# Standalone

In [3]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_global_epochs_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_global_epochs_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_global_epochs_df = pd.concat(workers_global_epochs_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_global_epochs_df)
print(workers_events_pairs)
workers_global_epochs_df = compute_time_elapsed(workers_global_epochs_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_global_epochs_df)

[('start.epoch', 'end.epoch'), ('start.calc_gradients', 'end.calc_gradients'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:23:06.433415890,2024-05-14 22:23:07.045869112,epoch,0.612453,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),0
1,2024-05-14 22:23:07.048237085,2024-05-14 22:23:10.693565845,epoch,3.645329,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),1
2,2024-05-14 22:23:10.694122076,2024-05-14 22:23:10.769629002,epoch,0.075507,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),2
3,2024-05-14 22:23:10.769990921,2024-05-14 22:23:10.845792055,epoch,0.075801,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),3
4,2024-05-14 22:23:10.846146107,2024-05-14 22:23:10.917882204,epoch,0.071736,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),4
...,...,...,...,...,...,...,...
995,2024-05-14 22:25:36.362427950,2024-05-14 22:25:36.371290207,send,0.008862,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),995
996,2024-05-14 22:25:36.446734190,2024-05-14 22:25:36.454536915,send,0.007803,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),996
997,2024-05-14 22:25:36.525388002,2024-05-14 22:25:36.533595799,send,0.008208,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),997
998,2024-05-14 22:25:36.599870920,2024-05-14 22:25:36.607350111,send,0.007479,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),998


Unnamed: 0,epoch,start.epoch,end.epoch,start.calc_gradients,end.calc_gradients,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv,...,dataset,worker,log,world_size,time_elapsed.epoch,time_elapsed.calc_gradients,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-14 22:23:06.433415890,2024-05-14 22:23:07.045869112,2024-05-14 22:23:06.612890959,2024-05-14 22:23:07.020805120,2024-05-14 22:23:06.438347101,2024-05-14 22:23:06.612890005,2024-05-14 22:23:07.020805120,2024-05-14 22:23:07.045867920,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.612453,0.407914,0.174543,0.025063,,
1,1,2024-05-14 22:23:07.048237085,2024-05-14 22:23:10.693565845,2024-05-14 22:23:10.645371914,2024-05-14 22:23:10.681859016,2024-05-14 22:23:07.051372051,2024-05-14 22:23:10.645371914,2024-05-14 22:23:10.681859016,2024-05-14 22:23:10.693549156,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,3.645329,0.036487,3.594000,0.011690,,
2,2,2024-05-14 22:23:10.694122076,2024-05-14 22:23:10.769629002,2024-05-14 22:23:10.747546196,2024-05-14 22:23:10.763655901,2024-05-14 22:23:10.696921110,2024-05-14 22:23:10.747546196,2024-05-14 22:23:10.763656855,2024-05-14 22:23:10.769629002,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.075507,0.016110,0.050625,0.005972,,
3,3,2024-05-14 22:23:10.769990921,2024-05-14 22:23:10.845792055,2024-05-14 22:23:10.820583820,2024-05-14 22:23:10.836618900,2024-05-14 22:23:10.771914005,2024-05-14 22:23:10.820583820,2024-05-14 22:23:10.836618900,2024-05-14 22:23:10.845792055,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.075801,0.016035,0.048670,0.009173,,
4,4,2024-05-14 22:23:10.846146107,2024-05-14 22:23:10.917882204,2024-05-14 22:23:10.893097878,2024-05-14 22:23:10.909718990,2024-05-14 22:23:10.847806931,2024-05-14 22:23:10.893097878,2024-05-14 22:23:10.909720182,2024-05-14 22:23:10.917881012,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.071736,0.016621,0.045291,0.008161,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:25:36.311732054,2024-05-14 22:25:36.371290922,2024-05-14 22:25:36.346503019,2024-05-14 22:25:36.362427950,2024-05-14 22:25:36.313668013,2024-05-14 22:25:36.346503019,2024-05-14 22:25:36.362427950,2024-05-14 22:25:36.371290207,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.059559,0.015925,0.032835,0.008862,,
996,996,2024-05-14 22:25:36.387012005,2024-05-14 22:25:36.454537868,2024-05-14 22:25:36.429512024,2024-05-14 22:25:36.446734190,2024-05-14 22:25:36.388687849,2024-05-14 22:25:36.429510832,2024-05-14 22:25:36.446734190,2024-05-14 22:25:36.454536915,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.067526,0.017222,0.040823,0.007803,,
997,997,2024-05-14 22:25:36.470149994,2024-05-14 22:25:36.533596992,2024-05-14 22:25:36.509669781,2024-05-14 22:25:36.525387049,2024-05-14 22:25:36.473182917,2024-05-14 22:25:36.509669065,2024-05-14 22:25:36.525388002,2024-05-14 22:25:36.533595799,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.063447,0.015717,0.036486,0.008208,,
998,998,2024-05-14 22:25:36.548713923,2024-05-14 22:25:36.607351065,2024-05-14 22:25:36.584058046,2024-05-14 22:25:36.599869967,2024-05-14 22:25:36.551215172,2024-05-14 22:25:36.584057093,2024-05-14 22:25:36.599870920,2024-05-14 22:25:36.607350111,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.058637,0.015812,0.032842,0.007479,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.send_data', 'end.send_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.generate_data', 'end.generate_data'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:23:06.430808067,2024-05-14 22:23:10.612509966,epoch,4.181702,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),0
1,2024-05-14 22:23:10.614876986,2024-05-14 22:23:10.720007896,epoch,0.105131,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),1
2,2024-05-14 22:23:10.720438957,2024-05-14 22:23:10.785164118,epoch,0.064725,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),2
3,2024-05-14 22:23:10.785537958,2024-05-14 22:23:10.857501984,epoch,0.071964,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),3
4,2024-05-14 22:23:10.857892036,2024-05-14 22:23:10.930742025,epoch,0.072850,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),4
...,...,...,...,...,...,...,...
800,2024-05-14 22:25:06.227975845,2024-05-14 22:25:06.742562056,is,0.514586,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),800
850,2024-05-14 22:25:13.789529085,2024-05-14 22:25:14.289168835,is,0.499640,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),850
900,2024-05-14 22:25:21.208876133,2024-05-14 22:25:21.698199987,is,0.489324,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),900
950,2024-05-14 22:25:29.020815134,2024-05-14 22:25:29.488636971,is,0.467822,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),950


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,...,world_size,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.generate_data,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 22:23:06.430808067,2024-05-14 22:23:10.612509966,2024-05-14 22:23:06.430808067,2024-05-14 22:23:07.312996149,2024-05-14 22:23:06.568664074,2024-05-14 22:23:07.047863960,2024-05-14 22:23:07.047866821,2024-05-14 22:23:07.190335035,2024-05-14 22:23:07.190530062,...,4,logs/mdgan.4.cifar.server.logs.json,4.181702,0.882188,0.479200,0.142468,0.122465,0.137854,2.812583,0.450243
1,1,2024-05-14 22:23:10.614876986,2024-05-14 22:23:10.720007896,2024-05-14 22:23:10.614876986,2024-05-14 22:23:10.720006943,2024-05-14 22:23:10.618515015,2024-05-14 22:23:10.696085930,2024-05-14 22:23:10.696089029,2024-05-14 22:23:10.710766077,2024-05-14 22:23:10.710970163,...,4,logs/mdgan.4.cifar.server.logs.json,0.105131,0.105130,0.077571,0.014677,0.009036,0.003633,,
2,2,2024-05-14 22:23:10.720438957,2024-05-14 22:23:10.785164118,2024-05-14 22:23:10.720438957,2024-05-14 22:23:10.785162926,2024-05-14 22:23:10.723958969,2024-05-14 22:23:10.773600101,2024-05-14 22:23:10.773600101,2024-05-14 22:23:10.782086849,2024-05-14 22:23:10.782187939,...,4,logs/mdgan.4.cifar.server.logs.json,0.064725,0.064724,0.049641,0.008487,0.002975,0.003517,,
3,3,2024-05-14 22:23:10.785537958,2024-05-14 22:23:10.857501984,2024-05-14 22:23:10.785537958,2024-05-14 22:23:10.857501030,2024-05-14 22:23:10.787040949,2024-05-14 22:23:10.846202135,2024-05-14 22:23:10.846202135,2024-05-14 22:23:10.854683876,2024-05-14 22:23:10.854773045,...,4,logs/mdgan.4.cifar.server.logs.json,0.071964,0.071963,0.059161,0.008482,0.002728,0.001502,,
4,4,2024-05-14 22:23:10.857892036,2024-05-14 22:23:10.930742025,2024-05-14 22:23:10.857892036,2024-05-14 22:23:10.930741072,2024-05-14 22:23:10.859188080,2024-05-14 22:23:10.919283867,2024-05-14 22:23:10.919283867,2024-05-14 22:23:10.927673101,2024-05-14 22:23:10.927762032,...,4,logs/mdgan.4.cifar.server.logs.json,0.072850,0.072849,0.060096,0.008389,0.002978,0.001295,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:25:36.325434208,2024-05-14 22:25:36.385940075,2024-05-14 22:25:36.325434208,2024-05-14 22:25:36.385938883,2024-05-14 22:25:36.326800823,2024-05-14 22:25:36.372295856,2024-05-14 22:25:36.372298002,2024-05-14 22:25:36.383288145,2024-05-14 22:25:36.383383036,...,4,logs/mdgan.4.cifar.server.logs.json,0.060506,0.060505,0.045495,0.010990,0.002556,0.001366,,
996,996,2024-05-14 22:25:36.403840065,2024-05-14 22:25:36.471863031,2024-05-14 22:25:36.403840065,2024-05-14 22:25:36.471863031,2024-05-14 22:25:36.405632973,2024-05-14 22:25:36.456437111,2024-05-14 22:25:36.456438065,2024-05-14 22:25:36.469218016,2024-05-14 22:25:36.469312906,...,4,logs/mdgan.4.cifar.server.logs.json,0.068023,0.068023,0.050804,0.012780,0.002549,0.001791,,
997,997,2024-05-14 22:25:36.489768982,2024-05-14 22:25:36.546022892,2024-05-14 22:25:36.489768982,2024-05-14 22:25:36.546021938,2024-05-14 22:25:36.491616011,2024-05-14 22:25:36.534114121,2024-05-14 22:25:36.534116029,2024-05-14 22:25:36.542439938,2024-05-14 22:25:36.542537928,...,4,logs/mdgan.4.cifar.server.logs.json,0.056254,0.056253,0.042498,0.008324,0.003483,0.001844,,
998,998,2024-05-14 22:25:36.564143181,2024-05-14 22:25:36.623219013,2024-05-14 22:25:36.564143181,2024-05-14 22:25:36.623217821,2024-05-14 22:25:36.565649986,2024-05-14 22:25:36.609875917,2024-05-14 22:25:36.609877110,2024-05-14 22:25:36.620476961,2024-05-14 22:25:36.620581865,...,4,logs/mdgan.4.cifar.server.logs.json,0.059076,0.059075,0.044226,0.010600,0.002635,0.001505,,


In [6]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))
logs_standalones

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: 0 days 00:02:41.260314226
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.is,fid,is,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 22:23:06.430807841,2024-05-14 22:23:11.191315186,2024-05-14 22:23:06.430809033,2024-05-14 22:23:06.963418973,0,1.472082,2.691529,2024-05-14 22:23:06.963418973,NaT,...,2024-05-14 22:23:11.191244853,427.379852,1.097491,cifar,cifar.standalone.logs.json,4.760507,0.532610,,3.541883,0.665565
1,1,2024-05-14 22:23:11.226598990,2024-05-14 22:23:11.281307948,2024-05-14 22:23:11.226600182,2024-05-14 22:23:11.281303895,1,1.784279,2.096194,2024-05-14 22:23:11.281305087,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.054709,0.054704,,,
2,2,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371231091,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371227992,2,1.396755,2.365873,2024-05-14 22:23:11.371227992,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.034941,0.034938,,,
3,3,2024-05-14 22:23:11.420036090,2024-05-14 22:23:11.447411073,2024-05-14 22:23:11.420037043,2024-05-14 22:23:11.447407019,3,1.701418,2.404415,2024-05-14 22:23:11.447407973,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.027375,0.027370,,,
4,4,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505306018,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505303872,4,1.705129,2.500046,2024-05-14 22:23:11.505303872,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.023476,0.023474,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:25:31.883358014,2024-05-14 22:25:31.911677134,2024-05-14 22:25:31.883358014,2024-05-14 22:25:31.911668790,995,0.500290,3.374500,2024-05-14 22:25:31.911669982,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.028319,0.028311,,,
996,996,2024-05-14 22:25:31.955986035,2024-05-14 22:25:31.990014089,2024-05-14 22:25:31.955986989,2024-05-14 22:25:31.990010989,996,0.727101,2.680925,2024-05-14 22:25:31.990010989,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.034028,0.034024,,,
997,997,2024-05-14 22:25:32.031208051,2024-05-14 22:25:32.054815066,2024-05-14 22:25:32.031208051,2024-05-14 22:25:32.054813159,997,1.054406,1.746657,2024-05-14 22:25:32.054813159,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.023607,0.023605,,,
998,998,2024-05-14 22:25:32.106869948,2024-05-14 22:25:32.138340962,2024-05-14 22:25:32.106869948,2024-05-14 22:25:32.138337863,998,1.092340,1.281015,2024-05-14 22:25:32.138337863,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.031471,0.031468,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:23:06.430807841,2024-05-14 22:23:11.191315186,epoch,4.760507,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 22:23:11.226598990,2024-05-14 22:23:11.281307948,epoch,0.054709,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371231091,epoch,0.034941,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 22:23:11.420036090,2024-05-14 22:23:11.447411073,epoch,0.027375,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505306018,epoch,0.023476,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
750,2024-05-14 22:24:57.306677831,2024-05-14 22:24:57.903128875,is,0.596451,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),750
800,2024-05-14 22:25:05.047878993,2024-05-14 22:25:05.636351121,is,0.588472,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),800
850,2024-05-14 22:25:12.784131062,2024-05-14 22:25:13.354925883,is,0.570795,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),850
900,2024-05-14 22:25:20.490826142,2024-05-14 22:25:21.042243016,is,0.551417,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),900


In [7]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_global_epochs_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 22:23:06.430807841,2024-05-14 22:23:11.191315186,epoch,4.760507,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 22:23:11.226598990,2024-05-14 22:23:11.281307948,epoch,0.054709,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371231091,epoch,0.034941,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 22:23:11.420036090,2024-05-14 22:23:11.447411073,epoch,0.027375,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505306018,epoch,0.023476,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
800,2024-05-14 22:25:06.227975845,2024-05-14 22:25:06.742562056,is,0.514586,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),800
850,2024-05-14 22:25:13.789529085,2024-05-14 22:25:14.289168835,is,0.499640,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),850
900,2024-05-14 22:25:21.208876133,2024-05-14 22:25:21.698199987,is,0.489324,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),900
950,2024-05-14 22:25:29.020815134,2024-05-14 22:25:29.488636971,is,0.467822,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),950


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,time_elapsed.swap_send,start.send_data,end.send_data,start.apply_gradients,end.apply_gradients,start.generate_data,end.generate_data,time_elapsed.send_data,time_elapsed.apply_gradients,time_elapsed.generate_data
0,0,2024-05-14 22:23:06.430807841,2024-05-14 22:23:11.191315186,2024-05-14 22:23:06.430809033,2024-05-14 22:23:06.963418973,0.0,1.472082,2.691529,2024-05-14 22:23:06.963418973,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
1,1,2024-05-14 22:23:11.226598990,2024-05-14 22:23:11.281307948,2024-05-14 22:23:11.226600182,2024-05-14 22:23:11.281303895,1.0,1.784279,2.096194,2024-05-14 22:23:11.281305087,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
2,2,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371231091,2024-05-14 22:23:11.336289895,2024-05-14 22:23:11.371227992,2.0,1.396755,2.365873,2024-05-14 22:23:11.371227992,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,3,2024-05-14 22:23:11.420036090,2024-05-14 22:23:11.447411073,2024-05-14 22:23:11.420037043,2024-05-14 22:23:11.447407019,3.0,1.701418,2.404415,2024-05-14 22:23:11.447407973,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,4,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505306018,2024-05-14 22:23:11.481830132,2024-05-14 22:23:11.505303872,4.0,1.705129,2.500046,2024-05-14 22:23:11.505303872,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-14 22:25:36.325434208,2024-05-14 22:25:36.385940075,2024-05-14 22:25:36.325434208,2024-05-14 22:25:36.385938883,,,,NaT,NaT,...,,2024-05-14 22:25:36.326800823,2024-05-14 22:25:36.372295856,2024-05-14 22:25:36.383383036,2024-05-14 22:25:36.385938883,2024-05-14 22:25:36.325434923,2024-05-14 22:25:36.326800823,0.045495,0.002556,0.001366
996,996,2024-05-14 22:25:36.403840065,2024-05-14 22:25:36.471863031,2024-05-14 22:25:36.403840065,2024-05-14 22:25:36.471863031,,,,NaT,NaT,...,,2024-05-14 22:25:36.405632973,2024-05-14 22:25:36.456437111,2024-05-14 22:25:36.469312906,2024-05-14 22:25:36.471862078,2024-05-14 22:25:36.403841019,2024-05-14 22:25:36.405632019,0.050804,0.002549,0.001791
997,997,2024-05-14 22:25:36.489768982,2024-05-14 22:25:36.546022892,2024-05-14 22:25:36.489768982,2024-05-14 22:25:36.546021938,,,,NaT,NaT,...,,2024-05-14 22:25:36.491616011,2024-05-14 22:25:36.534114121,2024-05-14 22:25:36.542537928,2024-05-14 22:25:36.546021223,2024-05-14 22:25:36.489770889,2024-05-14 22:25:36.491615057,0.042498,0.003483,0.001844
998,998,2024-05-14 22:25:36.564143181,2024-05-14 22:25:36.623219013,2024-05-14 22:25:36.564143181,2024-05-14 22:25:36.623217821,,,,NaT,NaT,...,,2024-05-14 22:25:36.565649986,2024-05-14 22:25:36.609875917,2024-05-14 22:25:36.620581865,2024-05-14 22:25:36.623216867,2024-05-14 22:25:36.564144850,2024-05-14 22:25:36.565649986,0.044226,0.002635,0.001505


In [8]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [9]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [10]:
mean_time_elapsed = workers_events_df[["event", "time_elapsed"]].groupby("event").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="event", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="event", title="Mean time elapsed", template="plotly_white").show()

In [11]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
    template="plotly_white",
)

timeline.show()