In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [2]:
logs_path = Path("logs")

# Standalone

In [3]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_global_epochs_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_global_epochs_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_global_epochs_df = pd.concat(workers_global_epochs_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_global_epochs_df)
print(workers_events_pairs)
workers_global_epochs_df = compute_time_elapsed(workers_global_epochs_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_global_epochs_df)

[('start.epoch', 'end.epoch'), ('start.train', 'end.train'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 21:45:05.699313879,2024-05-14 21:45:06.244068146,epoch,0.544754,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),0
1,2024-05-14 21:45:06.244995117,2024-05-14 21:45:09.911029100,epoch,3.666034,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),1
2,2024-05-14 21:45:09.911430836,2024-05-14 21:45:09.989005089,epoch,0.077574,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),2
3,2024-05-14 21:45:09.989393950,2024-05-14 21:45:10.065896034,epoch,0.076502,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),3
4,2024-05-14 21:45:10.066376925,2024-05-14 21:45:10.144906044,epoch,0.078529,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),4
...,...,...,...,...,...,...,...
195,2024-05-14 21:45:35.611844063,2024-05-14 21:45:35.621448040,send,0.009604,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),195
196,2024-05-14 21:45:35.690350056,2024-05-14 21:45:35.699723959,send,0.009374,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),196
197,2024-05-14 21:45:35.770864010,2024-05-14 21:45:35.776813984,send,0.005950,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),197
198,2024-05-14 21:45:35.845504999,2024-05-14 21:45:35.854831219,send,0.009326,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),198


Unnamed: 0,epoch,start.epoch,end.epoch,start.train,end.train,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv,...,dataset,worker,log,world_size,time_elapsed.epoch,time_elapsed.train,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-14 21:45:05.699313879,2024-05-14 21:45:06.244068146,2024-05-14 21:45:05.779621840,2024-05-14 21:45:06.227791071,2024-05-14 21:45:05.702948093,2024-05-14 21:45:05.779621840,2024-05-14 21:45:06.227791071,2024-05-14 21:45:06.243999958,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.544754,0.448169,0.076674,0.016209,,
1,1,2024-05-14 21:45:06.244995117,2024-05-14 21:45:09.911029100,2024-05-14 21:45:09.857824087,2024-05-14 21:45:09.902035952,2024-05-14 21:45:06.247565985,2024-05-14 21:45:09.857822895,2024-05-14 21:45:09.902035952,2024-05-14 21:45:09.911013126,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,3.666034,0.044212,3.610257,0.008977,,
2,2,2024-05-14 21:45:09.911430836,2024-05-14 21:45:09.989005089,2024-05-14 21:45:09.960972071,2024-05-14 21:45:09.984203100,2024-05-14 21:45:09.913383961,2024-05-14 21:45:09.960972071,2024-05-14 21:45:09.984204054,2024-05-14 21:45:09.988994122,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.077574,0.023231,0.047588,0.004790,,
3,3,2024-05-14 21:45:09.989393950,2024-05-14 21:45:10.065896034,2024-05-14 21:45:10.038208961,2024-05-14 21:45:10.057468891,2024-05-14 21:45:09.990911961,2024-05-14 21:45:10.038208961,2024-05-14 21:45:10.057468891,2024-05-14 21:45:10.065866946,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.076502,0.019260,0.047297,0.008398,,
4,4,2024-05-14 21:45:10.066376925,2024-05-14 21:45:10.144906044,2024-05-14 21:45:10.114901066,2024-05-14 21:45:10.134977102,2024-05-14 21:45:10.068259001,2024-05-14 21:45:10.114901066,2024-05-14 21:45:10.134978056,2024-05-14 21:45:10.144794226,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.078529,0.020076,0.046642,0.009816,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,2024-05-14 21:45:35.543554068,2024-05-14 21:45:35.621465206,2024-05-14 21:45:35.591896057,2024-05-14 21:45:35.611844063,2024-05-14 21:45:35.545223951,2024-05-14 21:45:35.591895103,2024-05-14 21:45:35.611844063,2024-05-14 21:45:35.621448040,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.077911,0.019948,0.046671,0.009604,,
196,196,2024-05-14 21:45:35.624826908,2024-05-14 21:45:35.699741840,2024-05-14 21:45:35.671263933,2024-05-14 21:45:35.690350056,2024-05-14 21:45:35.626795053,2024-05-14 21:45:35.671262980,2024-05-14 21:45:35.690350056,2024-05-14 21:45:35.699723959,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.074915,0.019086,0.044468,0.009374,,
197,197,2024-05-14 21:45:35.704278946,2024-05-14 21:45:35.776832104,2024-05-14 21:45:35.748775959,2024-05-14 21:45:35.770864010,2024-05-14 21:45:35.707072020,2024-05-14 21:45:35.748775005,2024-05-14 21:45:35.770864010,2024-05-14 21:45:35.776813984,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.072553,0.022088,0.041703,0.005950,,
198,198,2024-05-14 21:45:35.780995131,2024-05-14 21:45:35.854849100,2024-05-14 21:45:35.826109886,2024-05-14 21:45:35.845504045,2024-05-14 21:45:35.783048868,2024-05-14 21:45:35.826108932,2024-05-14 21:45:35.845504999,2024-05-14 21:45:35.854831219,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.073854,0.019394,0.043060,0.009326,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.send_data', 'end.send_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 21:45:05.695155859,2024-05-14 21:45:09.826490164,epoch,4.131334,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),0
1,2024-05-14 21:45:09.826819181,2024-05-14 21:45:09.937199116,epoch,0.110380,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),1
2,2024-05-14 21:45:09.937690973,2024-05-14 21:45:10.001177073,epoch,0.063486,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),2
3,2024-05-14 21:45:10.001521826,2024-05-14 21:45:10.079141855,epoch,0.077620,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),3
4,2024-05-14 21:45:10.079499960,2024-05-14 21:45:10.158514023,epoch,0.079014,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),4
...,...,...,...,...,...,...,...
0,2024-05-14 21:45:06.488689899,2024-05-14 21:45:06.923895121,is,0.435205,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),0
50,2024-05-14 21:45:13.850691080,2024-05-14 21:45:14.251909971,is,0.401219,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),50
100,2024-05-14 21:45:20.799479961,2024-05-14 21:45:21.162173986,is,0.362694,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),100
150,2024-05-14 21:45:28.157515049,2024-05-14 21:45:28.581203938,is,0.423689,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),150


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,...,dataset,world_size,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 21:45:05.695155859,2024-05-14 21:45:09.826490164,2024-05-14 21:45:05.695155859,2024-05-14 21:45:06.480485916,2024-05-14 21:45:05.772237062,2024-05-14 21:45:06.258042097,2024-05-14 21:45:06.258054972,2024-05-14 21:45:06.374050140,2024-05-14 21:45:06.374284029,...,server,4,logs/mdgan.4.cifar.server.logs.json,4.131334,0.785330,0.485805,0.115995,0.106202,2.881332,0.435205
1,1,2024-05-14 21:45:09.826819181,2024-05-14 21:45:09.937199116,2024-05-14 21:45:09.826819181,2024-05-14 21:45:09.937198162,2024-05-14 21:45:09.830015898,2024-05-14 21:45:09.912538052,2024-05-14 21:45:09.912538767,2024-05-14 21:45:09.922826052,2024-05-14 21:45:09.922934055,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.110380,0.110379,0.082522,0.010287,0.014263,,
2,2,2024-05-14 21:45:09.937690973,2024-05-14 21:45:10.001177073,2024-05-14 21:45:09.937690973,2024-05-14 21:45:10.001176119,2024-05-14 21:45:09.939045191,2024-05-14 21:45:09.990116119,2024-05-14 21:45:09.990117073,2024-05-14 21:45:09.998420954,2024-05-14 21:45:09.998512030,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.063486,0.063485,0.051071,0.008304,0.002664,,
3,3,2024-05-14 21:45:10.001521826,2024-05-14 21:45:10.079141855,2024-05-14 21:45:10.001521826,2024-05-14 21:45:10.079141140,2024-05-14 21:45:10.003229856,2024-05-14 21:45:10.067592859,2024-05-14 21:45:10.067592859,2024-05-14 21:45:10.076255083,2024-05-14 21:45:10.076382875,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.077620,0.077619,0.064363,0.008662,0.002758,,
4,4,2024-05-14 21:45:10.079499960,2024-05-14 21:45:10.158514023,2024-05-14 21:45:10.079499960,2024-05-14 21:45:10.158514023,2024-05-14 21:45:10.080741882,2024-05-14 21:45:10.145472050,2024-05-14 21:45:10.145473003,2024-05-14 21:45:10.155626059,2024-05-14 21:45:10.155745029,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.079014,0.079014,0.064730,0.010153,0.002768,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,2024-05-14 21:45:35.557549000,2024-05-14 21:45:35.633895874,2024-05-14 21:45:35.557549000,2024-05-14 21:45:35.633894920,2024-05-14 21:45:35.558845043,2024-05-14 21:45:35.621862888,2024-05-14 21:45:35.621863842,2024-05-14 21:45:35.630892038,2024-05-14 21:45:35.631006002,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.076347,0.076346,0.063018,0.009028,0.002888,,
196,196,2024-05-14 21:45:35.637325048,2024-05-14 21:45:35.711971998,2024-05-14 21:45:35.637325048,2024-05-14 21:45:35.711971045,2024-05-14 21:45:35.638629198,2024-05-14 21:45:35.700303793,2024-05-14 21:45:35.700304985,2024-05-14 21:45:35.709089041,2024-05-14 21:45:35.709217072,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.074647,0.074646,0.061675,0.008784,0.002753,,
197,197,2024-05-14 21:45:35.715347052,2024-05-14 21:45:35.789960861,2024-05-14 21:45:35.715347052,2024-05-14 21:45:35.789959908,2024-05-14 21:45:35.716833115,2024-05-14 21:45:35.777381182,2024-05-14 21:45:35.777381897,2024-05-14 21:45:35.786885977,2024-05-14 21:45:35.786990166,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.074614,0.074613,0.060548,0.009504,0.002968,,
198,198,2024-05-14 21:45:35.793550014,2024-05-14 21:45:35.867182016,2024-05-14 21:45:35.793550014,2024-05-14 21:45:35.867181063,2024-05-14 21:45:35.795098066,2024-05-14 21:45:35.855318069,2024-05-14 21:45:35.855319023,2024-05-14 21:45:35.864324093,2024-05-14 21:45:35.864474773,...,server,4,logs/mdgan.4.cifar.server.logs.json,0.073632,0.073631,0.060220,0.009005,0.002706,,


In [6]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))
logs_standalones

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: 0 days 00:03:17.087476969
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.is,fid,is,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is
0,0,2024-05-14 21:45:05.695154890,2024-05-14 21:45:10.010076985,2024-05-14 21:45:05.695156082,2024-05-14 21:45:06.204516157,0,1.472082,2.691529,2024-05-14 21:45:06.204516157,NaT,...,2024-05-14 21:45:10.010041937,427.379852,1.097491,cifar,cifar.standalone.logs.json,4.314922,0.509360,,3.234618,0.553048
1,1,2024-05-14 21:45:10.041023001,2024-05-14 21:45:10.076940998,2024-05-14 21:45:10.041023955,2024-05-14 21:45:10.076936945,1,1.784279,2.096194,2024-05-14 21:45:10.076936945,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.035918,0.035913,,,
2,2,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124038920,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124037012,2,1.396755,2.365873,2024-05-14 21:45:10.124037012,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.021186,0.021184,,,
3,3,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174702868,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174701199,3,1.701418,2.404415,2024-05-14 21:45:10.174701199,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.024716,0.024714,,,
4,4,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230028137,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230025038,4,1.705129,2.500046,2024-05-14 21:45:10.230025038,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.026877,0.026874,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,2024-05-14 21:45:33.561598047,2024-05-14 21:45:33.588718875,2024-05-14 21:45:33.561598047,2024-05-14 21:45:33.588716015,195,0.740402,3.083466,2024-05-14 21:45:33.588717206,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.027121,0.027118,,,
196,196,2024-05-14 21:45:33.616292223,2024-05-14 21:45:33.640331015,2024-05-14 21:45:33.616292937,2024-05-14 21:45:33.640327915,196,0.848194,2.921893,2024-05-14 21:45:33.640329107,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.024039,0.024035,,,
197,197,2024-05-14 21:45:33.669688210,2024-05-14 21:45:33.691757187,2024-05-14 21:45:33.669688210,2024-05-14 21:45:33.691755041,197,0.564557,4.385487,2024-05-14 21:45:33.691755041,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.022069,0.022067,,,
198,198,2024-05-14 21:45:33.718889937,2024-05-14 21:45:33.743203148,2024-05-14 21:45:33.718889937,2024-05-14 21:45:33.743201002,198,0.591694,4.139916,2024-05-14 21:45:33.743201002,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.024313,0.024311,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 21:45:05.695154890,2024-05-14 21:45:10.010076985,epoch,4.314922,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 21:45:10.041023001,2024-05-14 21:45:10.076940998,epoch,0.035918,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124038920,epoch,0.021186,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174702868,epoch,0.024716,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230028137,epoch,0.026877,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
150,2024-05-14 21:45:27.156115040,2024-05-14 21:45:30.579238877,fid,3.423124,cifar.standalone.logs.json,fid (logs/cifar.standalone.logs.json),150
0,2024-05-14 21:45:09.456994042,2024-05-14 21:45:10.010041937,is,0.553048,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),0
50,2024-05-14 21:45:16.527879938,2024-05-14 21:45:17.118575796,is,0.590696,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),50
100,2024-05-14 21:45:23.796581968,2024-05-14 21:45:24.346409067,is,0.549827,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),100


In [7]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_global_epochs_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-14 21:45:05.695154890,2024-05-14 21:45:10.010076985,epoch,4.314922,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-14 21:45:10.041023001,2024-05-14 21:45:10.076940998,epoch,0.035918,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124038920,epoch,0.021186,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174702868,epoch,0.024716,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230028137,epoch,0.026877,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
0,2024-05-14 21:45:06.488689899,2024-05-14 21:45:06.923895121,is,0.435205,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),0
50,2024-05-14 21:45:13.850691080,2024-05-14 21:45:14.251909971,is,0.401219,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),50
100,2024-05-14 21:45:20.799479961,2024-05-14 21:45:21.162173986,is,0.362694,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),100
150,2024-05-14 21:45:28.157515049,2024-05-14 21:45:28.581203938,is,0.423689,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),150


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,time_elapsed.swap_send,start.send_data,end.send_data,start.calc_gradients,end.calc_gradients,start.apply_gradients,end.apply_gradients,time_elapsed.send_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients
0,0,2024-05-14 21:45:05.695154890,2024-05-14 21:45:10.010076985,2024-05-14 21:45:05.695156082,2024-05-14 21:45:06.204516157,0.0,1.472082,2.691529,2024-05-14 21:45:06.204516157,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
1,1,2024-05-14 21:45:10.041023001,2024-05-14 21:45:10.076940998,2024-05-14 21:45:10.041023955,2024-05-14 21:45:10.076936945,1.0,1.784279,2.096194,2024-05-14 21:45:10.076936945,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
2,2,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124038920,2024-05-14 21:45:10.102852806,2024-05-14 21:45:10.124037012,2.0,1.396755,2.365873,2024-05-14 21:45:10.124037012,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,3,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174702868,2024-05-14 21:45:10.149987206,2024-05-14 21:45:10.174701199,3.0,1.701418,2.404415,2024-05-14 21:45:10.174701199,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,4,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230028137,2024-05-14 21:45:10.203150973,2024-05-14 21:45:10.230025038,4.0,1.705129,2.500046,2024-05-14 21:45:10.230025038,NaT,...,,NaT,NaT,NaT,NaT,NaT,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,2024-05-14 21:45:35.557549000,2024-05-14 21:45:35.633895874,2024-05-14 21:45:35.557549000,2024-05-14 21:45:35.633894920,,,,NaT,NaT,...,,2024-05-14 21:45:35.558845043,2024-05-14 21:45:35.621862888,2024-05-14 21:45:35.621863842,2024-05-14 21:45:35.630892038,2024-05-14 21:45:35.631006002,2024-05-14 21:45:35.633893967,0.063018,0.009028,0.002888
196,196,2024-05-14 21:45:35.637325048,2024-05-14 21:45:35.711971998,2024-05-14 21:45:35.637325048,2024-05-14 21:45:35.711971045,,,,NaT,NaT,...,,2024-05-14 21:45:35.638629198,2024-05-14 21:45:35.700303793,2024-05-14 21:45:35.700304985,2024-05-14 21:45:35.709089041,2024-05-14 21:45:35.709217072,2024-05-14 21:45:35.711970091,0.061675,0.008784,0.002753
197,197,2024-05-14 21:45:35.715347052,2024-05-14 21:45:35.789960861,2024-05-14 21:45:35.715347052,2024-05-14 21:45:35.789959908,,,,NaT,NaT,...,,2024-05-14 21:45:35.716833115,2024-05-14 21:45:35.777381182,2024-05-14 21:45:35.777381897,2024-05-14 21:45:35.786885977,2024-05-14 21:45:35.786990166,2024-05-14 21:45:35.789958000,0.060548,0.009504,0.002968
198,198,2024-05-14 21:45:35.793550014,2024-05-14 21:45:35.867182016,2024-05-14 21:45:35.793550014,2024-05-14 21:45:35.867181063,,,,NaT,NaT,...,,2024-05-14 21:45:35.795098066,2024-05-14 21:45:35.855318069,2024-05-14 21:45:35.855319023,2024-05-14 21:45:35.864324093,2024-05-14 21:45:35.864474773,2024-05-14 21:45:35.867181063,0.060220,0.009005,0.002706


In [8]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [9]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [10]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
)

timeline.show()