In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [2]:
logs_path = Path("logs")

# Standalone

In [3]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_df = pd.concat(workers_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_df)
print(workers_events_pairs)
workers_df = compute_time_elapsed(workers_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_df)

[('start.epoch', 'end.epoch'), ('start.calc_gradients', 'end.calc_gradients'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-17 12:34:41.269691944,2024-05-17 12:34:41.935734987,epoch,0.666043,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),0
1,2024-05-17 12:34:41.936331987,2024-05-17 12:34:45.955013037,epoch,4.018681,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),1
2,2024-05-17 12:34:45.955644846,2024-05-17 12:34:46.084653139,epoch,0.129008,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),2
3,2024-05-17 12:34:46.085072994,2024-05-17 12:34:46.159393072,epoch,0.074320,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),3
4,2024-05-17 12:34:46.159786940,2024-05-17 12:34:46.238758802,epoch,0.078972,mdgan.4.cifar.worker.1.logs.json,epoch (logs/mdgan.4.cifar.worker.1.logs.json),4
...,...,...,...,...,...,...,...
96,2024-05-17 12:34:56.868916988,2024-05-17 12:34:56.875380039,send,0.006463,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),96
97,2024-05-17 12:34:56.944839001,2024-05-17 12:34:56.953052998,send,0.008214,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),97
98,2024-05-17 12:34:57.019207001,2024-05-17 12:34:57.028297901,send,0.009091,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),98
99,2024-05-17 12:34:57.094881058,2024-05-17 12:34:57.101339817,send,0.006459,mdgan.4.cifar.worker.4.logs.json,send (logs/mdgan.4.cifar.worker.4.logs.json),99


Unnamed: 0,epoch,start.epoch,end.epoch,start.calc_gradients,end.calc_gradients,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv,...,dataset,worker,log,world_size,time_elapsed.epoch,time_elapsed.calc_gradients,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-17 12:34:41.269691944,2024-05-17 12:34:41.935734987,2024-05-17 12:34:41.392537117,2024-05-17 12:34:41.912353992,2024-05-17 12:34:41.274213791,2024-05-17 12:34:41.392537117,2024-05-17 12:34:41.912355185,2024-05-17 12:34:41.935734034,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.666043,0.519817,0.118323,0.023379,,
1,1,2024-05-17 12:34:41.936331987,2024-05-17 12:34:45.955013037,2024-05-17 12:34:45.878186941,2024-05-17 12:34:45.944849014,2024-05-17 12:34:41.940551996,2024-05-17 12:34:45.878102064,2024-05-17 12:34:45.944849014,2024-05-17 12:34:45.955011129,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,4.018681,0.066662,3.937550,0.010162,,
2,2,2024-05-17 12:34:45.955644846,2024-05-17 12:34:46.084653139,2024-05-17 12:34:46.060517788,2024-05-17 12:34:46.077808142,2024-05-17 12:34:45.958003998,2024-05-17 12:34:46.060517788,2024-05-17 12:34:46.077808857,2024-05-17 12:34:46.084650040,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.129008,0.017290,0.102514,0.006841,,
3,3,2024-05-17 12:34:46.085072994,2024-05-17 12:34:46.159393072,2024-05-17 12:34:46.136637926,2024-05-17 12:34:46.153148890,2024-05-17 12:34:46.087091923,2024-05-17 12:34:46.136636972,2024-05-17 12:34:46.153148890,2024-05-17 12:34:46.159392118,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.074320,0.016511,0.049545,0.006243,,
4,4,2024-05-17 12:34:46.159786940,2024-05-17 12:34:46.238758802,2024-05-17 12:34:46.213380098,2024-05-17 12:34:46.230787039,2024-05-17 12:34:46.161608934,2024-05-17 12:34:46.213380098,2024-05-17 12:34:46.230787992,2024-05-17 12:34:46.238757849,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.1.logs.json,4,0.078972,0.017407,0.051771,0.007970,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,2024-05-17 12:34:56.804940939,2024-05-17 12:34:56.875380993,2024-05-17 12:34:56.853054047,2024-05-17 12:34:56.868916035,2024-05-17 12:34:56.806840181,2024-05-17 12:34:56.853054047,2024-05-17 12:34:56.868916988,2024-05-17 12:34:56.875380039,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.070440,0.015862,0.046214,0.006463,,
97,97,2024-05-17 12:34:56.877390146,2024-05-17 12:34:56.953053951,2024-05-17 12:34:56.928361893,2024-05-17 12:34:56.944839001,2024-05-17 12:34:56.879200935,2024-05-17 12:34:56.928361893,2024-05-17 12:34:56.944839001,2024-05-17 12:34:56.953052998,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.075664,0.016477,0.049161,0.008214,,
98,98,2024-05-17 12:34:56.955003977,2024-05-17 12:34:57.028299093,2024-05-17 12:34:57.003547907,2024-05-17 12:34:57.019207001,2024-05-17 12:34:56.957017183,2024-05-17 12:34:57.003547907,2024-05-17 12:34:57.019207001,2024-05-17 12:34:57.028297901,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.073295,0.015659,0.046531,0.009091,,
99,99,2024-05-17 12:34:57.030099869,2024-05-17 12:34:57.101341009,2024-05-17 12:34:57.079260826,2024-05-17 12:34:57.094881058,2024-05-17 12:34:57.032063961,2024-05-17 12:34:57.079260826,2024-05-17 12:34:57.094881058,2024-05-17 12:34:57.101339817,NaT,...,worker,logs,logs/mdgan.4.cifar.worker.4.logs.json,4,0.071241,0.015620,0.047197,0.006459,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.send_data', 'end.send_data'), ('start.recv_data', 'end.recv_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.generate_data', 'end.generate_data'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-17 12:34:41.268101215,2024-05-17 12:34:45.847975969,epoch,4.579875,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),0
1,2024-05-17 12:34:45.848304033,2024-05-17 12:34:46.014287949,epoch,0.165984,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),1
2,2024-05-17 12:34:46.014853954,2024-05-17 12:34:46.100283861,epoch,0.085430,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),2
3,2024-05-17 12:34:46.100704908,2024-05-17 12:34:46.175889015,epoch,0.075184,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),3
4,2024-05-17 12:34:46.176311970,2024-05-17 12:34:46.251440048,epoch,0.075128,mdgan.4.cifar.server.logs.json,epoch (logs/mdgan.4.cifar.server.logs.json),4
...,...,...,...,...,...,...,...
99,2024-05-17 12:34:57.046897888,2024-05-17 12:34:57.048532009,generate_data,0.001634,mdgan.4.cifar.server.logs.json,generate_data (logs/mdgan.4.cifar.server.logs....,99
0,2024-05-17 12:34:42.691653013,2024-05-17 12:34:45.824069977,fid,3.132417,mdgan.4.cifar.server.logs.json,fid (logs/mdgan.4.cifar.server.logs.json),0
50,2024-05-17 12:34:50.220789909,2024-05-17 12:34:53.242014885,fid,3.021225,mdgan.4.cifar.server.logs.json,fid (logs/mdgan.4.cifar.server.logs.json),50
0,2024-05-17 12:34:42.219161034,2024-05-17 12:34:42.691653013,is,0.472492,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),0


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.send_data,end.send_data,start.recv_data,end.recv_data,start.calc_gradients,...,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.send_data,time_elapsed.recv_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.generate_data,time_elapsed.fid,time_elapsed.is
0,0,2024-05-17 12:34:41.268101215,2024-05-17 12:34:45.847975969,2024-05-17 12:34:41.268101215,2024-05-17 12:34:42.210849047,2024-05-17 12:34:41.378783941,2024-05-17 12:34:41.393336058,2024-05-17 12:34:41.393359900,2024-05-17 12:34:41.957900047,2024-05-17 12:34:41.958482027,...,logs/mdgan.4.cifar.server.logs.json,4.579875,0.942748,0.014552,0.564540,0.142059,0.110188,0.110674,3.132417,0.472492
1,1,2024-05-17 12:34:45.848304033,2024-05-17 12:34:46.014287949,2024-05-17 12:34:45.848304033,2024-05-17 12:34:46.014286995,2024-05-17 12:34:45.851809978,2024-05-17 12:34:45.878325939,2024-05-17 12:34:45.878343821,2024-05-17 12:34:45.989711046,2024-05-17 12:34:45.990226984,...,logs/mdgan.4.cifar.server.logs.json,0.165984,0.165983,0.026516,0.111367,0.011148,0.012818,0.003504,,
2,2,2024-05-17 12:34:46.014853954,2024-05-17 12:34:46.100283861,2024-05-17 12:34:46.014853954,2024-05-17 12:34:46.100282907,2024-05-17 12:34:46.017874956,2024-05-17 12:34:46.061367035,2024-05-17 12:34:46.061388016,2024-05-17 12:34:46.088165045,2024-05-17 12:34:46.088674068,...,logs/mdgan.4.cifar.server.logs.json,0.085430,0.085429,0.043492,0.026777,0.008720,0.002788,0.003018,,
3,3,2024-05-17 12:34:46.100704908,2024-05-17 12:34:46.175889015,2024-05-17 12:34:46.100704908,2024-05-17 12:34:46.175887823,2024-05-17 12:34:46.102574110,2024-05-17 12:34:46.137769938,2024-05-17 12:34:46.137792110,2024-05-17 12:34:46.163587093,2024-05-17 12:34:46.164052963,...,logs/mdgan.4.cifar.server.logs.json,0.075184,0.075183,0.035196,0.025795,0.008942,0.002778,0.001868,,
4,4,2024-05-17 12:34:46.176311970,2024-05-17 12:34:46.251440048,2024-05-17 12:34:46.176311970,2024-05-17 12:34:46.251440048,2024-05-17 12:34:46.177803993,2024-05-17 12:34:46.214215994,2024-05-17 12:34:46.214241982,2024-05-17 12:34:46.239002943,2024-05-17 12:34:46.239430904,...,logs/mdgan.4.cifar.server.logs.json,0.075128,0.075128,0.036412,0.024761,0.009144,0.002746,0.001490,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,2024-05-17 12:34:56.746373892,2024-05-17 12:34:56.815877914,2024-05-17 12:34:56.746373892,2024-05-17 12:34:56.815877914,2024-05-17 12:34:56.747829914,2024-05-17 12:34:56.776414871,2024-05-17 12:34:56.776487827,2024-05-17 12:34:56.803265095,2024-05-17 12:34:56.803734064,...,logs/mdgan.4.cifar.server.logs.json,0.069504,0.069504,0.028585,0.026777,0.009355,0.002688,0.001455,,
96,96,2024-05-17 12:34:56.818325043,2024-05-17 12:34:56.892124891,2024-05-17 12:34:56.818325043,2024-05-17 12:34:56.892122984,2024-05-17 12:34:56.820090771,2024-05-17 12:34:56.852176905,2024-05-17 12:34:56.852235079,2024-05-17 12:34:56.878489017,2024-05-17 12:34:56.879183769,...,logs/mdgan.4.cifar.server.logs.json,0.073800,0.073798,0.032086,0.026254,0.010043,0.002779,0.001764,,
97,97,2024-05-17 12:34:56.894564152,2024-05-17 12:34:56.965866089,2024-05-17 12:34:56.894564152,2024-05-17 12:34:56.965864897,2024-05-17 12:34:56.895877123,2024-05-17 12:34:56.927735090,2024-05-17 12:34:56.927797079,2024-05-17 12:34:56.953094006,2024-05-17 12:34:56.953671932,...,logs/mdgan.4.cifar.server.logs.json,0.071302,0.071301,0.031858,0.025297,0.009421,0.002649,0.001311,,
98,98,2024-05-17 12:34:56.968370914,2024-05-17 12:34:57.044190884,2024-05-17 12:34:56.968370914,2024-05-17 12:34:57.044189930,2024-05-17 12:34:56.969718933,2024-05-17 12:34:57.002426147,2024-05-17 12:34:57.002522945,2024-05-17 12:34:57.028800964,2024-05-17 12:34:57.029929876,...,logs/mdgan.4.cifar.server.logs.json,0.075820,0.075819,0.032707,0.026278,0.009700,0.004461,0.001347,,


In [6]:
data_size = server_df["size.data"].iloc[0]
feedback_size = server_df["size.feedback"].iloc[0]
model_size = workers_df["size.model"].iloc[0]

print(f"Data size: {data_size:.2f}MB")
print(f"Feedback size: {feedback_size:.2f}MB")
print(f"Model size: {model_size:.2f}MB")

Data size: 19.66MB
Feedback size: 0.49MB
Model size: 2.53MB


In [7]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))
logs_standalones

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: 2 days 14:14:16.097607374
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.is,fid,is,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is
0,0,2024-05-17 12:34:41.268100841,2024-05-17 12:34:46.028608186,2024-05-17 12:34:41.268102033,2024-05-17 12:34:41.800711973,0,1.472082,2.691529,2024-05-17 12:34:41.800711973,NaT,...,2024-05-17 12:34:46.028537853,427.379852,1.097491,cifar,cifar.standalone.logs.json,4.760507,0.532610,,3.541883,0.665565
1,1,2024-05-17 12:34:46.063891990,2024-05-17 12:34:46.118600948,2024-05-17 12:34:46.063893182,2024-05-17 12:34:46.118596895,1,1.784279,2.096194,2024-05-17 12:34:46.118598087,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.054709,0.054704,,,
2,2,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208524091,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208520992,2,1.396755,2.365873,2024-05-17 12:34:46.208520992,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.034941,0.034938,,,
3,3,2024-05-17 12:34:46.257329090,2024-05-17 12:34:46.284704073,2024-05-17 12:34:46.257330043,2024-05-17 12:34:46.284700019,3,1.701418,2.404415,2024-05-17 12:34:46.284700973,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.027375,0.027370,,,
4,4,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342599018,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342596872,4,1.705129,2.500046,2024-05-17 12:34:46.342596872,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.023476,0.023474,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2024-05-17 12:37:06.720651014,2024-05-17 12:37:06.748970134,2024-05-17 12:37:06.720651014,2024-05-17 12:37:06.748961790,995,0.500290,3.374500,2024-05-17 12:37:06.748962982,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.028319,0.028311,,,
996,996,2024-05-17 12:37:06.793279035,2024-05-17 12:37:06.827307089,2024-05-17 12:37:06.793279989,2024-05-17 12:37:06.827303989,996,0.727101,2.680925,2024-05-17 12:37:06.827303989,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.034028,0.034024,,,
997,997,2024-05-17 12:37:06.868501051,2024-05-17 12:37:06.892108066,2024-05-17 12:37:06.868501051,2024-05-17 12:37:06.892106159,997,1.054406,1.746657,2024-05-17 12:37:06.892106159,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.023607,0.023605,,,
998,998,2024-05-17 12:37:06.944162948,2024-05-17 12:37:06.975633962,2024-05-17 12:37:06.944162948,2024-05-17 12:37:06.975630863,998,1.092340,1.281015,2024-05-17 12:37:06.975630863,NaT,...,NaT,,,cifar,cifar.standalone.logs.json,0.031471,0.031468,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-17 12:34:41.268100841,2024-05-17 12:34:46.028608186,epoch,4.760507,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-17 12:34:46.063891990,2024-05-17 12:34:46.118600948,epoch,0.054709,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208524091,epoch,0.034941,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-17 12:34:46.257329090,2024-05-17 12:34:46.284704073,epoch,0.027375,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342599018,epoch,0.023476,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
750,2024-05-17 12:36:32.143970831,2024-05-17 12:36:32.740421875,is,0.596451,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),750
800,2024-05-17 12:36:39.885171993,2024-05-17 12:36:40.473644121,is,0.588472,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),800
850,2024-05-17 12:36:47.621424062,2024-05-17 12:36:48.192218883,is,0.570795,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),850
900,2024-05-17 12:36:55.328119142,2024-05-17 12:36:55.879536016,is,0.551417,cifar.standalone.logs.json,is (logs/cifar.standalone.logs.json),900


In [8]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-17 12:34:41.268100841,2024-05-17 12:34:46.028608186,epoch,4.760507,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),0
1,2024-05-17 12:34:46.063891990,2024-05-17 12:34:46.118600948,epoch,0.054709,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),1
2,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208524091,epoch,0.034941,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),2
3,2024-05-17 12:34:46.257329090,2024-05-17 12:34:46.284704073,epoch,0.027375,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),3
4,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342599018,epoch,0.023476,cifar.standalone.logs.json,epoch (logs/cifar.standalone.logs.json),4
...,...,...,...,...,...,...,...
99,2024-05-17 12:34:57.046897888,2024-05-17 12:34:57.048532009,generate_data,0.001634,mdgan.4.cifar.server.logs.json,generate_data (logs/mdgan.4.cifar.server.logs....,99
0,2024-05-17 12:34:42.691653013,2024-05-17 12:34:45.824069977,fid,3.132417,mdgan.4.cifar.server.logs.json,fid (logs/mdgan.4.cifar.server.logs.json),0
50,2024-05-17 12:34:50.220789909,2024-05-17 12:34:53.242014885,fid,3.021225,mdgan.4.cifar.server.logs.json,fid (logs/mdgan.4.cifar.server.logs.json),50
0,2024-05-17 12:34:42.219161034,2024-05-17 12:34:42.691653013,is,0.472492,mdgan.4.cifar.server.logs.json,is (logs/mdgan.4.cifar.server.logs.json),0


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.send_data,start.apply_gradients,end.apply_gradients,start.generate_data,end.generate_data,size.data,size.feedback,time_elapsed.send_data,time_elapsed.apply_gradients,time_elapsed.generate_data
0,0,2024-05-17 12:34:41.268100841,2024-05-17 12:34:46.028608186,2024-05-17 12:34:41.268102033,2024-05-17 12:34:41.800711973,0.0,1.472082,2.691529,2024-05-17 12:34:41.800711973,NaT,...,NaT,NaT,NaT,NaT,NaT,,,,,
1,1,2024-05-17 12:34:46.063891990,2024-05-17 12:34:46.118600948,2024-05-17 12:34:46.063893182,2024-05-17 12:34:46.118596895,1.0,1.784279,2.096194,2024-05-17 12:34:46.118598087,NaT,...,NaT,NaT,NaT,NaT,NaT,,,,,
2,2,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208524091,2024-05-17 12:34:46.173582895,2024-05-17 12:34:46.208520992,2.0,1.396755,2.365873,2024-05-17 12:34:46.208520992,NaT,...,NaT,NaT,NaT,NaT,NaT,,,,,
3,3,2024-05-17 12:34:46.257329090,2024-05-17 12:34:46.284704073,2024-05-17 12:34:46.257330043,2024-05-17 12:34:46.284700019,3.0,1.701418,2.404415,2024-05-17 12:34:46.284700973,NaT,...,NaT,NaT,NaT,NaT,NaT,,,,,
4,4,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342599018,2024-05-17 12:34:46.319123132,2024-05-17 12:34:46.342596872,4.0,1.705129,2.500046,2024-05-17 12:34:46.342596872,NaT,...,NaT,NaT,NaT,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,2024-05-17 12:34:56.746373892,2024-05-17 12:34:56.815877914,2024-05-17 12:34:56.746373892,2024-05-17 12:34:56.815877914,,,,NaT,NaT,...,2024-05-17 12:34:56.776414871,2024-05-17 12:34:56.813189030,2024-05-17 12:34:56.815877199,2024-05-17 12:34:56.746374846,2024-05-17 12:34:56.747829914,19.6608,0.49152,0.028585,0.002688,0.001455
96,96,2024-05-17 12:34:56.818325043,2024-05-17 12:34:56.892124891,2024-05-17 12:34:56.818325043,2024-05-17 12:34:56.892122984,,,,NaT,NaT,...,2024-05-17 12:34:56.852176905,2024-05-17 12:34:56.889343977,2024-05-17 12:34:56.892122984,2024-05-17 12:34:56.818325996,2024-05-17 12:34:56.820090055,19.6608,0.49152,0.032086,0.002779,0.001764
97,97,2024-05-17 12:34:56.894564152,2024-05-17 12:34:56.965866089,2024-05-17 12:34:56.894564152,2024-05-17 12:34:56.965864897,,,,NaT,NaT,...,2024-05-17 12:34:56.927735090,2024-05-17 12:34:56.963215113,2024-05-17 12:34:56.965863943,2024-05-17 12:34:56.894565105,2024-05-17 12:34:56.895876169,19.6608,0.49152,0.031858,0.002649,0.001311
98,98,2024-05-17 12:34:56.968370914,2024-05-17 12:34:57.044190884,2024-05-17 12:34:56.968370914,2024-05-17 12:34:57.044189930,,,,NaT,NaT,...,2024-05-17 12:34:57.002426147,2024-05-17 12:34:57.039728165,2024-05-17 12:34:57.044188976,2024-05-17 12:34:56.968371868,2024-05-17 12:34:56.969718933,19.6608,0.49152,0.032707,0.004461,0.001347


In [9]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [10]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [11]:
mean_time_elapsed = workers_events_df[["event", "time_elapsed"]].groupby("event").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="event", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="event", title="Mean time elapsed", template="plotly_white").show()

In [12]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
    template="plotly_white",
)

timeline.show()