In [1]:
import wandb
from wandb.apis.public.runs import Run
from datetime import datetime, timedelta
import pandas as pd

# W&B entity and project whose runs this notebook inspects.
ENTITY = "hyena"
PROJECT = "7b-context-extension"
# Derived from ENTITY/PROJECT so the URL cannot drift from the path queried below.
PROJECT_URL = f"https://wandb.ai/{ENTITY}/{PROJECT}"
# Query the W&B public API for the runs of ENTITY/PROJECT.
runs: list = wandb.Api().runs(f"{ENTITY}/{PROJECT}")

In [8]:
# Substring a run's name must contain for the run to be kept.
pattern = "n32"
# Rebinds `runs` to the matching subset, so later cells only see the filtered runs.
runs: list[Run] = list(filter(lambda candidate: pattern in candidate.name, runs))

In [9]:
# Print the name and id of every selected run.
# The loop variable `run` stays bound to the last run once the loop has finished.
for run in runs:
    print(run.name, run.id)

n32-log-32K 20241109194350
n32-linear-32K 20241110010914
n32-evo1-32K 20241110022836
n32-5x-32K 20241110042338
n32-5x-32K 20241110054044


In [12]:
def convert_iso_to_dt(iso: str) -> datetime:
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    A trailing ``Z`` (UTC designator, as found in values such as
    ``2024-11-10T03:43:51.300021Z``) is rewritten to ``+00:00`` before
    parsing, because ``datetime.fromisoformat`` only accepts the ``Z`` suffix
    from Python 3.11 onwards.

    Args:
        iso: ISO-8601 formatted date/time string.

    Returns:
        The parsed ``datetime``; timezone-aware when ``iso`` carries an offset
        or ``Z``, naive otherwise.

    Raises:
        ValueError: If ``iso`` is not a valid ISO-8601 string.
    """
    if iso.endswith("Z"):
        iso = iso[:-1] + "+00:00"
    return datetime.fromisoformat(iso)


# Metric names requested from each run's history; used as the default `keys`
# argument of `summarize_runs`.
HISTORY_KEYS = [
    "_step",
    "data/tokens_per_second_per_gpu",
    "runtime/iteration_time",
    "train/lm_loss",
    "validation/lm_loss",
]


def summarize_runs(runs: list[Run], num_samples=None, keys=HISTORY_KEYS):
    """Collect start time, duration, history and headline statistics per run.

    For every run the logged history is fetched, a short text summary is
    printed, and an entry keyed by the run's URL is added to the result.
    Runs with an empty history are skipped with a message. Runs that raise
    while being processed (for example when ``run.summary`` has no ``_wandb``
    entry) are reported and skipped, so one failing run does not abort the
    others. An entry is only stored once all of its fields were computed.

    Args:
        runs: Runs to summarize.
        num_samples: If ``None``, the history is read with
            ``run.scan_history``; otherwise ``run.history`` is called with
            ``samples=num_samples``.
        keys: History keys to fetch. The statistics below read the
            throughput, iteration-time and train/validation loss columns, so
            those keys must be included.

    Returns:
        dict mapping ``run.url`` to a dict with the fields ``start_dt``,
        ``metadata``, ``history``, ``duration`` (hours), ``avg_throughput``,
        ``avg_iteration_time``, ``start_train_loss``, ``end_train_loss``,
        ``start_val_loss`` and ``end_val_loss``.
    """
    throughput_col = "data/tokens_per_second_per_gpu"
    iter_time_col = "runtime/iteration_time"

    run_dict = {}
    for run in runs:
        metadata = run.metadata
        try:
            # Resolved before any early exit so the skip message can name the run.
            key = run.url

            # Start time (kept as the raw string) and total runtime
            start = metadata["startedAt"]
            duration = timedelta(seconds=run.summary["_wandb"]["runtime"])
            duration_hrs = duration.total_seconds() / 3600

            # Get run history
            history = (
                pd.DataFrame(run.scan_history(keys=keys))
                if num_samples is None
                else run.history(keys=keys, samples=num_samples)
            )
            if len(history) == 0:
                print(f"Skipping {key} due to empty history")
                continue

            # Average only the two columns that are reported instead of the whole frame
            avg_tpt, avg_t = history[[throughput_col, iter_time_col]].mean()
            train_loss = history["train/lm_loss"]
            val_loss = history["validation/lm_loss"]
            start_train_loss, end_train_loss = train_loss.iloc[0], train_loss.iloc[-1]
            start_val_loss, end_val_loss = val_loss.iloc[0], val_loss.iloc[-1]

            run_dict[key] = {
                "start_dt": start,
                "metadata": metadata,
                "history": history,
                "duration": duration_hrs,
                "avg_throughput": avg_tpt,
                "avg_iteration_time": avg_t,
                "start_train_loss": start_train_loss,
                "end_train_loss": end_train_loss,
                "start_val_loss": start_val_loss,
                "end_val_loss": end_val_loss,
            }

            # Print summary
            header = f"{key}:\n Started at: {start} Duration: {duration_hrs:.2f}hrs"
            runtime = f"Avg Throughput: {avg_tpt:.2f} Avg Iteration Time: {avg_t:.2f}"
            losses = f"Start Train Loss: {start_train_loss:.2f} End Train Loss: {end_train_loss:.2f}\n Start Val Loss: {start_val_loss:.2f} End Val Loss: {end_val_loss:.2f}"
            print_str = "\n ".join([header, runtime, losses])
            print(print_str)
        except Exception as e:
            # Best effort: report the failing run and continue with the next one.
            print(f"Error processing {run.name}: {e}")

    return run_dict

In [13]:
# Summarize every selected run; num_samples is left at its default (None).
run_dict = summarize_runs(runs)

https://wandb.ai/hyena/7b-context-extension/runs/20241109194350:
 Started at: 2024-11-10T03:43:51.300021Z Duration: 7.24hrs
 Avg Throughput: 8163.33 Avg Iteration Time: 2.01
 Start Train Loss: 1.04 End Train Loss: 1.00
 Start Val Loss: 1.04 End Val Loss: 1.02
https://wandb.ai/hyena/7b-context-extension/runs/20241110010914:
 Started at: 2024-11-10T09:09:15.689373Z Duration: 6.52hrs
 Avg Throughput: 8188.92 Avg Iteration Time: 2.00
 Start Train Loss: 1.04 End Train Loss: 1.01
 Start Val Loss: 1.04 End Val Loss: 1.02
https://wandb.ai/hyena/7b-context-extension/runs/20241110022836:
 Started at: 2024-11-10T10:28:37.205489Z Duration: 5.20hrs
 Avg Throughput: 8191.73 Avg Iteration Time: 2.00
 Start Train Loss: 1.03 End Train Loss: 1.05
 Start Val Loss: 1.03 End Val Loss: 1.03
Error processing n32-5x-32K: '_wandb'


In [15]:
# Transpose so that each key of run_dict (a run URL) becomes a row and each field a column.
summary_df = pd.DataFrame(run_dict).T

In [17]:
# Show the history table stored for the first run in the summary.
summary_df["history"].iloc[0]

Unnamed: 0,_step,data/tokens_per_second_per_gpu,runtime/iteration_time,train/lm_loss,validation/lm_loss
0,500,8166.196378,2.00632,1.042655,1.038076
1,1000,8142.628651,2.012127,1.042721,1.041353
2,1500,8166.511001,2.006242,1.047059,1.031628
3,2000,8162.448008,2.007241,1.031888,1.030769
4,2500,8161.976262,2.007357,1.029305,1.033385
5,3000,8162.131178,2.007319,1.055158,1.027669
6,3500,8164.764871,2.006671,1.029761,1.031546
7,4000,8159.314529,2.008012,1.013679,1.028609
8,4500,8168.378271,2.005784,1.043509,1.026533
9,5000,8156.239408,2.008769,1.031827,1.023422


In [20]:
# Start timestamp recorded for the last run in the summary table.
d = summary_df["start_dt"].iloc[-1]
d

'2024-11-04T08:17:39.311952Z'

In [21]:
from datetime import datetime
import pytz

# Start timestamp string picked from the summary table (`d`); it ends in "Z"
utc_datetime_str = d
# Parse the string (the "Z" is matched as a literal character) and attach the UTC timezone
utc_datetime = datetime.strptime(utc_datetime_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)

# Convert to the America/Los_Angeles timezone
pst_timezone = pytz.timezone("America/Los_Angeles")
pst_datetime = utc_datetime.astimezone(pst_timezone)

# Format to desired output, e.g. "November 04, 2024 at 12:17:39 AM PST"
formatted_datetime = pst_datetime.strftime("%B %d, %Y at %I:%M:%S %p %Z")
print(formatted_datetime)

November 04, 2024 at 12:17:39 AM PST


In [None]:
# Convert to the America/Los_Angeles timezone
pst_timezone = pytz.timezone("America/Los_Angeles")
pst_datetime = utc_datetime.astimezone(pst_timezone)

# English ordinal suffix for the day of month: 11-13 always take "th",
# otherwise the last digit decides (1 -> "st", 2 -> "nd", 3 -> "rd").
day = pst_datetime.day
day_suffix = "th" if 11 <= day % 100 <= 13 else {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")

# Format to desired output, e.g. "November 04th, 2024 at 12:17:39 AM PST"
formatted_datetime = pst_datetime.strftime(f"%B %d{day_suffix}, %Y at %I:%M:%S %p %Z")
print(formatted_datetime)

In [107]:
# dt_fmt = "%Y-%m-%d-%H:%M"
# summary_df["start"] = summary_df["start"].map(lambda x: x.strftime(dt_fmt))
# summary_df["end"] = summary_df["end"].map(lambda x: x.strftime(dt_fmt))
# Every column except the nested per-run objects, kept in the frame's own
# column order (a set difference would return them in arbitrary order).
summary_keys = [col for col in summary_df.columns if col not in ("history", "metadata")]

In [111]:
# Preferred column order for the exported summary. Only columns that actually
# exist in summary_df are kept: summarize_runs stores the start time under
# "start_dt" and does not produce "start"/"end", so indexing with a fixed list
# containing those names raises KeyError.
preferred_order = [
    "start_dt",
    "start",
    "end",
    "duration",
    "avg_throughput",
    "avg_iteration_time",
    "start_train_loss",
    "end_train_loss",
    "start_val_loss",
    "end_val_loss",
]
col_order = [col for col in preferred_order if col in summary_df.columns]
summary_stats = summary_df[summary_keys][col_order]
# NOTE(review): the file names say "40b-train" while PROJECT is "7b-context-extension" — confirm the intended output names.
summary_stats.to_csv("40b-train-summary.csv")
# NOTE(review): "metadata" and "history" hold nested objects, which CSV can only store as text — confirm this export is still wanted.
summary_df[col_order + ["metadata", "history"]].to_csv("40b-train-summary-full.csv")

In [112]:
# Display the per-run summary table.
summary_stats

Unnamed: 0,start,end,duration,avg_throughput,avg_iteration_time,start_train_loss,end_train_loss,start_val_loss,end_val_loss
https://wandb.ai/hyena/40b-train/runs/20241027162236,2024-10-27-23:22,2024-10-28-00:25,1.045,1355.045421,6.045603,1.383369,1.325034,1.363242,1.329777
https://wandb.ai/hyena/40b-train/runs/20241027172710,2024-10-28-00:27,2024-10-28-01:53,1.439444,1369.599623,5.98163,1.345629,1.278811,1.350023,1.290337
https://wandb.ai/hyena/40b-train/runs/20241028003618,2024-10-28-07:36,2024-10-28-15:35,7.982778,1372.898614,5.96716,1.4619,1.156094,1.526385,1.187329
https://wandb.ai/hyena/40b-train/runs/20241028090952,2024-10-28-16:09,2024-10-29-00:08,7.978611,1355.543759,6.043941,1.166044,1.109557,1.194547,1.143814
https://wandb.ai/hyena/40b-train/runs/20241028171048,2024-10-29-00:10,2024-10-29-00:39,0.473889,1359.966513,6.023678,1.11747,1.11747,1.147641,1.147641
https://wandb.ai/hyena/40b-train/runs/20241028174114,2024-10-29-00:41,2024-10-29-09:28,8.785833,1386.274006,5.909985,1.115309,1.081061,1.147572,1.117539
https://wandb.ai/hyena/40b-train/runs/20241029043756,2024-10-29-11:37,2024-10-30-05:00,17.381667,1404.561807,5.832629,1.07964,1.047377,1.117321,1.0855
https://wandb.ai/hyena/40b-train/runs/20241029220853,2024-10-30-05:08,2024-11-01-05:07,47.983889,1403.15805,5.83841,1.052863,1.023916,1.086303,1.056264
https://wandb.ai/hyena/40b-train/runs/20241031222654,2024-11-01-05:26,2024-11-03-05:26,47.985278,1381.53987,5.930307,1.020522,1.00833,1.055115,1.043902
https://wandb.ai/hyena/40b-train/runs/20241102234744,2024-11-03-06:47,2024-11-03-09:42,2.918611,1385.826899,5.911748,1.008339,1.008232,1.041133,1.041214


In [103]:
# URL of the run currently bound to `run`; summarize_runs uses this value as the run's key.
run.url

'https://wandb.ai/hyena/40b-train/runs/20241027162236'

In [31]:
# NOTE(review): `metadata` is not assigned at notebook level in the cells shown here —
# presumably `run.metadata` from an earlier session; confirm before re-running.
metadata["startedAt"]
# Convert to datetime
from datetime import datetime

datetime.fromisoformat(metadata["startedAt"])

'2024-10-27T23:22:36.668030Z'

datetime.datetime(2024, 10, 27, 23, 22, 36, 668030, tzinfo=datetime.timezone.utc)

In [64]:
# Request the history of `run` for the metrics listed in HISTORY_KEYS.
h = run.scan_history(keys=HISTORY_KEYS)

In [65]:
# Build a DataFrame from the scanned history `h`.
df = pd.DataFrame(h)

In [66]:
# Preview the first rows of the history table.
df.head()

Unnamed: 0,_step,data/tokens_per_second_per_gpu,runtime/iteration_time,train/lm_loss,validation/lm_loss
0,200,1351.188812,6.062809,1.383369,1.363242
1,400,1358.90203,6.028396,1.325034,1.329777


In [21]:
# Read the "_runtime" and "_timestamp" entries from the run's summary.
# NOTE(review): presumably elapsed seconds and a Unix epoch timestamp — confirm against the W&B docs.
runtime = run.summary["_runtime"]
timestamp = run.summary["_timestamp"]

In [22]:
# Convert the summary timestamp to a (naive, local-time) datetime.
# The value is read from run.summary rather than from the previous `timestamp`
# binding, so re-running this cell does not pass an already converted datetime
# to fromtimestamp. `datetime` comes from the notebook's top import cell.
timestamp = datetime.fromtimestamp(run.summary["_timestamp"])

In [24]:
# List the fields available in `metadata`.
metadata.keys()

dict_keys(['os', 'python', 'startedAt', 'args', 'program', 'codePath', 'git', 'email', 'root', 'host', 'username', 'executable', 'codePathLocal', 'cpu_count', 'cpu_count_logical', 'gpu', 'gpu_count', 'disk', 'memory', 'cpu', 'gpu_nvidia', 'slurm', 'cudaVersion'])

In [30]:
# Runtime stored under summary["_wandb"]["runtime"]; summarize_runs passes this value to timedelta as seconds.
duration = run.summary["_wandb"]["runtime"]