# 📈 Experiments

This notebook analyses the results of experiments as tracked to W&B.


## Setup 

In [None]:
# (Re)load the autoreload extension and set it to mode 2, so that edited
# modules are re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
import autorootcwd

In [None]:
# Imports
from typing import Dict

import wandb
from wandb.sdk.wandb_run import Run

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Constants
# W&B entity and project; combined as "<entity>/<project>" when fetching runs.
WANDB_ENTITY = "mikasenghaas"
WANDB_PROJECT = "swarm"

In [None]:
# Helpers
def get_gpu(run: Run) -> Dict:
    """Describe the accelerator a run was executed on, based on its W&B metadata.

    Args:
        run: W&B run whose ``metadata`` mapping is inspected.

    Returns:
        A dict that always contains a ``"name"`` key:

        - NVIDIA runs: ``"name"`` and ``"memory"`` (``memoryTotal``) of the
          first listed GPU, plus ``"count"`` (number of listed GPUs).
        - Apple runs: ``"name"`` (``gpuType``) and ``"count": 1``.
        - Anything else, including runs without metadata:
          ``{"name": "Unknown"}``.
    """
    # Read the metadata once and fall back to an empty mapping, so that runs
    # without metadata are reported as "Unknown" instead of raising TypeError.
    metadata = run.metadata or {}

    nvidia_gpus = metadata.get("gpu_nvidia")
    if nvidia_gpus:  # key present AND at least one GPU listed (avoids IndexError)
        first_gpu = nvidia_gpus[0]
        return {"name": first_gpu["name"], "memory": first_gpu["memoryTotal"], "count": len(nvidia_gpus)}

    if "gpuapple" in metadata:
        return {"name": metadata["gpuapple"]["gpuType"], "count": 1}

    return {"name": "Unknown"}

def get_config(run: Run) -> Dict:
    """Return a copy of the run's config with an added ``"gpu"`` entry from ``get_gpu``."""
    config = dict(run.config)
    config["gpu"] = get_gpu(run)
    return config

def get_history(run: Run) -> pd.DataFrame:
    """Fetch the logged history of a run as a frame indexed by the run's id.

    Args:
        run: W&B run whose history is fetched via ``run.history()``.

    Returns:
        The history frame with every row labelled by ``run.id`` in an index
        named ``"run_id"``, so histories of several runs can be concatenated
        and told apart.
    """
    # NOTE(review): `run.history()` is called with its default arguments; the
    # W&B API may return a sampled subset for long runs - TODO confirm.
    history = run.history()
    # `assign` broadcasts the id to every row regardless of the frame's index,
    # so no positional alignment between a separate Series and the frame is needed.
    return history.assign(run_id=run.id).set_index("run_id")

def get_summary(run: Run) -> pd.DataFrame:
    """Return the run's summary as a one-row frame indexed by the run's id.

    Args:
        run: W&B run whose ``summary`` is converted.

    Returns:
        A ``DataFrame`` with a single row (index ``run.id``) and one column
        per key of ``run.summary``.
    """
    return pd.DataFrame([dict(run.summary)], index=[run.id])


In [None]:
# Styling
# Apply the "whitegrid" seaborn theme, then switch the active palette to the
# reversed "Blues" palette for all following figures.
sns.set_theme(style="whitegrid")
sns.set_palette("Blues_r")

In [None]:
# Create the W&B API client
api = wandb.Api()

# Fetch all runs of the "<entity>/<project>" project
project_path = "/".join([WANDB_ENTITY, WANDB_PROJECT])
RUNS = api.runs(project_path)
print(f"✅ Loaded {len(RUNS)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT})")

## Experiment 1: Verify Gradient Accumulation

This experiment verifies that gradient accumulation works as expected. We do so by training a model based on the debug configuration with different micro-batch sizes and the same global batch size locally (Apple M1).

View the experiment: [W&B](https://wandb.ai/mikasenghaas/swarm/workspace?nw=dm6rh6z8t14)

In [None]:
# Keep only the runs logged under this experiment's W&B group
GROUP = "verify/grad-acc"
EXP1_RUNS = [run for run in RUNS if run.group == GROUP]

print(f"✅ Loaded {len(EXP1_RUNS)} runs for experiment {GROUP}")

In [None]:
# Per-run config (keyed by run id), stacked summary rows and stacked histories
runs_config = {run.id: get_config(run) for run in EXP1_RUNS}
runs_summary = pd.concat([get_summary(run) for run in EXP1_RUNS], axis=0)
runs_history = pd.concat([get_history(run) for run in EXP1_RUNS], axis=0)

In [None]:
# Plot current and running-average training loss per step, one line per run
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4), dpi=300)
panels = [
    ("train/loss/current", "Loss by Step"),
    ("train/loss/average", "Loss by Step (Average)"),
]
for a, (metric, title) in zip(ax, panels):
    sns.lineplot(data=runs_history, x="_step", y=metric, hue="run_id", marker="o", ax=a)
    a.set_title(title)
    a.set_xlabel("Step")
    a.set_ylabel("Loss")
plt.show();

Nice, gradient accumulation works. For every step, we are accumulating gradients over various micro-batches, and then we perform the same gradient updates.

In [None]:
# Plot wall-time and average throughput per run; bars are labelled by micro-batch size
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4), dpi=300)
micro_batch_sizes = [runs_config[run_id]['train']['micro_batch_size'] for run_id in runs_summary.index]
panels = [
    ("_runtime", "Wall-Time by Run", "Wall-Time (s)"),
    ("train/throughput/average", "Throughput by Run", "Throughput (T/s)"),
]
for a, (metric, title, ylabel) in zip(ax, panels):
    sns.barplot(data=runs_summary, x=runs_summary.index, y=metric, ax=a)
    a.set_title(title)
    a.set_ylabel(ylabel)
    a.set_xlabel("Micro-Batch Size")
    # One tick per run, relabelled from run id to the run's micro-batch size
    a.set_xticks(range(len(runs_summary)))
    a.set_xticklabels(micro_batch_sizes)
plt.show();

We see that the wall-time decreases with increasing micro-batch size, as expected. This is because we are processing more tokens per second (using the GPU hardware more efficiently).

## Experiment 2: Cosine LR Scheduler

This experiment verifies that the cosine learning rate scheduling works as expected, i.e. the learning rate is 0 at the start, then linearly increases for `train.scheduler.warmup_steps`, after which the cosine schedule kicks in and the learning rate decays according to a cosine annealing pattern until it reaches a minimum learning rate of `train.scheduler.min_lr_factor` times the initial learning rate. The experiment is run with the debug configuration from the script `experiments/verify/scheduler.sh` and run locally on an Apple M1.

View the experiment: [W&B](https://wandb.ai/mikasenghaas/swarm/workspace)

In [None]:
# Keep only the runs logged under this experiment's W&B group
GROUP = "verify/scheduler"
EXP2_RUNS = [run for run in RUNS if run.group == GROUP]

print(f"✅ Loaded {len(EXP2_RUNS)} runs for experiment {GROUP}")

In [None]:
# Per-run config (keyed by run id), stacked summary rows and stacked histories
runs_config = {run.id: get_config(run) for run in EXP2_RUNS}
runs_summary = pd.concat([get_summary(run) for run in EXP2_RUNS], axis=0)
runs_history = pd.concat([get_history(run) for run in EXP2_RUNS], axis=0)

In [None]:
# Plot learning rate patterns
fig, ax = plt.subplots(figsize=(12, 6), dpi=300)
sns.lineplot(data=runs_history, x="_step", y="train/learning_rate/current", hue="run_id", ax=ax)
ax.set_title("Learning Rate by Step (All Runs)")
ax.set_xlabel("Step")
ax.set_ylabel("Learning Rate")

# Replace seaborn's default legend with one that spells out each run's scheduler configuration.
# NOTE(review): colors are looked up by position, which assumes seaborn drew one line per run
# in the same order as `runs_config` - confirm if runs are ever re-ordered or filtered.
lines = ax.get_lines()
legend_elements = []
for i, (run_id, config) in enumerate(runs_config.items()):
    scheduler = config['train']['scheduler']
    label = f"{run_id} (enable={scheduler['enable']}, warmup_steps={scheduler['warmup_steps']}, min_lr_factor={scheduler['min_lr_factor']})"
    legend_elements.append(plt.Line2D([0], [0], color=lines[i].get_color(), lw=2, label=label))

ax.legend(handles=legend_elements, title="Scheduler Config")

plt.tight_layout()
plt.show();

Nice, looks good. The hyperparameters affect the learning rate pattern as expected:

- `enable`: The learning rate is constant at the initial learning rate for `False` and otherwise follows a cosine annealing pattern.
- `warmup_steps`: The learning rate is linearly increased from 0 to the initial learning rate over the first `warmup_steps` steps.
- `min_lr_factor`: After warmup, the learning rate decays to `min_lr_factor` times the initial learning rate by the end of the training.

## Experiment 3: Mixed Precision Training

This experiment verifies that mixed precision training works as expected, i.e. that we can train a model with lower precision matrix multiplication and in `bfloat16` instead of `float32` without losing too much performance. The experiment is run with the debug configuration from the script `experiments/verify/mp.sh` and run locally on an NVIDIA RTX 4090.

View the experiment: [W&B](https://wandb.ai/mikasenghaas/swarm/workspace)

In [None]:
# Keep only the runs logged under this experiment's W&B group
GROUP = "verify/mp"
EXP3_RUNS = [run for run in RUNS if run.group == GROUP]

print(f"✅ Loaded {len(EXP3_RUNS)} runs for experiment {GROUP}")

In [None]:
# Per-run config (keyed by run id), stacked summary rows and stacked histories
runs_config = {run.id: get_config(run) for run in EXP3_RUNS}
runs_summary = pd.concat([get_summary(run) for run in EXP3_RUNS], axis=0)
runs_history = pd.concat([get_history(run) for run in EXP3_RUNS], axis=0)

In [None]:
# Plot loss by step
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 4), dpi=300)
sns.lineplot(data=runs_history, x="_step", y="train/loss/current", hue="run_id", marker="o", ax=ax[0])
sns.lineplot(data=runs_history, x="_step", y="train/loss/average", hue="run_id", marker="o", ax=ax[1])

# Create a custom legend that shows each run's precision settings.
# NOTE(review): colors are looked up by position, which assumes seaborn drew one line per run
# in order of first appearance in `runs_history` - confirm if the plotting call changes.
run_ids = runs_history.index.unique()  # computed once instead of once per loop iteration
lines = ax[0].get_lines()
legend_elements = []
for i, run_id in enumerate(run_ids):
    config = runs_config[run_id]
    precision = config['train']['precision']
    autocast = config['train']['autocast']
    legend_elements.append(plt.Line2D([0], [0], color=lines[i].get_color(), lw=2, label=f"{run_id} (precision={precision}, autocast={autocast})"))

# Remove the per-axes legends in favour of one shared legend
for a in ax:
    a.get_legend().remove()

# Add shared legend
fig.legend(handles=legend_elements, title="Run Configuration", loc="upper right", bbox_to_anchor=(.99, .96))

# Titles and axis labels are set before `tight_layout` so the layout accounts for them
ax[0].set_title('Loss')
ax[1].set_title('Average Loss')
for a in ax:
    a.set_xlabel("Step")
    a.set_ylabel("Loss")

# Adjust layout to make room for the legend
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

plt.show()

In [None]:
# Plot throughput by step
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 4), dpi=300)
sns.lineplot(data=runs_history, x="_step", y="train/throughput/current", hue="run_id", ax=ax[0])
sns.lineplot(data=runs_history, x="_step", y="train/throughput/average", hue="run_id", ax=ax[1])

# Create a custom legend that shows each run's precision settings.
# NOTE(review): colors are looked up by position, which assumes seaborn drew one line per run
# in order of first appearance in `runs_history` - confirm if the plotting call changes.
run_ids = runs_history.index.unique()  # computed once instead of once per loop iteration
lines = ax[0].get_lines()
legend_elements = []
for i, run_id in enumerate(run_ids):
    config = runs_config[run_id]
    precision = config['train']['precision']
    autocast = config['train']['autocast']
    legend_elements.append(plt.Line2D([0], [0], color=lines[i].get_color(), lw=2, label=f"{run_id} (precision={precision}, autocast={autocast})"))

# Remove the per-axes legends in favour of one shared legend
for a in ax:
    a.get_legend().remove()

# Add shared legend
fig.legend(handles=legend_elements, title="Run Configuration", loc="lower right", bbox_to_anchor=(0.99, .2))

# Titles and axis labels are set before `tight_layout` so the layout accounts for them
ax[0].set_title('Throughput')
ax[1].set_title('Average Throughput')
for a in ax:
    a.set_xlabel("Step")
    a.set_ylabel("Throughput (T/s)")

# Adjust layout to make room for the legend
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

plt.show()

The optimisations are equivalent, which is weird. Given that we are varying the precision of the matrix multiplication, activations and gradients, there should be a difference. For now, the throughput is high enough, so we will only come back to this if the performance is an issue. At this point, I delete the `autocast` flag from the codebase, and re-run the `experiments/verify/mp.sh` script only for different values of internal precision.

In [None]:
# Keep only the runs logged under this experiment's W&B group
GROUP = "verify/mp2"
EXP4_RUNS = [run for run in RUNS if run.group == GROUP]

print(f"✅ Loaded {len(EXP4_RUNS)} runs for experiment {GROUP}")

In [None]:
# Per-run config (keyed by run id), stacked summary rows and stacked histories
runs_config = {run.id: get_config(run) for run in EXP4_RUNS}
runs_summary = pd.concat([get_summary(run) for run in EXP4_RUNS], axis=0)
runs_history = pd.concat([get_history(run) for run in EXP4_RUNS], axis=0)

In [None]:
# Plot throughput by step
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 4), dpi=300)
sns.lineplot(data=runs_history, x="_step", y="train/throughput/current", hue="run_id", ax=ax[0])
sns.lineplot(data=runs_history, x="_step", y="train/throughput/average", hue="run_id", ax=ax[1])

# Create a custom legend that shows each run's matmul precision setting.
# NOTE(review): colors are looked up by position, which assumes seaborn drew one line per run
# in order of first appearance in `runs_history` - confirm if the plotting call changes.
run_ids = runs_history.index.unique()  # computed once instead of once per loop iteration
lines = ax[0].get_lines()
legend_elements = []
for i, run_id in enumerate(run_ids):
    config = runs_config[run_id]
    precision = config['train']['precision']
    legend_elements.append(plt.Line2D([0], [0], color=lines[i].get_color(), lw=2, label=f"{run_id} (precision={precision})"))

# Remove the per-axes legends in favour of one shared legend
for a in ax:
    a.get_legend().remove()

# Add shared legend
fig.legend(handles=legend_elements, title="Run Configuration", loc="lower right", bbox_to_anchor=(0.99, .2))

# Titles and axis labels are set before `tight_layout` so the layout accounts for them
ax[0].set_title('Throughput')
ax[1].set_title('Average Throughput')
for a in ax:
    a.set_xlabel("Step")
    a.set_ylabel("Throughput (T/s)")

# Adjust layout to make room for the legend
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

plt.show()

Even for the `precision` flag set via `torch.set_float32_matmul_precision(precision)`, the performance is the same. Weird...

## Experiment 4: GPU Benchmark

In this experiment, we are benchmarking the performance of various GPUs on the [Prime Intellect Compute](https://api.primeintellect.ai) platform. Namely, we are comparing the following GPUs:

- Apple ARM M1 (8GB)
- NVIDIA RTX 4090 (24GB)
- NVIDIA A100 (40GB)
- NVIDIA A100 (80GB)
- NVIDIA H100 (80GB)

We are using the script `experiments/verify/perf.sh` to run the experiment. It uses the configuration from `configs/baseline/perf.yaml` and runs the `src/train/baseline.py` script. It trains the 14M Llama model on the entire train split of WikiText 2 (17.8M tokens), no intermediate evaluation but final test on WikiText 2 (2.2M tokens). We do not use learning rate scheduling and test for various micro batch sizes, starting from 1 up to 128 (or when reaching OOM).

View the experiment: [W&B](https://wandb.ai/mikasenghaas/swarm/workspace?nw=5p39zizreht)

In [None]:
# Keep the finished runs of this experiment's W&B group that are not tagged "hidden"
GROUP = "verify/perf"
EXP5_RUNS = [
    run
    for run in RUNS
    if run.group == GROUP
    if "hidden" not in run.tags
    if run.state == "finished"
]

print(f"✅ Loaded {len(EXP5_RUNS)} runs for experiment {GROUP}")

In [None]:
# Per-run config (keyed by run id), stacked summary rows and stacked histories
runs_config = {run.id: get_config(run) for run in EXP5_RUNS}
runs_summary = pd.concat([get_summary(run) for run in EXP5_RUNS], axis=0)
runs_history = pd.concat([get_history(run) for run in EXP5_RUNS], axis=0)

In [None]:
# Construct performance dataframe: one row per run (summary metrics plus derived columns)
performance = runs_summary.copy()

# Add GPU name, looked up from the run's config
performance["gpu"] = performance.index.map(lambda run_id: runs_config[run_id]["gpu"]["name"])

# Add micro-batch size as a string, matching the string labels used to order the plots
performance["micro_batch_size"] = performance.index.map(lambda run_id: str(runs_config[run_id]["train"]["micro_batch_size"]))

# Add peak throughput: one grouped pass over the history instead of
# re-filtering the full history frame once per run.
peak_throughput = runs_history.groupby(level=0)["train/throughput/current"].max()
performance["train/throughput/max"] = performance.index.map(peak_throughput)

performance.head()

In [None]:
# Box plot of the runs' average throughput per GPU, ordered by descending mean
fig, ax = plt.subplots(figsize=(16, 4), dpi=300)
throughput_by_gpu = performance.groupby("gpu")["train/throughput/average"]
stats = throughput_by_gpu.describe().sort_values(by="mean", ascending=False)
sns.boxplot(data=performance, x="gpu", y="train/throughput/average", order=stats.index, ax=ax)
ax.set(title="Average Throughput per GPU", xlabel="GPU", ylabel="Average Throughput (kT/s)")
# Tick values are divided by 1000 to match the kT/s axis label
ax.yaxis.set_major_formatter(lambda x, p: f'{x/1000:.0f}')
plt.xticks(rotation=10, ha='center')
plt.show();

stats

In [None]:
# Plot the average throughput per micro-batch size and GPU
fig, ax = plt.subplots(nrows=2, figsize=(16, 8), dpi=300)
fig.suptitle("Average Throughput per GPU and Micro-Batch Size")
stats = performance.groupby(["gpu", "micro_batch_size"])["train/throughput/average"].mean()
# GPUs ordered from lowest to highest mean throughput
gpu_order = performance.groupby("gpu")["train/throughput/average"].mean().sort_values(ascending=True).index
# NOTE(review): covers micro-batch sizes 1..64 only; runs with a larger
# micro-batch size are left out of both panels - confirm this is intended.
batch_size_order = [str(2**i) for i in range(7)]

# Top panel: grouped by GPU, one bar per micro-batch size.
# Palettes are sized from the number of hue levels rather than hard-coded,
# so they stay in sync when GPUs or batch sizes are added or removed.
colors = sns.color_palette("Blues", n_colors=len(batch_size_order))
sns.barplot(data=performance, x="gpu", y="train/throughput/average", hue="micro_batch_size", order=gpu_order, hue_order=batch_size_order, ax=ax[0], gap=0.2, palette=colors)
ax[0].set_xlabel("GPU")
ax[0].set_ylabel("Average Throughput (kT/s)")
ax[0].yaxis.set_major_formatter(lambda x, p: f'{x/1000:.0f}')
ax[0].legend(title="Micro-Batch Size")

# Bottom panel: grouped by micro-batch size, one bar per GPU.
colors = sns.color_palette("Blues", n_colors=len(gpu_order))
sns.barplot(data=performance, x="micro_batch_size", y="train/throughput/average", hue="gpu", order=batch_size_order, hue_order=gpu_order, ax=ax[1], gap=0.2, palette=colors)
ax[1].set_xlabel("Micro-Batch Size")
ax[1].set_ylabel("Average Throughput (kT/s)")
ax[1].yaxis.set_major_formatter(lambda x, p: f'{x/1000:.0f}')
ax[1].legend(title="GPU");
plt.tight_layout()
plt.show();