# Notebook 3 - Get usage stats for a specific period

This example shows how to compute usage statistics over a specific period for a list of clusters.

Let's first configure the `SARC_CONFIG` variable, as in notebook 1:

In [1]:
import os

# Set the SARC_CONFIG environment variable to the client configuration path,
# then echo it back so the value in effect is visible in the cell output.
os.environ["SARC_CONFIG"] = "../../config/sarc-client.json"
print(os.environ["SARC_CONFIG"])

../../config/sarc-client.json


This example uses pandas, which may print many warnings. Let's suppress them to get more readable output:

In [2]:
import warnings

# Silence every warning category for the rest of the session so that the
# cell outputs below stay readable.
warnings.simplefilter("ignore")

Next comes the main code, which is inspired by the script `examples/usage_stats.py` in the repository root.

To define the period, we will set a `start` time and an `end` time using the `datetime` class.

Note that this code may take time to run if clusters contain a lot of jobs.

In [3]:
import os
from datetime import datetime, timedelta

import pandas as pd

from sarc.config import MTL, config
from sarc.jobs import get_jobs

# Names of the clusters to include in the statistics.
# Two clusters are enough for this example.
clusters = [
    "mila",
    "narval",
]

# Job fields kept when a job is converted to a dict (everything else is dropped).
include_fields = {
    # who / where
    "cluster_name",
    "user",
    # job identity
    "job_id",
    "array_job_id",
    "task_id",
    # timing
    "start_time",
    "end_time",
    "elapsed_time",
    # scheduling
    "qos",
    "partition",
}


def get_jobs_dataframe(start, end) -> pd.DataFrame:
    """Collect the jobs of every cluster in ``clusters`` into one DataFrame.

    For each job returned by ``get_jobs`` the start time is recomputed as
    ``end_time - elapsed_time``, the job is clipped to ``[start, end]`` and
    its ``elapsed_time`` is updated accordingly. Jobs whose elapsed time is
    not strictly positive (before or after clipping) are skipped.

    Args:
        start: Beginning of the period of interest. It is compared against
            job times, so it presumably must be a timezone-aware ``datetime``
            (the notebook passes one built with ``tzinfo=MTL``).
        end: End of the period of interest, same type as ``start``.

    Returns:
        A DataFrame with one row per kept job. Columns are the fields listed
        in ``include_fields`` plus the fields of ``job.allocated``, flattened
        to the top level. NaNs are replaced by 0 and the index is a fresh
        0..n-1 range. An empty DataFrame is returned when no job is kept.
    """
    # One DataFrame per cluster; they are concatenated once at the end rather
    # than growing a DataFrame inside the loop.
    frames = []

    # Fetch all jobs from the clusters
    for c, cluster in enumerate(clusters):
        print("Getting job for cluster", cluster, f"({c + 1} / {len(clusters)})")
        dicts = []

        # Precompute the total number of jobs to display a progress bar
        # get_jobs is a generator so we don't get the total unless we pre-fetch all jobs
        # beforehand.
        total = config().mongo.database_instance.jobs.count_documents(
            {
                "cluster_name": cluster,
                "end_time": {"$gte": start},
                "start_time": {"$lt": end},
            }
        )

        for i, job in enumerate(get_jobs(cluster=cluster, start=start, end=end)):
            if (i + 1) % 100000 == 0:
                print(f"[{cluster}] Getting jobs {i + 1} / {total}")

            if job.elapsed_time <= 0:
                continue

            # A job without an end time is treated as ending now.
            if job.end_time is None:
                job.end_time = datetime.now(tz=MTL)

            # For some reason start time is not reliable, often equal to submit time,
            # so we infer it based on end_time and elapsed_time.
            job.start_time = job.end_time - timedelta(seconds=job.elapsed_time)

            # Clip the job to the time range we are interested in.
            if job.start_time < start:
                job.start_time = start
            if job.end_time > end:
                job.end_time = end
            job.elapsed_time = (job.end_time - job.start_time).total_seconds()

            # We only care about jobs that actually ran.
            if job.elapsed_time <= 0:
                continue

            # Create a small dict with the fields we need
            job_dict = job.dict(include=include_fields)
            # Add the allocation fields directly to the dict instead of nested as in the original job dict.
            job_dict.update(job.allocated.dict())

            dicts.append(job_dict)

        print(f"[{cluster}] Got jobs {total} / {total}")

        # Skip clusters without any kept job: a column-less frame adds nothing.
        if dicts:
            frames.append(pd.DataFrame(dicts))

    if not frames:
        return pd.DataFrame()

    # Concatenate once with a fresh index so row labels are unique, and only
    # then replace NaNs by 0: filling after the concat also covers NaNs that
    # appear when clusters do not share exactly the same columns.
    return pd.concat(frames, ignore_index=True).fillna(0)


# Period of interest: from 2022-01-01 to 2023-01-01 (Montreal time).
start = datetime(year=2022, month=1, day=1, tzinfo=MTL)
end = datetime(year=2023, month=1, day=1, tzinfo=MTL)
df = get_jobs_dataframe(start=start, end=end)

# Compute the billed and used resource time in seconds
df["billed"] = df["elapsed_time"] * df["billing"]
df["used"] = df["elapsed_time"] * df["gres_gpu"]

# Split by cluster. Take explicit copies: a boolean-mask selection may be a
# view-like slice of `df`, and assigning columns to it later would raise
# SettingWithCopyWarning (or silently not behave as expected).
df_mila = df[df["cluster_name"] == "mila"].copy()
df_drac = df[df["cluster_name"] != "mila"].copy()

print("Number of jobs:")
print("Mila-cluster", df_mila.shape[0])
print("DRAC clusters", df_drac.shape[0])

# `used` is in GPU-seconds; divide by 3600 to report GPU-hours.
print("GPU hours:")
print("Mila-cluster", df_mila["used"].sum() / 3600)
print("DRAC clusters", df_drac["used"].sum() / 3600)


def compute_gpu_hours_per_duration(df):
    """Return the share of GPU time spent in each job-duration bin.

    Jobs are binned on ``elapsed_time`` (in seconds) into half-open ranges
    ``[min, max)``. The ``used`` values of the jobs in each bin are summed and
    divided by the total of ``used``.

    Args:
        df: DataFrame with ``elapsed_time`` and ``used`` columns. It is only
            read; no column is added to it.

    Returns:
        A Series indexed by bin label holding each bin's fraction of the
        total ``used``. Values are NaN when the total is zero.
    """
    categories = {
        "< 1hour": (0, 3600),
        "1-24 hours": (3600, 24 * 3600),
        "1-28 days": (24 * 3600, 28 * 24 * 3600),
        ">= 28 days": (28 * 24 * 3600, None),
    }
    elapsed = df["elapsed_time"]
    used = df["used"]

    totals = {}
    for key, (min_time, max_time) in categories.items():
        in_bin = elapsed >= min_time
        # The last bin has no upper bound.
        if max_time is not None:
            in_bin = in_bin & (elapsed < max_time)
        totals[key] = used[in_bin].sum()

    return pd.Series(totals, dtype="float64") / used.sum()


# Report, for each cluster group, how GPU time is distributed over job durations.
print("GPU hours per job duration", "Mila-cluster:", sep="\n")
print(compute_gpu_hours_per_duration(df_mila))
print("DRAC clusters:")
print(compute_gpu_hours_per_duration(df_drac))


def compute_jobs_per_gpu_hours(df):
    """Return the share of GPU time contributed by jobs of each GPU-time size.

    Jobs are binned on their own ``used`` value (GPU-seconds) into half-open
    ranges ``[min, max)``. The ``used`` values of the jobs in each bin are
    summed and divided by the total of ``used``.

    Args:
        df: DataFrame with a ``used`` column. It is only read; no column is
            added to it.

    Returns:
        A Series indexed by bin label holding each bin's fraction of the
        total ``used``. Values are NaN when the total is zero.
    """
    categories = {
        "< 1 GPUhour": (0, 3600),
        "1-24 GPUhours": (3600, 24 * 3600),
        "1-28 GPUdays": (24 * 3600, 28 * 24 * 3600),
        ">= 28 GPUdays": (28 * 24 * 3600, None),
    }
    used = df["used"]

    totals = {}
    for key, (min_used, max_used) in categories.items():
        in_bin = used >= min_used
        # The last bin has no upper bound.
        if max_used is not None:
            in_bin = in_bin & (used < max_used)
        totals[key] = used[in_bin].sum()

    return pd.Series(totals, dtype="float64") / used.sum()


# Report, for each cluster group, how GPU time is distributed over per-job GPU time.
print("Binned GPU hours", "Mila-cluster:", sep="\n")
print(compute_jobs_per_gpu_hours(df_mila))
print("DRAC clusters:")
print(compute_jobs_per_gpu_hours(df_drac))


def compute_gpu_hours_per_gpu_count(df):
    """Return the share of GPU time used by jobs of each GPU-count bin.

    Jobs are binned on ``gres_gpu`` into half-open ranges ``[min, max)``. The
    ``used`` values of the jobs in each bin are summed and divided by the
    total of ``used``. Jobs whose ``gres_gpu`` is below 1 fall in no bin.

    Args:
        df: DataFrame with ``gres_gpu`` and ``used`` columns. It is only
            read; no column is added to it.

    Returns:
        A Series indexed by bin label holding each bin's fraction of the
        total ``used``. Values are NaN when the total is zero.
    """
    categories = {
        "1 GPU": (1, 2),
        "2-4 GPUs": (2, 5),
        "5-8 GPUs": (5, 9),
        "9-32 GPUs": (9, 33),
        # Label fixed: the bin counts GPUs, it was mislabelled ">= 33 PUdays".
        ">= 33 GPUs": (33, None),
    }
    gpu_count = df["gres_gpu"]
    used = df["used"]

    totals = {}
    for key, (min_gpus, max_gpus) in categories.items():
        in_bin = gpu_count >= min_gpus
        # The last bin has no upper bound.
        if max_gpus is not None:
            in_bin = in_bin & (gpu_count < max_gpus)
        totals[key] = used[in_bin].sum()

    return pd.Series(totals, dtype="float64") / used.sum()


# Report, for each cluster group, how GPU time is distributed over per-job GPU counts.
print("GPU hours per gpu job count", "Mila-cluster:", sep="\n")
print(compute_gpu_hours_per_gpu_count(df_mila))
print("DRAC clusters:")
print(compute_gpu_hours_per_gpu_count(df_drac))

Getting job for cluster mila (1 / 2)
[mila] Getting jobs 100000 / 929234
[mila] Getting jobs 200000 / 929234
[mila] Getting jobs 300000 / 929234
[mila] Getting jobs 400000 / 929234
[mila] Getting jobs 500000 / 929234
[mila] Getting jobs 600000 / 929234
[mila] Getting jobs 700000 / 929234
[mila] Getting jobs 800000 / 929234
[mila] Getting jobs 900000 / 929234
[mila] Got jobs 929234 / 929234
Getting job for cluster narval (2 / 2)
[narval] Getting jobs 100000 / 652510
[narval] Getting jobs 200000 / 652510
[narval] Getting jobs 300000 / 652510
[narval] Getting jobs 400000 / 652510
[narval] Getting jobs 500000 / 652510
[narval] Getting jobs 600000 / 652510
[narval] Got jobs 652510 / 652510
Number of jobs:
Mila-cluster 743251
DRAC clusters 562930
GPU hours:
Mila-cluster 3054313.8527777777
DRAC clusters 1528260.2416666667
GPU hours per job duration
Mila-cluster:
< 1hour       0.029210
1-24 hours    0.461250
1-28 days     0.443574
>= 28 days    0.065966
dtype: float64
DRAC clusters:
< 1hour   