# Utility notebook to get the storage used per project and per user

<a target="_blank" href="https://colab.research.google.com/github/neptune-ai/examples/blob/main/utils/Get_storage_used_by_project_user.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a><a target="_blank" href="https://github.com/neptune-ai/examples/blob/main/utils/Get_storage_used_by_project_user.ipynb">
  <img alt="Open in GitHub" src="https://img.shields.io/badge/Open_in_GitHub-blue?logo=github&labelColor=black">
</a>

In [None]:
# Install the Neptune client and tqdm (progress bars) into the environment of the running kernel.
# NOTE(review): pandas is imported in the next cell but not installed here; presumably it is
# already available (pulled in by neptune or preinstalled on Colab) - confirm.
%pip install neptune tqdm

In [None]:
import os
import neptune
import pandas as pd
from neptune import management
from tqdm.auto import tqdm

## Set Neptune API token

In [None]:
# Prompt for the API token only when the environment does not already provide one.
if os.environ.get("NEPTUNE_API_TOKEN") is None:
    from getpass import getpass

    # getpass reads the value without echoing it, so the token does not end up in the cell output.
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Enter the Neptune API token you wish to use: ")

## Enter workspace to scan

In [None]:
# Name of the workspace whose projects will be scanned.
# The value is compared exactly against the "<workspace>/..." prefix of each project name,
# so accidental leading/trailing whitespace is stripped to avoid a silently empty result.
workspace = input("Enter workspace name:").strip()

## List all projects in the workspace
This lists all the projects in the workspace that are accessible with the `NEPTUNE_API_TOKEN` set above.

In [None]:
# Keep only the projects whose name starts with "<workspace>/": everything before the
# first "/" in the project name is compared against the workspace entered above.
projects = [
    name for name in management.get_project_list() if name.partition("/")[0] == workspace
]
# Display the selected projects as the cell output.
projects

## Get the runs, models, and model versions tables for all the projects

In [None]:
# Collect the per-project tables in separate lists and concatenate each list once after
# the loop; calling pd.concat inside the loop re-copies all accumulated rows every time.
runs_frames = []
models_frames = []
model_versions_frames = []

for project in tqdm(projects):
    # Projects are opened in read-only mode; this cell only fetches tables.
    with neptune.init_project(project=project, mode="read-only") as proj:
        runs_df = proj.fetch_runs_table(
            columns=["sys/owner", "sys/size"],
        ).to_pandas()

        models_df = proj.fetch_models_table(
            columns=["sys/owner", "sys/size"],
        ).to_pandas()

    if not runs_df.empty:
        runs_df = runs_df.sort_values(by=["sys/size"], ascending=False)
        runs_df["project"] = project
        runs_frames.append(runs_df)

    if models_df.empty:
        # Model versions are only looked up for projects that have models.
        continue

    models_df = models_df.sort_values(by=["sys/size"], ascending=False)
    models_df["project"] = project
    models_frames.append(models_df)

    # Model versions are fetched per model, using the "sys/id" column of the models table.
    for model_id in tqdm(models_df["sys/id"]):
        with neptune.init_model(project=project, with_id=model_id, mode="read-only") as model:
            model_versions_df = model.fetch_model_versions_table(
                columns=["sys/owner", "sys/size"],
            ).to_pandas()

        if model_versions_df.empty:
            continue

        model_versions_df = model_versions_df.sort_values(by=["sys/size"], ascending=False)
        # Tag only this model's versions with the current project. Assigning the column on
        # the accumulated table instead would overwrite the project of every version
        # collected so far, attributing all of them to the last project scanned.
        model_versions_df["project"] = project
        model_versions_frames.append(model_versions_df)

# Each result is a distinct DataFrame object (no shared alias between the three names),
# and stays an empty DataFrame when nothing of that kind was found.
all_runs_df = pd.concat(runs_frames, ignore_index=True) if runs_frames else pd.DataFrame()
all_models_df = pd.concat(models_frames, ignore_index=True) if models_frames else pd.DataFrame()
all_model_versions_df = (
    pd.concat(model_versions_frames, ignore_index=True)
    if model_versions_frames
    else pd.DataFrame()
)

In [None]:
# Tag every table with the kind of object it holds before merging them.
all_runs_df["type"] = "run"
all_models_df["type"] = "model"
all_model_versions_df["type"] = "model_version"

# Empty tables contribute no rows, so they are left out of the concatenation.
non_empty_frames = [
    df for df in (all_runs_df, all_models_df, all_model_versions_df) if not df.empty
]

if non_empty_frames:
    all_objects_df = pd.concat(non_empty_frames, ignore_index=True)
else:
    # Nothing was found: build an empty table that still has the columns used below and
    # by the per-project / per-user summaries, so those steps do not fail with a KeyError.
    all_objects_df = pd.DataFrame(
        {
            "sys/owner": pd.Series(dtype="object"),
            "sys/size": pd.Series(dtype="float64"),
            "project": pd.Series(dtype="object"),
            "type": pd.Series(dtype="object"),
        }
    )

# Largest objects first, with a fresh 0..n-1 index.
all_objects_df = all_objects_df.sort_values(by=["sys/size"], ascending=False)
all_objects_df = all_objects_df.reset_index(drop=True)

# Divide by 1024**3 to get the "size_gb" column, which replaces the raw "sys/size" column.
all_objects_df["size_gb"] = all_objects_df["sys/size"] / (1024**3)
all_objects_df = all_objects_df.drop(columns="sys/size")
all_objects_df

## Storage per project

In [None]:
# Sum size_gb per project and show the projects largest first, with a fresh 0..n-1 index.
(
    all_objects_df.groupby("project", as_index=False)[["size_gb"]]
    .sum()
    .sort_values(by="size_gb", ascending=False, ignore_index=True)
)

## Storage per user

In [None]:
# Sum size_gb per owner ("sys/owner") and show the owners largest first, with a fresh 0..n-1 index.
(
    all_objects_df.groupby("sys/owner", as_index=False)[["size_gb"]]
    .sum()
    .sort_values(by="size_gb", ascending=False, ignore_index=True)
)