In [1]:
import yaml
import os

# Create an anemoi-dataset

Some information about the dataset
- content
- where it lives

## 1. Configure the creation of an anemoi-dataset
An anemoi-dataset is created from the command line by running the `anemoi-datasets create` command with a configuration file and the path of the output dataset.  
For example:
```bash
anemoi-datasets create config.yaml name-of-output-dataset.zarr
```

In [2]:
# Empty config dictionary; the sections below fill it in key by key.
config = {}

### 1.1 General information

In [3]:
# anemoi-datasets names follow this convention:
#   purpose-content-source-resolution-start-year-end-year-frequency-version[-extra-str]
# NOTE(review): the name below has no frequency component between the end year
# and the version — confirm whether that is intentional.
name = "keisler-era5-gs-weatherbench2-1p0d-1979-1979-v0"

# Record the general, top-level metadata in the config.
config.update(
    {
        "name": name,
        "resolution": "1p0 degree",
        "description": "Dataset as used in the Keisler 2020 pre-print",
    }
)

### 1.2 Temporal information


In [4]:
# Temporal coverage of the dataset: first date, last date and the step between dates.
dates = dict(
    start="1979-01-01T00:00:00",
    end="1979-01-01T23:00:00",
    frequency="3h",
)

# Register the temporal settings under the "dates" key of the config.
config["dates"] = dates

### 1.3 The input variables
Each type of input is defined as a dictionary whose key names the source type of the input (see the docs) and whose value is a dictionary of settings specific to that source.

In [5]:
# Both zarr-backed sources below read from this same store.
WEATHERBENCH2_ERA5_URL = "gs://weatherbench2/datasets/era5/1959-2022-1h-360x181_equiangular_with_poles_conservative.zarr"

# Upper-air variables, requested on the listed levels through the xarray-zarr source.
# Each source gets its own `options` dict (no shared objects), so the YAML dump
# contains no anchors/aliases.
upper_air_variables = {
    "xarray-zarr": dict(
        url=WEATHERBENCH2_ERA5_URL,
        param=[
            "temperature",
            "u_component_of_wind",
            "geopotential",
            "vertical_velocity",
            "specific_humidity",
        ],
        level=[50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 850, 925, 1000],
        options={"storage_options": {"token": "anon"}},
    )
}

# Certain variables are constant throughout the original dataset, but for
# ML training it is easier to repeat these constant fields along the temporal
# dimension so they can be loaded like any other variable.
# The repeated-dates source (in "constant" mode) is used for this.
constant_source = {
    "xarray-zarr": dict(
        url=WEATHERBENCH2_ERA5_URL,
        param=["geopotential_at_surface", "land_sea_mask"],
        options={"storage_options": {"token": "anon"}},
    )
}
constant_variables = {
    "repeated-dates": dict(mode="constant", source=constant_source)
}

# Forcing fields. The template string appears to reference entry 1 of the join
# list below (the xarray-zarr source), so the order of that list matters.
forcings = {
    "forcings": dict(
        template="${input.join.1.xarray-zarr}",
        param=[
            "cos_latitude",
            "cos_longitude",
            "sin_latitude",
            "sin_longitude",
            "julian_day",
            "insolation",
        ],
    )
}

# All sources are combined by listing them under the "join" key.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook.
input = {"join": [constant_variables, upper_air_variables, forcings]}

config["input"] = input

### 1.4 Build options
Settings controlling the building of the anemoi-dataset are set in the `build` part of the config.


In [6]:
# Options for the `build` section of the config.
build = dict(group_by="daily", variable_naming="param_levelist")

# Register the build options under the "build" key of the config.
config["build"] = build

### 1.5 Final result
Let's inspect the final config and save the dictionary as a YAML file.

In [7]:
# Serialise the assembled config to block-style YAML and display it.
config_yaml = yaml.dump(config, default_flow_style=False)
print(config_yaml)

build:
  group_by: daily
  variable_naming: param_levelist
dates:
  end: '1979-01-01T23:00:00'
  frequency: 3h
  start: '1979-01-01T00:00:00'
description: Dataset as used in the Keisler 2020 pre-print
input:
  join:
  - repeated-dates:
      mode: constant
      source:
        xarray-zarr:
          options:
            storage_options:
              token: anon
          param:
          - geopotential_at_surface
          - land_sea_mask
          url: gs://weatherbench2/datasets/era5/1959-2022-1h-360x181_equiangular_with_poles_conservative.zarr
  - xarray-zarr:
      level:
      - 50
      - 100
      - 150
      - 200
      - 250
      - 300
      - 400
      - 500
      - 600
      - 700
      - 850
      - 925
      - 1000
      options:
        storage_options:
          token: anon
      param:
      - temperature
      - u_component_of_wind
      - geopotential
      - vertical_velocity
      - specific_humidity
      url: gs://weatherbench2/datasets/era5/1959-2022-1h-360x18

In [8]:
# Directory and file the YAML config is written to.
config_dir = "./configs"
config_path = os.path.join(config_dir, f"{name}.yaml")

# open(..., "w") does not create missing parent directories, so make sure the
# config directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(config_dir, exist_ok=True)
with open(config_path, "w") as file:
    yaml.dump(config, file, default_flow_style=False)

# Location of the dataset created from the config; the parent directory is
# created up front as well (no-op if it already exists).
dataset_dir = "./datasets"
os.makedirs(dataset_dir, exist_ok=True)
dataset_path = os.path.join(dataset_dir, f"{name}.zarr")

In [9]:
# Run the anemoi-datasets CLI on the saved config to build the zarr dataset.
# --overwrite presumably replaces an existing output at dataset_path — TODO confirm in the CLI docs.
!anemoi-datasets create --overwrite {config_path} {dataset_path}

2025-05-22 14:05:38 INFO 🎬 Task init((),{}) starting
2025-05-22 14:05:39 INFO Setting flatten_grid=True in config
2025-05-22 14:05:39 INFO Setting ensemble_dimension=2 in config
2025-05-22 14:05:39 INFO Setting flatten_grid=True in config
2025-05-22 14:05:39 INFO Setting ensemble_dimension=2 in config
2025-05-22 14:05:39 INFO {'end': '1979-01-01T23:00:00', 'frequency': '3h', 'start': '1979-01-01T00:00:00', 'group_by': 'daily'}
2025-05-22 14:05:39 INFO Groups(dates=1,StartEndDates(1979-01-01 00:00:00..1979-01-01 23:00:00 every 3:00:00))
2025-05-22 14:05:39 INFO Groups: Groups(dates=1,StartEndDates(1979-01-01 00:00:00..1979-01-01 23:00:00 every 3:00:00))
2025-05-22 14:06:45 INFO Minimal input for 'init' step (using only the first date) : GroupOfDates(dates=['1979-01-01T00:00:00'])
2025-05-22 14:06:45 INFO JoinResult: 1 dates (1979-01-01T00:00)
  JoinResult: 1 dates (1979-01-01T00:00)
    DateMapperResult: 0 dates ()
      
  xarray-zarr(GroupOfDates(dates=['1979-01-01T00:00:00']))
  forc