# ERASTAR : Génération du datacube

<div class="alert alert-danger">
Attention : ce tutoriel repose sur l'ancienne version de Pangeo-Forge ; une mise à jour majeure de la bibliothèque rend le code obsolète à partir de la version 0.10.
</div>

Utiliser l'environnement: *pangeo-forge-recipes-0.9-env*

In [2]:
import os
import intake
import fsspec
import logging
import xarray as xr
import pandas as pd
from glob import glob
from pathlib import Path

from pangeo_forge_recipes.patterns import FilePattern, ConcatDim
from pangeo_forge_recipes.recipes import HDFReferenceRecipe
from pangeo_forge_recipes.storage import (
    CacheFSSpecTarget,
    FSSpecTarget,
    MetadataTarget,
    StorageConfig,
)

In [3]:
# Path to the ERASTAR data
# `fs` is the fsspec handle for the local filesystem ('file' protocol).
fs = fsspec.filesystem('file')
# Root directory of the ERA* v2.0 product tree.
data_root = Path("/home/datawork-cersat-public/provider/woc/products/theme1/ocean_winds/woc-l4-se-erastar-h/v2.0/")

## Découverte des jeux de données

In [4]:
# Earliest and latest year available in the ERA* dataset
erastar_folder = fs.ls(data_root)
# Keep directories only; the last path component is parsed as the year.
year_dirs = (entry for entry in erastar_folder if fs.isdir(entry))
years = sorted(int(entry.rsplit('/', 1)[-1]) for entry in year_dirs)
min_year = min(years)
max_year = max(years)
display(min_year, max_year)

2009

2020

In [5]:
# First day-of-year folder present in the earliest year
min_year_folder = fs.ls(data_root / str(min_year))
# Keep directories only; the last path component is parsed as the day number.
day_dirs = (entry for entry in min_year_folder if fs.isdir(entry))
days = sorted(int(entry.rsplit('/', 1)[-1]) for entry in day_dirs)
min_day = days[0]
display(min_year, min_day)

2009

365

In [6]:
# Last day-of-year folder present in the latest year
max_year_folder = fs.ls(data_root / str(max_year))
# Keep directories only; the last path component is parsed as the day number.
day_dirs = (entry for entry in max_year_folder if fs.isdir(entry))
days = sorted(int(entry.rsplit('/', 1)[-1]) for entry in day_dirs)
max_day = days[-1]
display(max_year, max_day)

2020

366

Les données vont du 31/12/2009 au 31/12/2020.

In [7]:
# NetCDF files of day-of-year folders 001 to 009 of 2020
files = fs.glob(f"{data_root}/2020/00[1-9]/*.nc")
# Show the base names of the first ten matches
list(map(os.path.basename, files[:10]))

['2020010100-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T06_18-v2.0-fv1.0.nc',
 '2020010100-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_06-v2.0-fv1.0.nc',
 '2020010101-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_07-v2.0-fv1.0.nc',
 '2020010102-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_08-v2.0-fv1.0.nc',
 '2020010103-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_09-v2.0-fv1.0.nc',
 '2020010104-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_10-v2.0-fv1.0.nc',
 '2020010105-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_11-v2.0-fv1.0.nc',
 '2020010106-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_12-v2.0-fv1.0.nc',
 '2020010107-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_13-v2.0-fv1.0.nc',
 '2020010108-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T18_14-v2.0-fv1.0.nc']

On peut voir que le chemin se compose de :
<ul>
    <li style='color:blue;'>une racine constante</li>
    <li style='color:red;'>un dossier pour l'année (4-digit)</li>
    <li style='color:orange;'>un dossier pour le jour de l'année (3-digit)</li>
    <li style='color:green;'>un nom de fichier horaire finissant par .nc</li>
</ul>

par exemple : <p><span style="color: blue;">*/home/datawork-cersat-public/provider/woc/products/theme1/ocean_winds/woc-l4-se-erastar-h/v2.0/</span><span style="color: red;">2020/</span><span style="color: orange;">001/</span><span style="color: green;">2020010100-WOC-L4-STRESS_ERAstar_GLO_0125_1H_R20191231T06_18-v2.0-fv1.0.nc*</span></p>

In [8]:
# Inspect the native chunk layout of the NetCDF files
import netCDF4
# Open the first listed file read-only; the context manager closes the file
# handle once every variable has been reported.
with netCDF4.Dataset(files[0], 'r') as nc:
    for var_name, var in nc.variables.items():
        print(f"Variable: {var_name}, Taille des chunks: {var.chunking()}")

Variable: lat, Taille des chunks: contiguous
Variable: lon, Taille des chunks: contiguous
Variable: time, Taille des chunks: contiguous
Variable: es_u10s, Taille des chunks: contiguous
Variable: es_v10s, Taille des chunks: contiguous
Variable: es_tauu, Taille des chunks: contiguous
Variable: es_tauv, Taille des chunks: contiguous
Variable: e5_u10s, Taille des chunks: contiguous
Variable: e5_v10s, Taille des chunks: contiguous
Variable: e5_tauu, Taille des chunks: contiguous
Variable: e5_tauv, Taille des chunks: contiguous
Variable: count, Taille des chunks: contiguous
Variable: quality_flag, Taille des chunks: contiguous


## Définition des fonctions

In [11]:
def _prepare_directory(path, reset=False):
    """Create ``path`` on its filesystem and return that filesystem.

    Args:
        path: Directory to create; fsspec resolves the filesystem from it.
        reset: When True, an already existing directory is removed
            recursively before being re-created.

    Returns:
        The fsspec filesystem object resolved for ``path``.
    """
    fs, _, _ = fsspec.get_fs_token_paths(path)
    if reset and fs.exists(path):
        fs.rm(path, recursive=True)
    fs.mkdirs(path, exist_ok=True)
    return fs


def create_recipe(name, dates, path_format, output):
    """Build an ``HDFReferenceRecipe`` over ``dates`` with storage under ``output``.

    Args:
        name: Sub-directory name used below ``references``, ``metadata`` and
            ``cache`` inside ``output``.
        dates: Keys of the ``"time"`` concatenation dimension; each one is
            passed as ``time`` to ``path_format``.
        path_format: ``str.format`` template (glob wildcards allowed) that is
            resolved to one input file per entry of ``dates``.
        output: ``pathlib.Path``-like root of the storage directories.

    Returns:
        The recipe, with its ``storage_config`` set.

    Raises:
        FileNotFoundError: Raised by the path function, whenever the pattern
            resolves a date for which no file matches the formatted template.
    """

    def make_path(time):
        file_glob = path_format.format(time=time)
        # glob() returns matches in arbitrary order, so sort them to make the
        # selection reproducible.
        # NOTE(review): when several files match the same time step, the
        # lexicographically first one is used - confirm this is the wanted one.
        matches = sorted(glob(file_glob))
        if not matches:
            raise FileNotFoundError(f"No input file matches {file_glob!r}")
        return matches[0]

    time_concat_dim = ConcatDim("time", dates)
    pattern = FilePattern(make_path, time_concat_dim)
    recipe = HDFReferenceRecipe(pattern)

    # Storage directories
    target_path = output / 'references' / name
    metadata_path = output / 'metadata' / name
    cache_path = output / 'cache' / name

    # An existing references directory is kept as is, whereas the metadata
    # and cache directories are emptied on every call.
    target = FSSpecTarget(fs=_prepare_directory(target_path), root_path=target_path)
    metadata = MetadataTarget(
        fs=_prepare_directory(metadata_path, reset=True), root_path=metadata_path
    )
    cache = CacheFSSpecTarget(
        fs=_prepare_directory(cache_path, reset=True), root_path=cache_path
    )

    recipe.storage_config = StorageConfig(target=target, cache=cache, metadata=metadata)

    return recipe

In [12]:
def execute_recipe(recipe):
    """Convert ``recipe`` to its Dask form and compute it.

    Args:
        recipe: Object exposing ``to_dask()``, whose result exposes
            ``compute()``.

    Returns:
        None; the result of ``compute()`` is discarded.
    """
    recipe.to_dask().compute()

## Création de la recipe

In [61]:
# Dataset identifier, also used as a directory name in the output tree
name = 'erastar'
# Glob template for one hourly input file: `time` fills in the year folder,
# the day-of-year folder and the YYYYMMDDHH prefix of the file name.
path_format = (
    "/home/datawork-cersat-public/provider/woc/products/theme1/ocean_winds/woc-l4-se-erastar-h/v2.0/"
    "{time.year:04d}/{time.dayofyear:03d}/{time.year:04d}{time.month:02d}{time.day:02d}{time.hour:02d}*.nc"
)

# Output root: <working directory>/<dataset name>/<datacube version>
wrk = Path('/home1/scratch/gcaer/data')
version = 'datacube-year'
output = wrk.joinpath(name, version)

In [62]:
# Short window straddling the 2017/2018 year boundary
start = "2017-12-29 00:00:00"
end = "2018-01-03 00:00:00"
# Alternative, much longer period kept for reference:
#start, end = "2009-12-31 09:00:00", "2020-12-31 23:00:00"
# One timestamp per hour, both bounds included
dates = pd.date_range(start=start, end=end, freq="h")
# Map each calendar year to the timestamps that fall in it
groups = dates.groupby(dates.year)

In [63]:
# One recipe per calendar year, each one stored below `output/<year>`
recipes = {
    year: create_recipe(name, year_dates, path_format, output / str(year))
    for year, year_dates in groups.items()
}

## Exécuter la recipe

In [53]:
# Create the Dask distributed client, called without arguments.
# NOTE(review): with no address given this presumably starts a local cluster
# (see the warning printed below the cell) - confirm against the distributed docs.
from distributed import Client
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 48364 instead


In [64]:
%%time
# Run the yearly recipes one after the other, printing the year as progress
for year, recipe in recipes.items():
    print(year)
    execute_recipe(recipe)

2017
2018
CPU times: user 580 ms, sys: 156 ms, total: 736 ms
Wall time: 4.48 s


## Création du catalogue

In [65]:
def set_catalog(groups, output, name):
    """Write an intake catalog with one ``reference://`` source per year.

    Args:
        groups: Mapping whose keys are the years to expose; only ``keys()``
            is used.
        output: ``pathlib.Path`` root of the yearly outputs; the catalog is
            written to ``output / 'reference.yaml'``.
        name: Directory name below ``<year>/references`` that holds
            ``reference.json``.

    Returns:
        None; the catalog file is the only result.
    """
    import yaml

    def _source_entry(year):
        # Every call builds brand-new dicts: objects shared between entries
        # would be emitted as YAML anchors/aliases by the dumper.
        storage_options = {
            'fo': f'{output.as_posix()}/{year}/references/{name}/reference.json',
            'remote_options': {},
            'remote_protocol': 'file',
            'skip_instance_cache': True,
            'target_options': {},
            'target_protocol': 'file'
        }
        arguments = {
            'chunks': {},
            'consolidated': False,
            'storage_options': storage_options,
            'urlpath': 'reference://'
        }
        return {
            'args': arguments,
            'description': '',
            'driver': 'intake_xarray.xzarr.ZarrSource'
        }

    # Catalog layout: a single 'sources' mapping keyed by the year as a string
    config = {
        'sources': {str(year): _source_entry(year) for year in groups.keys()}
    }

    # Serialise to YAML, then write the text to the catalog file
    yaml_data = yaml.dump(config)
    (output / 'reference.yaml').write_text(yaml_data)

In [66]:
# Write the intake catalog (`reference.yaml` under `output`) with one source
# per year found in `groups`.
set_catalog(groups, output, name)

## Ouvrir le datacube

In [67]:
# Open the intake catalog stored at `output / "reference.yaml"`.
catalog = output / "reference.yaml"
cat = intake.open_catalog(catalog)

In [68]:
%%time
# Variables removed from every yearly dataset right after it is opened
_drop = ["es_u10s", "es_v10s", "e5_u10s", "e5_v10s", "count", "quality_flag"]
first_year = pd.to_datetime(start).year
last_year = pd.to_datetime(end).year
datacubes = []
# range() excludes its stop value: use last_year + 1 so that the year of `end`
# is loaded too (a 2017-2018 window previously yielded the 2017 source only).
for year in range(first_year, last_year + 1):
    # NOTE(review): the NetCDF variables printed earlier are named `lat`/`lon`;
    # confirm that "latitude"/"longitude" are real dimension names, otherwise
    # these two chunk entries may simply be ignored.
    datacube = cat[f'{year}'](chunks={"time": 1, "latitude": -1, "longitude": -1}).to_dask().drop_vars(_drop)
    datacubes.append(datacube)

CPU times: user 40 ms, sys: 4 ms, total: 44 ms
Wall time: 178 ms




In [69]:
%%time
# Join the yearly datasets into a single one along the "time" dimension.
datacube = xr.concat(datacubes, dim="time")

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 26.4 ms


In [71]:
# Preview the first 20 time steps of the concatenated datacube.
datacube.isel(time=slice(0, 20))

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 316.41 MiB 15.82 MiB Shape (20, 1440, 2880) (1, 1440, 2880) Dask graph 20 chunks in 3 graph layers Data type float32 numpy.ndarray",2880  1440  20,

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 316.41 MiB 15.82 MiB Shape (20, 1440, 2880) (1, 1440, 2880) Dask graph 20 chunks in 3 graph layers Data type float32 numpy.ndarray",2880  1440  20,

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 316.41 MiB 15.82 MiB Shape (20, 1440, 2880) (1, 1440, 2880) Dask graph 20 chunks in 3 graph layers Data type float32 numpy.ndarray",2880  1440  20,

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 316.41 MiB 15.82 MiB Shape (20, 1440, 2880) (1, 1440, 2880) Dask graph 20 chunks in 3 graph layers Data type float32 numpy.ndarray",2880  1440  20,

Unnamed: 0,Array,Chunk
Bytes,316.41 MiB,15.82 MiB
Shape,"(20, 1440, 2880)","(1, 1440, 2880)"
Dask graph,20 chunks in 3 graph layers,20 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
