# Create JSONs with fsspec ReferenceMaker
## 24 hours of GOES data
Needed:
- fsspec-reference-maker
    - `pip install git+https://github.com/intake/fsspec-reference-maker`
- adlfs >= 0.7.7
    - `pip install --upgrade "adlfs>=0.7.7"` (quote the specifier so the shell does not treat `>` as a redirect)

In [1]:
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr

In [2]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import zipfile
import logging
import fsspec
import ujson
from tqdm import tqdm
from glob import glob
import os

from azure.storage.blob import ContainerClient
import tempfile

import dask

In [3]:
from dask.distributed import Client
# Create a dask.distributed client with default arguments; the recorded cell
# output shows it connected to a scheduler on 127.0.0.1 with 4 workers.
client = Client()
# Bare expression so the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:33719  Dashboard: /proxy/8787/status,Cluster  Workers: 4  Cores: 4  Memory: 32.00 GiB


## Get urls

In [4]:
# Scratch directory under the system temp dir; created if it is missing.
tempdir = os.path.join(tempfile.gettempdir(), 'goes')
os.makedirs(tempdir, exist_ok=True)

# Product name and start time used to build the blob prefix below.
# NOTE(review): sday looks like a zero-padded day-of-year -- confirm.
product = 'ABI-L2-MCMIPF'
syear, sday, shour = '2020', '002', '14'

# Azure Blob Storage account and container that hold the GOES-16 files.
storage_account_url = 'https://goes.blob.core.windows.net'
container_name = 'noaa-goes16'
goes_blob_root = f'{storage_account_url}/{container_name}/'

# No credential is supplied to the container client.
goes_container_client = ContainerClient(
    account_url=storage_account_url,
    container_name=container_name,
    credential=None,
)

def download_url(url):
    """Download ``url`` into ``tempdir`` and return the local file path.

    The local filename is the URL itself with ``://`` and every ``/``
    replaced by ``_``, so distinct URLs map to distinct flat filenames.

    Args:
        url: URL of the file to fetch.

    Returns:
        Path of the downloaded file inside ``tempdir``.
    """
    # The notebook's import cells never import urllib, so the original call
    # raised NameError; import it locally to keep this function self-contained.
    import urllib.request

    url_as_filename = url.replace('://', '_').replace('/', '_')
    destination_filename = os.path.join(tempdir, url_as_filename)
    urllib.request.urlretrieve(url, destination_filename)
    return destination_filename

# Blob names are filtered by this product/year/day path prefix.
prefix = f'{product}/{syear}/{sday}/'
print('Finding blobs matching prefex: {}'.format(prefix))

# List every blob under the prefix and keep only its name.
generator = goes_container_client.list_blobs(name_starts_with=prefix)
blobs = [entry.name for entry in generator]

Finding blobs matching prefex: ABI-L2-MCMIPF/2020/002/


In [5]:
# Turn each blob name into an az:// URL rooted at the container.
urllist = [f'az://{container_name}/{blob_name}' for blob_name in blobs]

In [6]:
# Display the first URL as the cell output to check the generated format.
urllist[0]

'az://noaa-goes16/ABI-L2-MCMIPF/2020/002/00/OR_ABI-L2-MCMIPF-M6_G16_s20200020000216_e20200020009524_c20200020010031.nc'

### Generate json function

In [8]:
def gen_json(u):
    """Write the reference JSON for one remote file to ``jsons/``.

    Opens ``u`` through fsspec, runs ``SingleHdf5ToZarr.translate()`` on it
    and stores the result as ``jsons/<last path component of u>.json``.

    Args:
        u: URL of the source file; the last ``/``-separated component is
            used as the output file's base name.
    """
    so = dict(
        mode="rb", anon=True, default_fill_cache=False, default_cache_type="none"
    )
    # Create the output directory on demand; without it the first write
    # fails with FileNotFoundError on a fresh working directory.
    os.makedirs("jsons", exist_ok=True)
    with fsspec.open(u, **so, account_name='goeseuwest') as inf:
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        # Translate before opening the output file so a failed translation
        # does not leave an empty .json behind for later glob() calls.
        refs = h5chunks.translate()
    with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outf:
        outf.write(ujson.dumps(refs).encode())


### Use dask to make jsons

In [None]:
%%time
dask.compute(*[dask.delayed(gen_json)(u) for u in urllist]);

## MultiZarr

In [4]:
# Gather the per-file reference JSONs and order them by path.
json_list = glob("jsons/*.json")
json_list.sort()

In [5]:
# Keyword arguments forwarded as xarray_open_kwargs: every decoding step
# is switched off.
_xr_open_kwargs = {
    'decode_cf' : False,
    'mask_and_scale' : False,
    'decode_times' : False,
    'use_cftime' : False,
    'decode_coords' : False,
}

# Keyword arguments forwarded as xarray_concat_args: concatenate along "t"
# with the "minimal"/"override" settings.
_xr_concat_args = {
    "data_vars": "minimal",
    "coords": "minimal",
    "compat": "override",
    "join": "override",
    "combine_attrs": "override",
    "dim": "t",
}

# Combiner over all per-file reference JSONs; the referenced data is
# addressed with the 'az' protocol on the 'goeseuwest' account.
mzz = MultiZarrToZarr(
    json_list,
    remote_protocol='az',
    remote_options={'account_name' : 'goeseuwest'},
    xarray_open_kwargs=_xr_open_kwargs,
    xarray_concat_args=_xr_concat_args,
)

In [6]:
%%time
%%prun -D multizarr_profile 
# Write the combined references to 'combined.json'; the cell is timed and
# profiled, with the profile stats dumped to the file 'multizarr_profile'.
mzz.translate('combined.json')

 
*** Profile stats marshalled to file 'multizarr_profile'. 
CPU times: user 38min 51s, sys: 47.8 s, total: 39min 39s
Wall time: 39min 21s


         4340663486 function calls (4303816230 primitive calls) in 2360.914 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
391721185  600.247    0.000  967.300    0.000 posixpath.py:100(split)
 10646640  261.028    0.000  486.887    0.000 highlevelgraph.py:665(__getitem__)
685538640  238.919    0.000  238.919    0.000 highlevelgraph.py:489(__getitem__)
        1  186.170  186.170 2133.609 2133.609 combine.py:123(_build_output)
391721274  106.738    0.000  153.099    0.000 posixpath.py:41(_get_sep)
391721743   76.159    0.000   76.159    0.000 {method 'rfind' of 'str' objects}
        1   69.013   69.013  125.982  125.982 optimization.py:428(fuse)
553235032/553194888   67.871    0.000   77.799    0.000 {built-in method builtins.isinstance}
10631952/2659248   61.337    0.000  106.789    0.000 optimization.py:237(fuse_slice)
420048572   58.875    0.000   58.875    0.000 {method 'rstrip' of 'str' objects}
453586295/453557863  

In [17]:
# Shut down the dask client created at the top of the notebook.
client.shutdown()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


***
## Processing times:
|Action | Time | Note |
|-------:|:------| :---|
|Make individual jsons | 26min 39s | 4 workers, faster times can be achieved with more dask workers |
|Make combined json | 55min 6s | don't think this can be sped up w/ dask |
|Make combined v2 | 45min 52s | Actual CPU time is 25min, might try `az://` instead of `abfs`|
| __Total__ | __1h 21min__ | 