# Create JSONS with fsspec ReferenceMaker
## 24 hours of GOES data
Needed:
- fsspec-reference-maker
    - `pip install git+https://github.com/intake/fsspec-reference-maker`
- adlfs >= 0.7.7
    - `pip install --upgrade "adlfs>=0.7.7"`

In [1]:
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr

In [2]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import zipfile
import logging
import fsspec
import ujson
from tqdm import tqdm
from glob import glob
import os

from azure.storage.blob import ContainerClient
import tempfile

import dask

## Dask Setup

In [3]:
# Start a Dask distributed client with 8 workers; the delayed tasks built
# later in this notebook are executed through it.
from dask.distributed import Client
client = Client(n_workers=8)
# Bare expression so the notebook displays the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:38723  Dashboard: /proxy/8787/status,Cluster  Workers: 8  Cores: 8  Memory: 32.00 GiB


## Get urls

In [4]:
# Filesystem handle for the 'goeseuwest' Azure storage account.
fs = fsspec.filesystem('az', account_name='goeseuwest')

# Sorted list of every .nc file matching the 2020/002 pattern in the
# noaa-goes16 container; 'az://' is prepended to each glob result so the
# entries can be passed straight to fsspec.open.
urllist = [
    'az://' + blob_path
    for blob_path in sorted(
        fs.glob('az://noaa-goes16/ABI-L2-MCMIPF/2020/002/*/*.nc')
    )
]

## Generate json function

### Write to local folder

In [5]:
from pathlib import Path
# Make sure the local output folder used by gen_json exists; exist_ok=True
# keeps re-running this cell from failing when the folder is already there.
Path("./jsons/").mkdir(exist_ok=True)

@dask.delayed
def gen_json(u):
    """Write a reference JSON for one remote NetCDF/HDF5 file.

    The file at ``u`` is opened through fsspec, scanned with
    ``SingleHdf5ToZarr``, and the translated references are serialised
    with ``ujson`` into ``./jsons/<stem>.json``, where ``<stem>`` is the
    last path component of ``u`` with everything from the first ``.nc``
    onwards removed.

    Because of the ``dask.delayed`` decorator, calling this function only
    builds a task; the work runs when the task is computed.

    Parameters
    ----------
    u : str
        URL of the source file (the notebook passes ``az://...`` URLs).

    Returns
    -------
    None
        The result is the JSON file written to disk.
    """
    file_name = u.split('/')[-1]
    stem = file_name.split('.nc')[0]
    outpath = './jsons/' + stem + '.json'

    # Options forwarded to fsspec.open: binary read mode, anonymous access
    # to the 'goeseuwest' account, and the cache-related settings.
    open_kwargs = dict(
        mode="rb",
        anon=True,
        default_fill_cache=False,
        default_cache_type="none",
        account_name='goeseuwest',
    )

    with fsspec.open(u, **open_kwargs) as inf:
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)

        # The output file is opened before translate() runs, and translate()
        # is called while the input file is still open.
        with open(outpath, 'wb') as outf:
            references = h5chunks.translate()
            outf.write(ujson.dumps(references).encode())

### Write to Azure (requires active subscription)

In [6]:

# @dask.delayed
# def gen_json(u):

#     # Get Azure Connection String
#     connection_string = os.getenv('azure_connection_string')

#     so = dict(
#         mode="rb", anon=True, default_fill_cache=False, default_cache_type="none"
#     )
    
#     outpath = 'az://goes-netcdf/' + u.split('az://noaa-goes16/')[-1].split('.nc')[0] + '.json'
    
#     with fsspec.open(u, **so, account_name='goeseuwest') as inf:
#         h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        
        
#         with fsspec.open(outpath, 'wb', connection_string=connection_string) as outf:
#             outf.write(ujson.dumps(h5chunks.translate()).encode())


## Use dask to make jsons

In [7]:
# gen_json is wrapped in dask.delayed, so this only builds one task per URL;
# nothing is read or written until the tasks are computed.
out = [gen_json(u) for u in urllist]

In [8]:
%%time
dask.compute(*out);

CPU times: user 2min 22s, sys: 30 s, total: 2min 52s
Wall time: 41min 54s


(None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

# MultiZarr

### JSON files were written locally

In [9]:
# Per-file reference JSONs written to the local ./jsons/ folder, in sorted
# (filename) order.
flist = sorted(glob("./jsons/*.json"))

mzz = MultiZarrToZarr(
    flist,
    # Where the referenced data lives: the 'goeseuwest' account over 'az'.
    remote_protocol='az',
    remote_options=dict(account_name='goeseuwest'),
    # Every xarray decoding step is switched off when opening the inputs.
    xarray_open_kwargs=dict(
        decode_cf=False,
        mask_and_scale=False,
        decode_times=False,
        use_cftime=False,
        decode_coords=False,
    ),
    # Concatenate along the "t" dimension with minimal/override settings.
    xarray_concat_args=dict(
        data_vars="minimal",
        coords="minimal",
        compat="override",
        join="override",
        combine_attrs="override",
        dim="t",
    ),
)

### JSON files were written to Azure above

In [10]:
# connection_string = os.getenv('azure_connection_string')
# fs = fsspec.filesystem('az', account_name='test4arco', connection_string=connection_string)
# flist = ['az://' + u for u in sorted(fs.glob('az://goes-netcdf/**/*.json'))]

# mzz = MultiZarrToZarr(
#     flist,
#     storage_options = { # This gives access to the JSON files on Azure
#         'connection_string' : connection_string
#     },
#     remote_protocol='az',
#     remote_options={
#        'account_name' : 'goeseuwest'
#     },    
#     xarray_open_kwargs={
#         'decode_cf' : False,
#         'mask_and_scale' : False,
#         'decode_times' : False,
#         'use_cftime' : False,
#         'decode_coords' : False,
#     },
#     xarray_concat_args={
#         "data_vars": "minimal",
#         "coords": "minimal",
#         "compat": "override",
#         "join": "override",
#         "combine_attrs": "override",
#         "dim": "t"
#     }
# )

In [11]:
%%time
# Build the combined reference set from the individual JSONs in flist and
# write it to ./combined.json.
mzz.translate('./combined.json')

CPU times: user 16min 16s, sys: 20.2 s, total: 16min 36s
Wall time: 16min 23s


In [12]:
# Done with the Dask client created in the "Dask Setup" cell; shut it down.
client.shutdown()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


***
## Processing times:
|Action | Time | Note |
|-------:|:------| :---|
|Make individual jsons | 42 minutes | 8 workers |
|Make combined json | 16 minutes | |
| __Total__ | __58 minutes__ | |