# World_Pop Population 100m

Notebook to convert the WorldPop 100 m population CSV file into CF-compliant NetCDF and Zarr.

In [1]:
# Optional: code formatter (lab_black), installed as a JupyterLab extension
#%load_ext lab_black
# Optional: code formatter (nb_black), installed as a Jupyter Notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Import standard packages
import os
import pathlib
import sys

import numpy as np
#import geopandas as gpd
 
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import itertools
import json
import copy
from itertools import chain
from shapely import wkb

import dask.dataframe as dd
import tqdm
from tqdm import tqdm


# Import custom functionality
# sys.path.append('c:/Windows/System32/coclicodata/src')#CLENMAR ADDED

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Root folder of the exposure datasets on the project (P:) drive.
# Alternative, built from the shared drive config:
# gca_data_dir = p_drive.joinpath("11209197-018-global-coastal-atlas","MSc_students","ClenmarRowe")
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets\02_Exposure"

# Workaround to the Windows OS (10) udunits error after installation of cfchecker:
# https://github.com/SciTools/iris/issues/404
# Point UDUNITS2_XML_PATH at the udunits2.xml file of your own Python installation
# (here: a miniconda package directory below the user's home folder).
udunits2_xml = (
    pathlib.Path.home()
    / r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
os.environ["UDUNITS2_XML_PATH"] = str(udunits2_xml)


<IPython.core.display.Javascript object>

In [3]:
# Project paths & files (manual input)
# Everything for this dataset lives below <gca_data_dir>/Population/worldpop.
dataset_dir = pathlib.Path(gca_data_dir, "Population", "worldpop")
dataset_dir_path = dataset_dir / "World_Pop_100m_original.nc"
CF_dir = dataset_dir / "CF"  # directory to save output CF check files
CF_dir

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/All_Datasets/Orig_Datasets/02_Exposure/Population/worldpop/CF')

<IPython.core.display.Javascript object>

In [15]:

# Read the WorldPop centroid CSV with dask; row 0 holds the column names.
filename = dataset_dir / "World_Pop_2020_100m_UNadj_centroid_final.csv"
df_orig = dd.read_csv(filename, header=0)
df_orig.head()


Unnamed: 0,fid,pop_count,layer,area,perimeter,Tot_pop_count,xcoord,ycoord
0,1,0,_abw_ppp_2020_UNadj_constrained,1.388889e-06,0.005,0,-70.05375,12.615833
1,2,0,_abw_ppp_2020_UNadj_constrained,1.388889e-06,0.005,0,-70.05375,12.614167
2,3,2,_abw_ppp_2020_UNadj_constrained,2.083333e-06,0.006667,6,-70.053055,12.607222
3,4,4,_abw_ppp_2020_UNadj_constrained,6.944444e-07,0.003333,4,-70.0475,12.606667
4,5,0,_abw_ppp_2020_UNadj_constrained,1.388889e-06,0.005,0,-70.04625,12.606667


<IPython.core.display.Javascript object>

In [10]:

# Keep only the point coordinates and the total population count.
#
# The previous version dropped columns, re-added Tot_pop_count and then called
# df.rename({"xcoord": "lon", "ycoord": "lat"}). A dict passed positionally to
# rename() targets the index, not the columns, so the columns were never
# renamed. The call is removed on purpose instead of being "fixed": the netCDF
# part files and the later xarray rename step expect the names xcoord / ycoord.
df = df_orig[["xcoord", "ycoord", "Tot_pop_count"]]
df


Unnamed: 0_level_0,xcoord,ycoord,Tot_pop_count
npartitions=339,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


<IPython.core.display.Javascript object>

In [14]:
# Number of partitions of the dask dataframe
df.npartitions

339

<IPython.core.display.Javascript object>

In [17]:
# # One-off export (kept commented out; the recorded run took ~38 min): save each dask dataframe partition as a separate netcdf file

# for part in tqdm(range(df.npartitions),desc="parts completed"):
    
#     df_part=df.partitions[part]
    
#     if part>0:
#         df_part_prev= df.partitions[part-1]
#         prev_max= df_part_prev["index"].values.max
#         df_part["index"]= df_part["index"].values + prev_max


#     ds = xr.Dataset.from_dataframe( df_part)
#     ds.to_netcdf(str(dataset_dir.joinpath("netcdf_parts","World_Pop_100m"+str(part)+".nc")))
# ds


parts completed: 100%|██████████| 339/339 [37:38<00:00,  6.66s/it]


<IPython.core.display.Javascript object>

### Check CF compliancy of the original NetCDF files

In [20]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir.joinpath("netcdf_parts","World_Pop_100m"+str(part)+".nc"), 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [21]:
# Save the captured CF compliancy output for the original data.
# NOTE(review): the check above ran on a netcdf_parts file, while testfile here is
# dataset_dir_path (World_Pop_100m_original.nc) - presumably only used to name the
# saved report; confirm against save_compliancy.
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [40]:
# Collect every netCDF part file in the netcdf_parts folder.
# NOTE: sorted() orders the path strings lexicographically, so a file ending in
# "m10.nc" comes before one ending in "m2.nc".
parts_pattern = os.path.join(dataset_dir, "netcdf_parts", "*.nc")
files = sorted(glob.glob(parts_pattern))

# Open each part as a dask-backed dataset, chunked along "index"
datasets = []
for f in files:
    datasets.append(xr.open_dataset(f, chunks={"index": 10000}))
datasets

[<xarray.Dataset>
 Dimensions:        (index: 528878)
 Coordinates:
   * index          (index) int64 0 1 2 3 4 ... 528874 528875 528876 528877
 Data variables:
     xcoord         (index) float64 dask.array<chunksize=(10000,), meta=np.ndarray>
     ycoord         (index) float64 dask.array<chunksize=(10000,), meta=np.ndarray>
     Tot_pop_count  (index) int64 dask.array<chunksize=(10000,), meta=np.ndarray>,
 <xarray.Dataset>
 Dimensions:        (index: 531421)
 Coordinates:
   * index          (index) int64 0 1 2 3 4 ... 531417 531418 531419 531420
 Data variables:
     xcoord         (index) float64 dask.array<chunksize=(10000,), meta=np.ndarray>
     ycoord         (index) float64 dask.array<chunksize=(10000,), meta=np.ndarray>
     Tot_pop_count  (index) int64 dask.array<chunksize=(10000,), meta=np.ndarray>,
 <xarray.Dataset>
 Dimensions:        (index: 531835)
 Coordinates:
   * index          (index) int64 0 1 2 3 4 ... 531831 531832 531833 531834
 Data variables:
     xcoord    

<IPython.core.display.Javascript object>

In [42]:
# Inspect the first entry of `datasets`
datasets[0]


Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.04 MiB 78.12 kiB Shape (528878,) (10000,) Dask graph 53 chunks in 2 graph layers Data type float64 numpy.ndarray",528878  1,

Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.04 MiB 78.12 kiB Shape (528878,) (10000,) Dask graph 53 chunks in 2 graph layers Data type float64 numpy.ndarray",528878  1,

Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.04 MiB 78.12 kiB Shape (528878,) (10000,) Dask graph 53 chunks in 2 graph layers Data type int64 numpy.ndarray",528878  1,

Unnamed: 0,Array,Chunk
Bytes,4.04 MiB,78.12 kiB
Shape,"(528878,)","(10000,)"
Dask graph,53 chunks in 2 graph layers,53 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [43]:
# Give every part a globally unique, consecutive "index" coordinate.
#
# Each part's own index values are shifted by a running offset, so the parts no
# longer overlap once they are concatenated. Compared to the previous version:
#  * assign_coords returns a new dataset, so `datasets` is not modified in place
#    and re-running this cell gives the same result instead of shifting again;
#  * the loop runs over the datasets that were actually opened, not over
#    df.npartitions, so it does not depend on the dask dataframe cell.
datasets2 = []
next_index = 0  # first index value to hand out to the next part
for ds_part in tqdm(datasets, desc="parts completed"):
    shifted_index = ds_part["index"].values + next_index
    datasets2.append(ds_part.assign_coords(index=shifted_index))

    # An empty part contributes no index values and leaves the offset as is.
    if shifted_index.size:
        next_index = int(shifted_index.max()) + 1




parts completed: 100%|██████████| 339/339 [00:02<00:00, 129.24it/s]


<IPython.core.display.Javascript object>

In [44]:
# Inspect the third entry of `datasets`
datasets[2]

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type float64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type float64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type int64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [45]:
# Inspect the third entry of `datasets2`
datasets2[2]

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type float64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type float64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.06 MiB 78.12 kiB Shape (531835,) (10000,) Dask graph 54 chunks in 2 graph layers Data type int64 numpy.ndarray",531835  1,

Unnamed: 0,Array,Chunk
Bytes,4.06 MiB,78.12 kiB
Shape,"(531835,)","(10000,)"
Dask graph,54 chunks in 2 graph layers,54 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [46]:
# Concatenate the datasets along the index dimension (.compute() is left commented out)
ds = xr.concat(datasets2, dim='index')#.compute()

# Display the merged dataset (nothing is written to disk in this cell)

ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [47]:
# NetCDF attribute alterations by means of metadata template
metadata_json = dataset_dir.joinpath("World_Pop_100m_metadata.json")

# Read the template inside a context manager so the file handle is closed again
# (it used to be opened and never closed). json is already imported at the top
# of the notebook.
with open(metadata_json) as f_global:
    meta_global = json.load(f_global)

# Copy every template entry into the global attributes of the dataset.
for attr_name, attr_val in meta_global.items():
    if attr_name == "PROVIDERS":
        # Stored as a JSON string rather than as the parsed object - presumably
        # because it is a nested structure; confirm against the metadata template.
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs["Conventions"] = "CF-1.8"
ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [48]:
# Rename the "index" dimension to "nstations" and the variables to their final
# names, then mark lon / lat as coordinates (to avoid duplication of dimensions
# in a later stage).
ds = (
    ds.rename_dims({"index": "nstations"})
    .rename_vars({"xcoord": "lon", "ycoord": "lat", "index": "stations"})
    .set_coords(["lon", "lat"])
)

ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [49]:
# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries: {variable name: {attribute: value}}
dataset_attributes = {
#    "Scenario": {"long_name": "RCP Scenario", "units": "1"}, # set to 1 if no unit
#    "Time_Horizon": {"long_name": "time", "units": "yr"}, # set to 1 if no unit
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "stations": {"long_name": "stations", "units": "1"}, # set to 1 if no unit
    "Tot_pop_count": { "long_name": "Population count UN Adjusted Constrained", "units": "1"},
}  # specify custom (CF convention) attributes

# add / overwrite attributes
# Variables that are missing from the dataset are still skipped (best effort),
# but the skip is now reported. The previous bare `except: continue` also hid
# every other error, e.g. a typo in a variable name went unnoticed.
for var_name, var_attrs in dataset_attributes.items():
    if var_name not in ds.variables:
        print(f"Skipping attributes for '{var_name}': variable not found in dataset")
        continue
    ds[var_name].attrs = var_attrs
ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 18106 chunks in 679 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,18106 chunks in 679 graph layers,18106 chunks in 679 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [5]:
# Path of the CF-compliant NetCDF file: same folder as the original file, with
# "original" replaced by "final" in the file name.
# Only the file name is rewritten. Replacing on the full path string would also
# change any parent directory that happens to contain "original".
# Kept as a str, as before.
dataset_dir_path_CF = str(
    dataset_dir_path.with_name(dataset_dir_path.name.replace("original", "final"))
)


<IPython.core.display.Javascript object>

In [None]:


# Write the dataset to the NetCDF file at dataset_dir_path_CF
ds.to_netcdf(path=dataset_dir_path_CF)

### Check CF compliancy of the modified NetCDF file

In [6]:
# Re-open the NetCDF file at dataset_dir_path_CF, chunked along "nstations"
ds = xr.open_dataset(dataset_dir_path_CF,chunks={'nstations':  10000})

# Inspect the re-opened dataset
ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 17925 chunks in 2 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 17925 chunks in 2 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 17925 chunks in 2 graph layers Data type float64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 78.12 kiB Shape (179243956,) (10000,) Dask graph 17925 chunks in 2 graph layers Data type int64 numpy.ndarray",179243956  1,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,78.12 kiB
Shape,"(179243956,)","(10000,)"
Dask graph,17925 chunks in 2 graph layers,17925 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


<IPython.core.display.Javascript object>

In [7]:
%%capture cap --no-stderr
# Check CF compliancy of the modified NetCDF file (dataset_dir_path_CF); the output is captured in `cap`

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [8]:
# Save the captured CF compliancy output for the modified NetCDF file
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

In [9]:

from dask.diagnostics import ProgressBar

# Export to zarr in write mode (mode="w" overwrites an existing store).
# The store path is derived from the file name only: "original" -> "final" and
# the ".nc" suffix -> ".zarr". Chained str.replace calls on the full path would
# also rewrite any parent directory containing "original" or ".nc".
zarr_path = dataset_dir_path.with_name(
    dataset_dir_path.name.replace("original", "final")
).with_suffix(".zarr")

with ProgressBar():
    ds.to_zarr(str(zarr_path), mode="w")

[########################################] | 100% Completed | 23m 15s


<IPython.core.display.Javascript object>