In [1]:
import cftime
import numpy as np
import pandas as pd
import xarray as xr
xr.set_options(keep_attrs=True)
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.feature import NaturalEarthFeature
from cartopy.util import add_cyclic_point
from cartopy.io.shapereader import Reader
from cartopy.feature import ShapelyFeature
import geopandas as gpd
import seaborn as sns
import shapefile as shp
from shapely.geometry import Polygon, MultiPolygon
import intake
import os.path
from tqdm import tqdm
import dask.array as da
import matplotlib.pyplot as plt
# import ultraplot as uplt
from matplotlib.ticker import FixedLocator
import matplotlib.path as mplPath
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

from dask.distributed import Client
import dask.config
dask.config.set({"array.slicing.split_large_chunks": False})

<dask.config.set at 0x15452f8a3d10>

In [2]:
def regrid_data(fromthis, tothis, method=1):
    """Regrid ``fromthis`` onto the grid of ``tothis``.

    Parameters
    ----------
    fromthis : xarray object
        Data to be regridded.
    tothis : xarray object
        Object supplying the target grid. Methods 2-4 read its ``lat`` and
        ``lon`` entries; method 1 hands the whole object to ``interp_like``.
    method : int, default 1
        1 -- ``interp_like``, applied one time step at a time when
             ``fromthis`` has a ``time`` coordinate.
        2 -- ``interp`` onto the target ``lat``/``lon``.
        3 -- xESMF ``Regridder`` with the ``'bilinear'`` algorithm.
        4 -- ``geocat.comp.linint2``.

    Returns
    -------
    The regridded data.

    Raises
    ------
    ValueError
        If ``method`` is not one of 1, 2, 3 or 4 (previously this silently
        returned ``None``).
    """
    # Validate first so a typo in `method` fails loudly and cheaply.
    if method not in (1, 2, 3, 4):
        raise ValueError(
            f"Unsupported regrid method {method!r}; expected 1, 2, 3 or 4"
        )

    # Import necessary modules:
    import xarray as xr

    if method == 1:
        # kludgy: spatial regridding only; interp_like did not seem to cope
        # with the time axis, so regrid each time step and stitch them back.
        if 'time' in fromthis.coords:
            n_times = len(fromthis['time'])
            slices = [fromthis.isel(time=t).interp_like(tothis) for t in range(n_times)]
            return xr.concat(slices, 'time')
        return fromthis.interp_like(tothis)

    newlat = tothis['lat']
    newlon = tothis['lon']

    if method == 2:
        # Keep every existing coordinate and swap in the target lat/lon.
        coords = dict(fromthis.coords)
        coords['lat'] = newlat
        coords['lon'] = newlon
        return fromthis.interp(coords)

    if method == 3:
        # Imported lazily: `xe` is not imported at notebook level and is only
        # needed for this branch.
        import xesmf as xe

        ds_out = xr.Dataset({'lat': newlat, 'lon': newlon})
        regridder = xe.Regridder(fromthis, ds_out, 'bilinear')
        return regridder(fromthis)

    # method == 4: geocat
    # Imported lazily for the same reason as xesmf above.
    import geocat.comp

    result = geocat.comp.linint2(fromthis, newlon, newlat, False)
    result.name = fromthis.name
    return result

def update_leads(x):
    """Swap the ``time`` axis of ``x`` for an integer ``lead`` axis.

    ``time`` is renamed to ``lead`` and its values are replaced by
    ``0, 1, ..., n-1`` (i.e. lead days), where ``n`` is the axis length.
    """
    renamed = x.rename({"time": "lead"})
    lead_days = np.arange(renamed.lead.size).astype(int)
    return renamed.assign_coords(lead=lead_days)

def drop_duplicates_along_all_dims(obj, keep=False):
    """Remove duplicated index labels along every dimension of ``obj``.

    ``keep`` is forwarded to the index's ``duplicated`` method; with the
    default ``False`` every occurrence of a repeated label is dropped.
    Dimensions are processed one after another, each on the result of the
    previous pass.
    """
    result = obj
    for dim_name in obj.dims:
        is_duplicate = result.get_index(dim_name).duplicated(keep=keep)
        result = result.isel({dim_name: ~is_duplicate})
    return result

In [4]:
# import dask
# dask.config.set({'logging.distributed': 'error'})

# from dask_jobqueue import PBSCluster

# # For Casper
# cluster = PBSCluster(
#     queue="casper",
#     walltime="02:00:00",
#     account="P93300042",
#     memory="8GB",
#     # resource_spec="select=1:ncpus=1:mem=4GB",
#     resource_spec="select=1:ncpus=1:mem=8GB",
#     cores=1,
#     processes=1,
# )

# from dask.distributed import Client

# # Connect client to the remote dask workers
# client = Client(cluster)
# print(client)

# cluster.scale(16)

# client.wait_for_workers(16)


<Client: 'tcp://128.117.208.175:40923' processes=0 threads=0, memory=0 B>


In [4]:
from dask_jobqueue import PBSCluster

# Dask cluster backed by PBS jobs on the 'casper' queue.
# NOTE: `account=` replaces the deprecated `project=` keyword of
# dask-jobqueue (same value, same meaning); the deprecation warning was being
# hidden by the notebook-wide warnings filter.
cluster = PBSCluster(
    account='P93300041',
    cores = 1,
    memory = '100GiB',
    processes = 1,
    local_directory = '/glade/derecho/scratch/mdfowler/dask/pbs.$PBS_JOBID/dask/spill',
    resource_spec = 'select=1:ncpus=1:mem=100GB',
    queue = 'casper',
    walltime = '01:00:00',
    interface = 'ext'
)
# Ask for 50 PBS jobs, then attach a client to the cluster.
cluster.scale(jobs=50)
client = Client(cluster)

## Get Data

In [5]:
%%time 
raw  = xr.open_zarr("/glade/derecho/scratch/mdfowler/temp/S2S_zarr/cesm2cam6v2.raw.daily.SOILWATER10CM.geospatial.zarr",consolidated=True)
clim = xr.open_zarr("/glade/derecho/scratch/mdfowler/temp/S2S_zarr/cesm2cam6v2.climatology.daily.SOILWATER10CM.geospatial.zarr",consolidated=True) 
anom = xr.open_zarr("/glade/derecho/scratch/mdfowler/temp/S2S_zarr/cesm2cam6v2.anom.daily.SOILWATER10CM.geospatial.zarr",consolidated=True)



CPU times: user 614 ms, sys: 142 ms, total: 756 ms
Wall time: 1.15 s


In [4]:
# Region names US_HUC01 ... US_HUC19, built from their two-digit numbering.
reg = [f"US_HUC{huc_number:02d}" for huc_number in range(1, 20)]

# Analysis settings.
season = "JJA" # JJA, DJF, ann
domain = "namerica" #can be global or namerica


In [5]:

def _to_proleptic_gregorian(stamps):
    """Convert datetime-like stamps to ``cftime.DatetimeProlepticGregorian``.

    Only the year, month and day of each stamp are carried over. The
    ``DatetimeIndex`` is built once for the whole array instead of three
    times per element.
    """
    index = pd.DatetimeIndex(stamps)
    return [
        cftime.DatetimeProlepticGregorian(year, month, day)
        for year, month, day in zip(index.year, index.month, index.day)
    ]


soilm = xr.open_dataset("catDS_SMterciles_basins_quant-aft-regavg.nc")

# Restrict to the regions of interest once, then split by tercile category.
soilm_reg = soilm.sel(region=reg)
tercile = soilm_reg["soilTercileCategory"]

mask_neu = tercile == "neutral"
mask_dry = tercile == "dry"
mask_wet = tercile == "wet"

neu = soilm_reg.where(mask_neu, drop=True)
dry = soilm_reg.where(mask_dry, drop=True)
wet = soilm_reg.where(mask_wet, drop=True)

# Replace the `init` values of each subset with cftime proleptic-Gregorian dates.
for subset in (neu, dry, wet):
    subset["init"] = _to_proleptic_gregorian(subset.init.values)


## Get obs

In [25]:
# ERA5 soil moisture (file name: 0-50 cm), reorganised file and its anomalies.
obs_SM = xr.open_dataset(
    '/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg.nc'
)
# obs_SM_clim = xr.open_dataset('/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg_CLIM.nc')
obs_SM_anom = xr.open_dataset(
    '/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg_ANOM.nc'
)

In [6]:
# path = '/glade/work/yanand/USDA_FACT/ERA5_land/ERA5_land_hourly_global_1999_2021/ERA5_SM_0_5m_0_45_leadday_1999_2021_global_organize_NEW.nc'
# obs_SM = xr.open_dataset(path)

# path2 = '/glade/work/yanand/USDA_FACT/ERA5_land/ERA5_land_hourly_global_1999_2021/ERA52CAM6_SM_daily_1999_2021_0_5m_global_NEW.nc'
# obs_SM2 = xr.open_dataset(path2)


In [7]:
# startDate = '1999-01-01'
# endDate   = '2021-12-31'
# dates_era5 = pd.date_range(start=startDate, end=endDate)

In [8]:
# obs_SM2['time'] = dates_era5

In [9]:
# ## Reorganize ERA5 SM to be same dimensions as CESM2 soil moisture 
# obs_SM_reorg = np.full([len(raw.init), len(raw.lead), len(raw.lat), len(raw.lon)], np.nan)

# for iInit in range(len(raw.init)): 
#     matchDate = obs_SM2.sel(time=slice(raw.init.values[iInit], raw.init.values[iInit]+np.timedelta64(45,'D'))).swvRZ0.values
#     obs_SM_reorg[iInit,0:np.shape(matchDate)[0],:,:] = matchDate


In [9]:
# cat_DS = xr.Dataset(
#     data_vars = dict( 
#         ERA5_SM_0to50cm  = (['init','lead','lat','lon'], obs_SM_reorg),
#     ), 
#     coords = dict(
#         init=raw.init.values,
#         lead=np.arange(46),
#         lat=raw.lat.values,
#         lon=raw.lon.values,
#     )
# )

In [10]:
#  cat_DS.to_netcdf('/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg.nc')



**How do I get anomaly??**

In [11]:
# ds = obs_SM.chunk({"init": -1, "lead": 1, "lat": "auto", "lon": "auto"}).persist()

In [12]:
# ds

Unnamed: 0,Array,Chunk
Bytes,22.14 GiB,126.19 MiB
Shape,"(1168, 46, 192, 288)","(1168, 1, 119, 119)"
Dask graph,276 chunks in 1 graph layer,276 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 22.14 GiB 126.19 MiB Shape (1168, 46, 192, 288) (1168, 1, 119, 119) Dask graph 276 chunks in 1 graph layer Data type float64 numpy.ndarray",1168  1  288  192  46,

Unnamed: 0,Array,Chunk
Bytes,22.14 GiB,126.19 MiB
Shape,"(1168, 46, 192, 288)","(1168, 1, 119, 119)"
Dask graph,276 chunks in 1 graph layer,276 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [13]:
# var = ['ERA5_SM_0to50cm']
# da_day_clim_smooth_vars = []

# for i in var:
#     print(i)
#     da_day_clim = ds[i].groupby("init.dayofyear").mean("init")
#     days = da_day_clim.shape[0]
#     # Rechunk to make dayofyear climatology one chunk.
#     da_day_clim = da_day_clim.chunk({"dayofyear": days}).persist()
#     # Just mimicking the chunk sizes from our climatology.
#     x = da.full((days, da_day_clim.lead.size, da_day_clim.lat.size, da_day_clim.lon.size),
#         np.nan,dtype="float32",chunks=(days, 1, 181, 360),)
#     # Wrap the all-NaN array in a DataArray on the climatology's coordinates
#     _da = xr.DataArray(x,dims=["dayofyear", "lead", "lat", "lon"],
#         coords=[da_day_clim.dayofyear, da_day_clim.lead, da_day_clim.lat, da_day_clim.lon],name=i,)
#     # Pad the daily climatology with NaNs
#     da_day_clim_wnan = da_day_clim.combine_first(_da)
#     # Apply the periodic rolling mean twice to get triangular smoothing
#     da_day_clim_smooth = da_day_clim_wnan.copy()
    
#     for j in range(2):
#         # Extend the DataArray so the rolling window wraps periodically
#         da_day_clim_smooth = xr.concat([da_day_clim_smooth[-15:], da_day_clim_smooth, da_day_clim_smooth[:15]],"dayofyear",)
#         # Rolling mean
#         da_day_clim_smooth = da_day_clim_smooth.rolling(dayofyear=31, center=True, min_periods=1).mean()
#         # Drop the periodic boundaries
#         da_day_clim_smooth = da_day_clim_smooth.isel(dayofyear=slice(15, -15))
        
#     # Extract the original days
#     da_day_clim_smooth = da_day_clim_smooth.sel(dayofyear=da_day_clim.dayofyear)
#     da_day_clim_smooth.name = i
    
#     da_day_clim_smooth_vars.append(da_day_clim_smooth)

ERA5_SM_0to50cm


In [16]:
# ds_day_clim_smooth_vars = xr.merge(da_day_clim_smooth_vars)

In [17]:
# ds_day_clim_smooth_vars = ds_day_clim_smooth_vars.chunk({"dayofyear": -1, "lead": 1, "lat": 192, "lon": 288}).persist()

In [18]:
# ds_day_clim_smooth_vars.to_netcdf('/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg_CLIM.nc')


In [22]:
# ## Actual anomaly! 

# raw = obs_SM.chunk({"init": 1, "lead": -1, "lat": 192, "lon": 288}).persist()
# ds_anom = raw.groupby("init.dayofyear") - clim


In [24]:
# One-off export of the ERA5 anomalies. `ds_anom` is only created by the
# anomaly cell that is currently commented out, so on a fresh kernel skip the
# write instead of failing with a NameError.
if "ds_anom" in globals():
    ds_anom.to_netcdf('/glade/derecho/scratch/mdfowler/temp/ERA5_SM_0to50cm_fromSanjiv_reorg_ANOM.nc')
else:
    print("ds_anom is not defined; skipping the ANOM netCDF export.")

## How do we compare?!