# NB7 naturalized flow analysis over PNW

- check data availability
- basic flow time series check (long term seasonality and daily series)

In [None]:
%matplotlib inline  
import os, sys
import numpy as np
import xarray as xr
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from timeit import default_timer as timer
import cartopy.crs as ccrs

import scripts.colors as ccmap
from scripts.utility import AutoVivification
import scripts.metrics as metrics
from scripts.utility import base_map

# Report the interpreter and xarray versions this notebook ran with.
py_major, py_minor, py_micro = sys.version_info[:3]
print("\nThe Python version: %s.%s.%s" % (py_major, py_minor, py_micro))
print(xr.__name__, xr.__version__)

In [None]:
def consecutive(data, stepsize=1):
    """
    Split ``data`` into runs of consecutive values.

    A new run starts wherever the difference between neighbouring elements
    is not equal to ``stepsize``. Returns the list of sub-arrays produced by
    ``np.split`` (a single empty array when ``data`` is empty).
    """
    step_breaks = np.diff(data) != stepsize
    # +1 because diff[i] compares element i with i+1; the new run starts at i+1
    split_points = np.nonzero(step_breaks)[0] + 1
    return np.split(data, split_points)

In [None]:
def flow_start_length(dr: xr.DataArray,  dayofyear='wateryear'):
    """
    Calculates, for every year and site, where valid data start and how much valid data exist.

    Arguments
    ---------
    dr: xr.DataArray
        2D DataArray containing daily time series with dimensions 'site' and 'time' (in either order)
    dayofyear: str
        'wateryear' (default): each yearly window runs from October 1st of ``year``
        to September 30th of ``year + 1``.
        'calendar': each yearly window runs from January 1st to December 31st of ``year``.

    Returns
    -------
    ds_flow_data: xr.Dataset
        Dataset containing two 2D float32 DataArrays with coordinates 'year' and 'site':
        'flow_len'   - ``np.sum`` of what ``metrics.myCount`` returns for the 0/1 validity mask
                       of the window (0 when it returns nothing).
        'flow_start' - 1-based position, within the window, of the first non-NaN value
                       (0 when ``metrics.myCount`` returns nothing).

    Raises
    ------
    ValueError
        If ``dayofyear`` is neither 'wateryear' nor 'calendar'.

    Notes
    -------
    dayofyear start with October 1st with dayofyear="wateryear" or January 1st with dayofyear="calendar".
    With 'wateryear' the last calendar year found in ``dr`` is not used as a 'year' label,
    because its water year would extend past the end of the record.
    """

    # NOTE: the argument is honoured here; it must not be overwritten with a hard-coded value.
    if dayofyear == 'wateryear':
        smon, sday, emon, eday, yr_adj = 10, 1, 9, 30, 1
    elif dayofyear == 'calendar':
        smon, sday, emon, eday, yr_adj = 1, 1, 12, 31, 0
    else:
        raise ValueError('Invalid argument for "dayofyear"')

    years = np.unique(dr.time.dt.year.values)
    if yr_adj:
        # a water year starting in the last calendar year spills beyond the record
        years = years[:-1]

    # The per-site indexing below needs time on axis 0 and site on axis 1.
    dr = dr.transpose('time', 'site')
    n_site = dr.sizes['site']

    flow_len = np.full((len(years), n_site), np.nan, dtype='float32')
    flow_start = np.full((len(years), n_site), np.nan, dtype='float32')

    for yidx, yr in enumerate(years):
        time_slice = slice(f'{yr}-{smon}-{sday}', f'{yr+yr_adj}-{emon}-{eday}')
        data_array = dr.sel(time=time_slice).values
        for sidx in range(n_site):
            # 1 where a value is present, 0 where it is NaN
            binary_array = np.where(~np.isnan(data_array[:, sidx]), 1, 0)
            # NOTE(review): metrics.myCount is a project helper not visible here;
            # presumably it returns the lengths of runs of valid days - confirm.
            count_dups = metrics.myCount(binary_array)
            if not count_dups:
                flow_len[yidx, sidx] = 0
                flow_start[yidx, sidx] = 0
            else:
                flow_len[yidx, sidx] = np.sum(count_dups)
                # +1 turns the 0-based array position into a 1-based day count
                flow_start[yidx, sidx] = np.where(binary_array == 1)[0][0] + 1

    ds_flow_data = xr.Dataset(
        data_vars=dict(
            flow_len=(["year", "site"], flow_len),
            flow_start=(["year", "site"], flow_start),
        ),
        coords=dict(year=years, site=dr['site']),
    )
    return ds_flow_data

## 1. Setup

In [None]:
# directories
# !!! main_path is the top directory of the dataset.
main_path = '/glade/campaign/ral/hap/mizukami/archive/pnw_hydrology/final_archive_v1'
nrni_path = os.path.join(main_path, 'ancillary_data')
geo_path = os.path.join(nrni_path, 'geospatial_data')

# NOTE(review): 'MB7' may be a typo for 'NB7' (the notebook title) - confirm before renaming.
figure_path = 'MB7_figures'
for out_dir in (figure_path, os.path.join(figure_path, 'per_site')):
    os.makedirs(out_dir, exist_ok=True)

## 2. Load data

### 2.1 geopackage data

In [None]:
# HUC12 polygons, with simplified geometries
huc12_file = os.path.join(geo_path, 'HUC12_MERIT_PNW.gpkg')
df_huc12 = gpd.read_file(huc12_file)
df_huc12['geometry'] = df_huc12.geometry.simplify(0.05)

# Flow sites: keep rows whose 'removed' value is 0, indexed by location name
site_file = os.path.join(geo_path, 'PNW_flow_site.gpkg')
df_site = gpd.read_file(site_file)
df_site = df_site.loc[df_site['removed'] == 0].set_index('location_name')

### 2.2 Read naturalized flow data

In [None]:
# Open the flow dataset and report how many sites it holds.
nrni_file = os.path.join(nrni_path, 'PNW_unimpaired_flow_1951-2018_latlon.nc')
ds_nrni = xr.open_dataset(nrni_file)
nrni_site = ds_nrni.site.values
n_nrni_site = len(nrni_site)
print('Number of flow sites: %d' % n_nrni_site)

## 3. availability of data record

In [None]:
# Print every site of the flow dataset that has no entry in the site geopackage.
sites = df_site.index.values
for loc_name in ds_nrni['site'].values:  # there are 331 sites
    if loc_name in sites:
        continue
    print(loc_name)

In [None]:
# Colour scale for the record-length map: 5-wide bins from 0 to 70.
vals4 = list(range(0, 71, 5))
cmap4 = plt.get_cmap('plasma_r', 16)
cmap4.set_under('xkcd:light yellow')
cmap4.set_over('xkcd:dark blue')
norm4 = mpl.colors.BoundaryNorm(vals4, cmap4.N)

# Colour scale for the start-year map: 5-year bins from 1950 to 2010.
vals6 = list(range(1950, 2011, 5))
cmap6 = plt.get_cmap('plasma', 14)
cmap6.set_under('xkcd:light yellow')
cmap6.set_over('xkcd:dark blue')
norm6 = mpl.colors.BoundaryNorm(vals6, cmap6.N)

In [None]:
# NOTE: despite its name, impaired_flow holds the 'streamflow' variable of ds_nrni.
impaired_flow = ds_nrni['streamflow']

# Per year and site: length of valid data and where they start.
ds_flow_len = flow_start_length(impaired_flow)

years = ds_flow_len.year.values
n_site = ds_flow_len.sizes['site']

# Integer arrays cannot hold NaN, so initialise with 0, which is also the value
# used below for sites without any complete year.
ds1 = xr.Dataset(
    data_vars=dict(
        flow_len_yr=(["site"], np.zeros(n_site, dtype='int')),
        flow_start_yr=(["site"], np.zeros(n_site, dtype='int')),
    ),
    coords=dict(site=ds_flow_len['site']),
)

for site in ds_flow_len['site'].values:
    # a year counts as complete when its flow_len exceeds 364
    complete = ds_flow_len['flow_len'].sel(site=site).values > 364

    # group indices of complete years into runs of consecutive years and keep
    # the longest one (the earliest run wins a tie)
    runs = consecutive(np.flatnonzero(complete))
    longest = max(runs, key=len)

    if len(longest) > 0:
        ds1['flow_len_yr'].loc[site] = len(longest)
        ds1['flow_start_yr'].loc[site] = years[longest[0]]

### 3.2. Map of number of years with valid daily data 

In [None]:
# Attach the per-site record statistics to the site geometries.
df_nyr_data = ds1.to_dataframe().rename_axis('location_name')
df_final = df_site.merge(df_nyr_data, on="location_name", how='inner')

fig, ax1 = plt.subplots(nrows=1, ncols=2, figsize=(9.5, 4), subplot_kw={"projection": ccrs.PlateCarree()}, dpi=100,)
fig.subplots_adjust(left=0.025, bottom=0.025, right=0.965, top=0.95, wspace=0.10, hspace=0.125)

# One entry per panel: (column to plot, colormap, norm, panel title)
map_panels = (
    ('flow_start_yr', cmap6, norm6, 'start year of consecutive valid daily data'),
    ('flow_len_yr', cmap4, norm4, 'consecutive years with complete valid daily data'),
)
for panel_ax, (panel_col, panel_cmap, panel_norm, panel_title) in zip(ax1, map_panels):
    base_map(panel_ax, df_huc12)
    df_final.plot(ax=panel_ax, column=panel_col, markersize=15, cmap=panel_cmap, norm=panel_norm,
                  legend=True, legend_kwds={'extend':'neither', 'pad':0.02})
    panel_ax.set_extent([-125, -110, 41.5, 52.5])
    panel_ax.set_title(panel_title, fontsize=9)

fig.savefig(os.path.join(figure_path, 'Fig_nyr_valid_nat_flow.png'), dpi=300)

## 4. Time series during the calibration periods

In [None]:
%matplotlib agg
plt.rcParams.update({'figure.max_open_warning': 0})

start_date = '1980-10-01'
end_date   = '2004-09-30'

# Build the lookup once instead of scanning the site array for every row.
flow_sites = set(impaired_flow['site'].values.tolist())

for site in df_site.index:
    # Skip sites without flow data BEFORE creating a figure, so no empty figure is left open.
    if site not in flow_sites:
        continue

    # Select the site and period once; both panels use the same series.
    site_flow = impaired_flow.sel(site=site).sel(time=slice(start_date, end_date))

    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7.5, 6.5))

    # Top panel: daily series
    site_flow.plot(ax=ax[0], color='k', linewidth=0.8, label='daily nat. flow')
    ax[0].set_ylabel('streamflow [m3/s]')
    ax[0].set_xlabel('')
    ax[0].legend()

    # Bottom panel: mean for each day of year, rolled by 92 positions so the
    # curve starts around October 1st (coordinates are left unchanged)
    site_flow.groupby("time.dayofyear").mean().roll(dayofyear=92, roll_coords=False).plot(ax=ax[1], color='k', linewidth=0.8, label='nat. flow')
    ax[1].set_title('')
    ax[1].set_ylabel('streamflow [m3/s]')
    ax[1].set_xlabel('day since Oct 1st')

    fig.savefig(os.path.join(figure_path, 'per_site','nat_flow_%s_%s_%s.png'%(site, start_date, end_date)), dpi=100)
    plt.close(fig)  # release the figure once it is written to disk
    # NOTE(review): this break stops after the first plotted site; it looks like a
    # debugging limit - remove it to produce a figure for every site.
    break