## NB5. Analyze site location on MERIT-basin river reaches and its impact on flow bias

Three site location categories:

1. site located close to upstream end
2. site located close to downstream end
3. site located in the middle of the reach

In [None]:
%matplotlib inline  
import os, sys
import glob
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
from shapely.geometry import Point
import cartopy.crs as ccrs

from scripts.utility import base_map

# Report the interpreter and xarray versions in use
py_major, py_minor, py_micro = sys.version_info[:3]
print("\nThe Python version: %s.%s.%s" % (py_major, py_minor, py_micro))
print(xr.__name__, xr.__version__)

# RuntimeWarnings are silenced for the rest of the notebook
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

## 1. Setup

In [None]:
# directories
# Top directory of the dataset. Set the PNW_ARCHIVE_DIR environment variable to point at a
# local copy of the archive; when it is not set, the original hard-coded location is used.
main_path  = os.environ.get(
    'PNW_ARCHIVE_DIR',
    '/glade/campaign/ral/hap/mizukami/archive/pnw_hydrology/final_archive_v1',
)
geo_path   = os.path.join(main_path, 'ancillary_data', 'geospatial_data')
nrni_path  = os.path.join(main_path, 'ancillary_data')

# Figure output directory (relative to the working directory); creating the 'per_site'
# sub-directory also creates figure_path itself.
figure_path = 'NB5_figures'
os.makedirs(os.path.join(figure_path, 'per_site'), exist_ok=True)

In [None]:
# Calibration / validation windows used when slicing time series
cal_period = slice('1991-10-01', '2001-09-30')
val_period = slice('2001-10-01', '2010-09-30')

# Window used for skill-metric computation (alternative: slice('1971-10-01', '2010-09-30'))
analysis_period = val_period

# Simulation cases: case key -> display metadata
case_meta = dict(
    GMET=dict(label='SUMMA-mizuRoute', color='red'),
)
ds_sim = dict()

# Case keys, with and without the observation entry
sim_case = [*case_meta]
all_case = [*sim_case, 'obs']

## 2. Load data

### 2.1 geospatial data - SUMMA catchment and routing catchment

In [None]:
%%time
df_huc12 = gpd.read_file(os.path.join(geo_path, 'HUC12_MERIT_PNW.gpkg'))
df_huc12['geometry'] = df_huc12.geometry.simplify(0.01) # simplified
#df_huc12_w_cascade = gpd.read_file(os.path.join(geo_path, 'subregion','HUC12_cascade.gpkg'))

### 2.1 geospatial data - nat. flow site (gpkg)

In [None]:
# Naturalized-flow sites: keep rows whose 'removed' flag is 0 and index them by site name
site_gpkg = os.path.join(geo_path, 'PNW_flow_site.gpkg')
df_site = (
    gpd.read_file(site_gpkg)
    .loc[lambda sites: sites['removed'] == 0]
    .set_index('location_name')
)
df_site['route_id'] = df_site['route_id'].astype('int')

# this site is likely to be an incorrect reach correspondence
df_site = df_site.drop(['SOLW'])

# Re-projected copy (EPSG:32100) with each site's first coordinate tuple stored in 'coords'
df_site_eqd = df_site.to_crs("EPSG:32100") #Lambert Conformal Conic (LCC)
df_site_eqd['coords'] = df_site_eqd.geometry.apply(lambda point: point.coords[0])

print('Number of NRNI sites: %d'%len(df_site))

### 2.2 geospatial data - MERIT-basin reach (gpkg)

In [None]:
%%time
df_reach = gpd.read_file(os.path.join(geo_path, 'rivEndoMERITpfaf_PNW.gpkg'))
df_reach = df_reach.set_index('COMID')

df_reach_eqd = df_reach.to_crs("EPSG:32100") #Lambert Conformal Conic (LCC)
df_reach_eqd['coords'] = df_reach_eqd.geometry.apply(lambda geom: list(geom.coords))

### 2.3 geospatial data - MERIT-basin catchment (gpkg)

In [None]:
# MERIT-basin catchment polygons, indexed by hruid
catch_gpkg = os.path.join(geo_path, 'catEndoMERITpfaf_PNW.gpkg')
df_catch = gpd.read_file(catch_gpkg).set_index('hruid')

### 2.4 Link between river network reach ID and site name
There are fewer unique reach IDs co-located with flow sites than there are sites, probably because some reaches contain more than one site.

In [None]:
# Site table (CSV); its 'route_id' column is used later to subset the simulations
site_table_csv = os.path.join(geo_path, 'PNW_flow_site.csv')
df_merit_id = pd.read_csv(site_table_csv)
df_merit_id.head()

### 2.5 Read nat. flow data

In [None]:
# Naturalized flow dataset and the names of the sites it contains
nrni_file = os.path.join(nrni_path, 'PNW_unimpaired_flow_1951-2018_latlon.nc')
ds_nrni = xr.open_dataset(nrni_file)
nrni_site = ds_nrni['site'].values
print('Number of nrni sites: %d'%len(nrni_site))

### 2.6 Read retrospective streamflow simulations

In [None]:
%%time
ds_summa = {}
for case,_ in case_meta.items():
    nclist=glob.glob(os.path.join(main_path, f'{case}_hist', 'mizuRoute_daily.nc'))
    ds_tmp = xr.open_mfdataset(nclist, parallel=True)
    ds_tmp = ds_tmp.assign_coords(seg=ds_tmp['reachID'])
    ds_summa[case] = ds_tmp.sel(seg=df_merit_id['route_id'].values).load()
    ds_summa[case] = ds_summa[case].rename_vars({'streamflow':'outflow'})
    ds_summa[case]['inflow'] = ds_summa[case]['outflow']-ds_summa[case]['local_runoff']
    
#print('Number of SUMMA sites: %d'%len(ds_summa[case].reachID))

## 3. Mapping 3 categories of naturalized flow site
    
    1 upstream: site near the upstream end (within 30% of the total reach length from the upstream end)
    2 downstream: site near the downstream end (within 30% of the total reach length from the downstream end)
    3 middle: site in the middle of the reach (between the upstream and downstream thresholds)

In [None]:
# Classify every site by where it sits along its reach, using the along-line distance between
# the first flowline vertex and the flowline vertex closest to the site.
# NOTE(review): treating the first vertex as the downstream end assumes the flowline vertices
# are ordered downstream -> upstream - confirm against the MERIT-basin flowlines.
DOWNSTREAM_FRACTION = 0.3  # below this fraction of the reach length -> 'downstream'
UPSTREAM_FRACTION = 0.7    # above this fraction of the reach length -> 'upstream'

for index, row in df_site_eqd.iterrows():

    reach_id = row['route_id']
    # flowline vertices as an (n_nodes, 2) array of projected x, y
    nodes = np.asarray(df_reach_eqd['coords'].loc[reach_id], dtype=float)[:, :2]

    # cumulative along-line distance from the first vertex to every vertex
    segment_lengths = np.sqrt((np.diff(nodes, axis=0)**2).sum(axis=1))
    distances = np.concatenate(([0.0], np.cumsum(segment_lengths)))

    # vertex closest to the site; squared distances are enough for ranking and
    # argmin keeps the first vertex on ties
    site_xy = np.asarray(row['coords'], dtype=float)[:2]
    ixx = int(np.argmin(((nodes - site_xy)**2).sum(axis=1)))

    length_from_downstream = distances[ixx]
    meters_reach = df_reach_eqd.geometry.loc[reach_id].length
    relative_position = length_from_downstream/meters_reach

    df_site.loc[index, 'dist_from_downstream'] = length_from_downstream
    df_site.loc[index, 'flowline_length'] = meters_reach
    # NOTE: despite the column name, this stores the reach 'lengthkm' attribute multiplied by 1000
    df_site.loc[index, 'lengthkm'] = df_reach_eqd['lengthkm'].loc[reach_id]*1000

    if relative_position < DOWNSTREAM_FRACTION:
        df_site.loc[index, 'location_in_reach'] = 'downstream'
    elif relative_position > UPSTREAM_FRACTION:
        df_site.loc[index, 'location_in_reach'] = 'upstream'
    else:
        df_site.loc[index, 'location_in_reach'] = 'middle'

In [None]:
# Overview map: every site coloured by its location within the reach
fig, ax = plt.subplots(1, figsize=(4.75, 5.0), subplot_kw={"projection": ccrs.PlateCarree()}, dpi=150)
base_map(ax, df_huc12)

# Rectangle spanning lon -122.4..-121.365, lat 46.85..47.6 (the extent of the zoomed map)
zoom_box = mpl.patches.Rectangle((-122.4, 46.85), 1.035, 0.75, edgecolor='black', facecolor='none', linewidth=0.7)
ax.add_patch(zoom_box)

# location category -> marker colour, drawn in this order
location_colors = {'upstream': 'red', 'downstream': 'blue', 'middle': 'yellow'}
for location, color in location_colors.items():
    sites_here = df_site[df_site['location_in_reach']==location]
    sites_here.plot(ax=ax, markersize=5, color=color, zorder=1, legend=True);
ax.set_extent([-125, -110, 41.5, 52.5])

# Manual legend: one marker-only handle per category
legend_elements = [
    mpl.lines.Line2D([0], [0], marker='o', color='none', label=location.capitalize(),
           markerfacecolor=color, markeredgecolor='none', markersize=5)
    for location, color in location_colors.items()
]
ax.legend(handles=legend_elements, loc='upper right', frameon=False)

overview_png = os.path.join(figure_path, 'flow_site_location.png')
plt.savefig(overview_png, bbox_inches='tight', dpi=300)

In [None]:
# Zoomed map: catchments, reaches and sites within lon -122.4..-121.365, lat 46.85..47.6
fig, ax = plt.subplots(1, figsize=(5, 5), subplot_kw={"projection": ccrs.PlateCarree()}, dpi=150)
df_catch.plot(ax=ax, lw=0.1, facecolor="xkcd:light grey", edgecolor="xkcd:black", zorder=0)
df_reach.plot(ax=ax, lw=0.8, zorder=0);

# sites, one colour per location category
for location, color in (('upstream', 'red'), ('downstream', 'blue'), ('middle', 'yellow')):
    sites_here = df_site[df_site['location_in_reach']==location]
    sites_here.plot(ax=ax, markersize=15, color=color, zorder=1);

ax.set_extent([-122.4, -121.365, 46.85, 47.6])
zoomed_png = os.path.join(figure_path, 'flow_site_location_zoomed.png')
plt.savefig(zoomed_png, bbox_inches='tight', dpi=300)

### Map of a selected site

In [None]:
def get_upstream(id, id_all, next_id_all, verbose=True):
    """Collect an element and every element that drains into it.

    Parameters
    ----------
    id : scalar
        ID of the outlet element.
    id_all : array-like
        IDs of all elements.
    next_id_all : array-like
        For each entry of ``id_all``, the ID of the element immediately downstream of it.
    verbose : bool, optional
        If True (default), print the outlet ID and a running count after each search round.

    Returns
    -------
    list
        ``id`` first, followed by the upstream IDs grouped by search round
        (sorted within each round). Every ID appears once.
    """
    id_all = np.asarray(id_all)
    next_id_all = np.asarray(next_id_all)

    all_ups_ids = [id]
    seen = {id}       # set lookup keeps the "already found?" check cheap
    frontier = [id]   # elements whose immediate upstream neighbours are searched next
    round_num = 0
    if verbose:
        print(f'  Outlet ID: {id}')

    while True:
        # every element draining directly into any element of the current frontier
        found = np.unique(id_all[np.isin(next_id_all, frontier)])
        frontier = [up_id for up_id in found if up_id not in seen]
        if not frontier:
            break
        seen.update(frontier)
        all_ups_ids.extend(frontier)

        round_num += 1
        if verbose:
            print("Round %d. Totally %d elements are found." % (round_num, len(all_ups_ids)))
    return all_ups_ids

In [None]:
site_name = 'HAH' #SPEW, ELEW, HAH, REXW

# Reaches and catchments draining to the reach of the selected site
reach_id = df_site.loc[site_name, 'route_id']
upstream_id = get_upstream(reach_id, df_reach.index.values, df_reach['NextDownID'].values)

df_reach_basin = df_reach.loc[upstream_id]
df_cat_basin = df_catch.loc[upstream_id]

# Sites that fall within the dissolved basin polygon
basin_polygon = df_cat_basin.dissolve().geometry.iloc[0]
df_selected = df_site[df_site.geometry.within(basin_polygon)]

# plot extent: basin bounding box padded by 2% of its width / height
xmin, ymin, xmax, ymax = df_cat_basin.total_bounds
xbuffer = 0.02*(xmax-xmin)
ybuffer = 0.02*(ymax-ymin)

# plotting....
fig, ax = plt.subplots(1, figsize=(7.5, 3.5), subplot_kw={"projection": ccrs.PlateCarree()}, dpi=150)
df_cat_basin.plot(ax=ax, lw=0.1, facecolor="xkcd:light grey", edgecolor="xkcd:black", zorder=0)
df_reach_basin.plot(ax=ax, lw=0.3, zorder=1);
# reaches that carry a selected site are drawn thicker and darker
site_reach_ids = df_selected['route_id'].values
df_reach_basin.loc[site_reach_ids].plot(ax=ax, lw=0.7, color='xkcd:dark blue',zorder=1);
df_reach.plot(ax=ax, lw=0.3, zorder=0);

# site markers, one colour per location category; categories absent from the basin are skipped
for location, color in (('upstream', 'red'), ('downstream', 'blue'), ('middle', 'yellow')):
    in_category = df_selected['location_in_reach'] == location
    if in_category.any():
        df_selected[in_category].plot(ax=ax, markersize=15, color=color, zorder=2);

# site-name labels, offset 3 points from each marker
for label, point in zip(df_selected.index, df_selected.geometry):
    ax.annotate(label, xy=(point.x, point.y), xytext=(3, 3), textcoords="offset points", fontsize=7,) #bbox=dict(facecolor='w', linewidth=0, alpha=0.5)

ax.set_extent([xmin-xbuffer, xmax+xbuffer, ymin-ybuffer, ymax+ybuffer])
site_map_png = os.path.join(figure_path, f'flow_site_location_{site_name}.png')
plt.savefig(site_map_png, bbox_inches='tight', dpi=300)

## 4. Inflow and outflow at a selected site

In [None]:
def is_aj(month):
    """Return True (element-wise) where ``month`` lies in April..July, i.e. 4 <= month <= 7."""
    april_or_later = month >= 4
    july_or_earlier = month <= 7
    return april_or_later & july_or_earlier

# ---- setup for summary plot
site_name = 'HAH' #SPEW, ELEW, HAH, REXW
reach_id = df_site.loc[site_name, 'route_id']
# simulation case to plot; set explicitly rather than relying on the loop variable
# left over from the cell that loads the simulations
case = sim_case[-1]

period_name = 'validation'
# cal period plot -> slice('1997-10-01', '2001-09-30') for daily, and slice('1991-10-01', '2001-09-30') for month
# val period plot -> slice('2001-10-01', '2005-09-30') for daily, and slice('2001-10-01', '2010-09-30') for month
if period_name == 'calibration':
    daily_period = slice('1997-10-01', '2001-09-30')
    month_period = slice('1991-10-01', '2001-09-30')
elif period_name == 'validation':
    daily_period = slice('2001-10-01', '2005-09-30')
    month_period = slice('2001-10-01', '2010-09-30')
else:
    raise ValueError(f"period_name must be 'calibration' or 'validation', got {period_name!r}")

var_meta = {'outflow':{'color':'red', 'label':'outflow'},
            'inflow':{'color':'blue', 'label':'inflow'},
           }
flow_vars = ['outflow', 'inflow']

# simulated and observed series at the selected site
ds_sim_site = ds_summa[case].sel(seg=reach_id)
ds_obs_site = ds_nrni.sel(site=site_name)['streamflow']
ds_sim_period = ds_sim_site.sel(time=month_period)
ds_obs_period = ds_obs_site.sel(time=month_period)

# get daily flow
ds_sim_daily = ds_sim_site.sel(time=daily_period)
ds_obs_daily = ds_obs_site.sel(time=daily_period)

# get monthly flow
ds_sim_month = ds_sim_period.resample(time='1ME').mean()
ds_obs_month = ds_obs_period.resample(time='1ME').mean()

# get monthly climatological flow
ds_sim_month_clim = ds_sim_period.groupby('time.month').mean('time')
ds_obs_month_clim = ds_obs_period.groupby('time.month').mean('time')

# water-year (October start) mean flow
ds_sim_ann_wy = ds_sim_period.resample(time='YS-OCT').mean()
ds_obs_ann_wy = ds_obs_period.resample(time='YS-OCT').mean()

# water-year mean of the April-July monthly means
ds_sim_AJ_wy = ds_sim_month.sel(time=is_aj(ds_sim_month['time.month'])).resample(time='YS-OCT').mean()
ds_obs_AJ_wy = ds_obs_month.sel(time=is_aj(ds_obs_month['time.month'])).resample(time='YS-OCT').mean()

# ----- creating figure
fig = plt.figure(figsize=(6.5, 9.5))

AX = mpl.gridspec.GridSpec(4,2)
AX.update(wspace = 0.5, hspace = 0.5)
ax1 = plt.subplot(AX[0,:])
ax2 = plt.subplot(AX[1,:])
ax3 = plt.subplot(AX[2,:])
ax4 = plt.subplot(AX[3,0])
ax5 = plt.subplot(AX[3,1])

# plot monthly
for var in flow_vars:
    ds_sim_month[var].plot(ax=ax1, c=var_meta[var]['color'], linewidth=1.0, label=var_meta[var]['label'])
ds_obs_month.plot(ax=ax1, color='k', linestyle='dashed', linewidth=0.8, label='obs')
#ax1.axvspan(pd.to_datetime(cal_period.start), pd.to_datetime(cal_period.stop), alpha=0.2, color='gray')  # this is shading period between two datetime

# plot daily flow over daily_period, smoothed with a centered 7-day rolling mean
for var in flow_vars:
    ds_sim_daily[var].rolling(time=7, center=True).mean().plot(ax=ax2, c=var_meta[var]['color'], linewidth=1.0, label=None, add_legend=False)
ds_obs_daily.rolling(time=7, center=True).mean().plot(ax=ax2, color='k', linestyle='dashed', linewidth=0.8, add_legend=False)
#ax2.axvspan(pd.to_datetime(cal_period.start), pd.to_datetime(cal_period.stop), alpha=0.2, color='gray') # this is shading period between two datetime

# plot monthly long term averages for period
# (values are rolled by 3 so that October comes first; tick labels are set to match)
for var in flow_vars:
    ds_sim_month_clim[var].roll(month=3, roll_coords=False).plot(ax=ax3, c=var_meta[var]['color'], linewidth=1.0, label=None, add_legend=False)
ds_obs_month_clim.roll(month=3, roll_coords=False).plot(ax=ax3, color='k', linestyle='dashed', linewidth=0.8, add_legend=False)
ax3.set_xticks(np.arange(1,13), ['Oct','Nov','Dec','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep'])

# plot scatter for water year mean flow
axmax1 = np.max(ds_obs_ann_wy.values)
# off-axis points that only provide legend handles (the ax4 legend is currently disabled)
ax4.scatter(-10,-10, s=12, c='k', marker='x', label='entire')
ax4.scatter(-10,-10, s=10, c='k', marker='o', label='calib')
for var in flow_vars:
    axmax1 = np.max([axmax1, np.max(ds_sim_ann_wy[var].values)])
    ax4.scatter(ds_sim_ann_wy[var].values, ds_obs_ann_wy.values, s=12, c=var_meta[var]['color'], marker='x', alpha=1.0)
ax4.plot((0, axmax1*1.05), (0, axmax1*1.05), c='orange', linestyle=':', label=None)  # 1:1 line
#    ax4.annotate('corr: '+str(round(corr_WY[0], 3)), xy=(axmax*0.97, axmax*0.10), horizontalalignment='right')

# plot scatter for spring runoff period (Apr-Jul)
axmax = np.max(ds_obs_AJ_wy.values)
for var in flow_vars:
    axmax = np.max([axmax, np.max(ds_sim_AJ_wy[var].values)])
    ax5.scatter(ds_sim_AJ_wy[var].values, ds_obs_AJ_wy.values, s=12, c=var_meta[var]['color'], marker='x', alpha=1.0)
ax5.plot((0, axmax*1.05), (0, axmax*1.05), c='orange', linestyle=':', label=None)  # 1:1 line
#    ax5.annotate('corr: '+str(round(corr_AJ[0], 3)), xy=(axmax*0.97, axmax*0.10), horizontalalignment='right')

# other plot details
ax1.set_xlabel('')
ax1.set_ylabel('Flow, Monthly (cms)')
ax2.set_title('')
ax2.set_xlabel('')
ax2.set_ylabel('Flow, Daily (cms)')
ax3.set_title('',fontsize="small")
ax3.set_ylabel('Flow, Monthly (cms)')
ax3.set_xlabel('Month')
ax4.set_ylabel('WY Obs (cms)')
ax4.set_xlabel('WY Sim (cms)')
# axis limits are only set when the maximum is defined (np.max returns NaN if any value is NaN)
if not np.isnan(axmax1):
    ax4.set_xlim([0, axmax1*1.05])
    ax4.set_ylim([0, axmax1*1.05])
if not np.isnan(axmax):
    ax5.set_xlim([0, axmax*1.05])
    ax5.set_ylim([0, axmax*1.05])
# labels match the Apr-Jul months selected by is_aj
ax5.set_ylabel('Apr-Jul Obs (cms)')
ax5.set_xlabel('Apr-Jul Sim (cms)')
ax1.legend(loc='best', fontsize="x-small")
#ax4.legend(loc='best', fontsize="small")
#ax2.legend().remove()
ax1.set_title(site_name, fontsize=12)
plotFname=os.path.join(figure_path,f'hydrograph_inflow_outflow_{site_name}.png')
plt.savefig(plotFname, dpi=300, bbox_inches='tight')

## 5. %bias of flows per three categories of site locations

In [None]:
%%time
for index, row in df_site.iterrows():
    reach_id = df_site.loc[index, 'route_id']
    if df_site.loc[index, 'flowDataExist']==1:
        summa_flow = ds_summa[case]['outflow'].sel(seg=reach_id, time=analysis_period).values
        summa_latflow = ds_summa[case]['local_runoff'].sel(seg=reach_id, time=analysis_period).values
        nrni_flow = ds_nrni['streamflow'].sel(site=index, time=analysis_period).values
        df_site.loc[index,'flow_bias'] = np.nansum(summa_flow-nrni_flow)/np.nansum(nrni_flow)*100
        df_site.loc[index,'flow_bias_no_latflow'] = np.nansum(summa_flow-nrni_flow-summa_latflow)/np.nansum(nrni_flow)*100

In [None]:
# Box plot of %bias grouped by site location in reach.
# Only sites with flow data and a longitude west of MAX_LONGITUDE are included.
MAX_LONGITUDE = -121.3  # NOTE(review): presumably restricts the sample to a sub-region - confirm intent

in_sample = (df_site['location_longitude'] < MAX_LONGITUDE) & (df_site['flowDataExist'] == 1)

def _finite_values(location, column):
    """Return the non-NaN values of `column` for sampled sites in the given location category."""
    values = df_site.loc[in_sample & (df_site['location_in_reach'] == location), column].values
    return values[~np.isnan(values)]

locations = ['downstream', 'middle', 'upstream']
bias_list = [_finite_values(location, 'flow_bias') for location in locations]
# fourth box: the upstream sites again, using the bias computed without local runoff
bias_list.append(_finite_values('upstream', 'flow_bias_no_latflow'))

fig, ax = plt.subplots(figsize=(6.5, 4))
ax.boxplot(bias_list)
ax.grid(True, which='major')
ax.set_ylim([-40,100])
ax.set_ylabel('%bias')
# tick labels name the location directly, in the same order as bias_list
ax.set_xticklabels(locations + ['upstream\n(no lateral flow)'])
ax.set_xlabel('site location in reach')
ax.set_title('');
plt.savefig(os.path.join(figure_path,'Flow_bias_per_site_location_in_reach.png'), bbox_inches='tight', dpi=300)