# Process Stations

In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

from climpyrical.gridding import transform_coords, find_element_wise_nearest_pos
from scipy.interpolate import NearestNDInterpolator

### Load station data for processing

In [43]:
# Location of the composite station CSV.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_path = '/home/nannau/nrc_data/Interim_snow_rain_load_LR_composite_stations_delivered_v3.csv'

# Column holding the station design value used throughout the notebook.
station_dv = 'RL50 (kPa)'

# Read the file once; the zero-valued copy below is derived from this frame
# rather than from a second read of the same CSV.
df = pd.read_csv(df_path, index_col=None)

# transform into rotated pole and assign as new columns
rlon_st, rlat_st = transform_coords(df['lon'].values, df['lat'].values)
df = df.assign(
        rlon=rlon_st,
        rlat=rlat_st
)

# Full copy (including rows whose design value is zero), taken after the
# rlon/rlat columns exist so both frames share the same columns.
df_zero = df.copy()

# Work only with stations that report a non-zero design value.
df = df[df[station_dv] != 0.0]

In [44]:
# Largest rotated-pole latitude among the retained stations (sanity check).
df['rlat'].max()

27.224762925109236

### Open the pre-processed model

In [45]:
# Pre-processed model dataset, and the name of the variable inside it that is
# compared against the station design value.
ds = xr.open_dataset('./data/processed/pre_processed.nc')
dv = 'Rain-RL50'

### Match stations to grids in model
Find the nearest grid cells to the station locations

In [46]:
# Nearest model grid indices (x along rlon, y along rlat) for every station.
ix, iy = find_element_wise_nearest_pos(
    ds.rlon.values, ds.rlat.values, df.rlon.values, df.rlat.values
)

# One row per station with its matched grid indices, then collapse stations
# that share a grid cell: numeric columns are averaged and the first station
# name in the group is kept.
matched_df = (
    pd.DataFrame(
        {
            'ilocy': iy,
            'ilocx': ix,
            station_dv: df[station_dv],
            'rlat': df.rlat,
            'rlon': df.rlon,
            'lat': df.lat,
            'lon': df.lon,
            'station_name': df.station_name,
        }
    )
    .groupby(['ilocy', 'ilocx'], as_index=False)
    .agg(
        {
            station_dv: 'mean',
            'rlat': 'mean',
            'rlon': 'mean',
            'lat': 'mean',
            'lon': 'mean',
            'station_name': 'first',
        }
    )
)

# From here on the index arrays and station values refer to unique grid cells.
iy = matched_df['ilocy'].values
ix = matched_df['ilocx'].values
station_vals = matched_df[station_dv].values

Identify locations where the closest grid cell falls on an invalid (NaN) model value

In [47]:
# True for matched cells whose model value is NaN.
nan_index = np.isnan(ds[dv].values[iy, ix])

# Grid indices of those NaN cells, shown as the cell output.
ixnan = matched_df.ilocx.values[nan_index]
iynan = matched_df.ilocy.values[nan_index]
ixnan, iynan

(array([1313,  377,  343,  345,  310]), array([752, 771, 818, 850, 915]))

Create an interpolator that can find the nearest valid model value at these locations

In [48]:
# Model values at the matched grid cells (may still contain NaN here).
model_vals = ds[dv].values[iy, ix]

# 2-D coordinate grids matching the layout of the model field.
rlon, rlat = np.meshgrid(ds.rlon, ds.rlat)

# NOTE: despite its name, `model_nan` is True where the model value is NOT NaN.
model_nan = ~np.isnan(ds[dv])

# Nearest-neighbour lookup built only from valid cells: (rlon, rlat) -> value.
f = NearestNDInterpolator(
    np.column_stack((rlon[model_nan], rlat[model_nan])),
    ds[dv].values[model_nan],
)

Replace those invalid values with their nearest valid neighbor, using the interpolator above.

In [49]:
# Query the interpolator at the coordinates of the NaN cells and write the
# nearest valid model value back into the matched values.
model_vals[nan_index] = f(
    np.column_stack((rlon[iynan, ixnan], rlat[iynan, ixnan]))
)

Now each station should be matched with a valid grid cell value

In [50]:
# Station-to-model ratio per matched cell; it must be defined everywhere once
# the NaN cells have been filled.
ratio = matched_df[station_dv] / model_vals
assert not ratio.isna().any()

### Normalize the model mean to match that of the station distribution
Find a factor $\beta$ such that the scaled model mean satisfies $$\frac{\mu_m}{\beta} \approx \mu_s$$ where $\mu_m$ is the model mean and $\mu_s$ is the station mean.

In [51]:
# choose starting value: mean model-to-station ratio
start = np.nanmean(model_vals/station_vals)
# candidate scaling factors searched around the starting value
tol = np.linspace(start-0.8, start+0.8, 1000)

# Mean residual (station minus scaled model) for every candidate factor.
# The search is identical whichever way the unscaled residual leans, so a
# single pass replaces the two previously duplicated branches.
rmtols = np.array([np.nanmean(matched_df[station_dv] - (model_vals/t)) for t in tol])

# A NaN residual would register as a spurious "sign change" below, so fail early.
if np.isnan(rmtols).any():
    raise ValueError("Mean residual is NaN for at least one candidate scaling factor.")

# The best factor is the first candidate at which the mean residual changes sign.
sign_changes = np.where(np.diff(np.sign(rmtols)) != 0.)[0]
if sign_changes.size == 0:
    raise ValueError(
        "No scaling factor in [{}, {}] makes the mean residual change sign; "
        "widen the search range.".format(tol.min(), tol.max())
    )
best_tol = tol[sign_changes[0]]

print(tol.min(), "<=", best_tol, "<=", tol.max())

# apply correction to the matched values and to the full model field
model_vals_corrected = model_vals/best_tol
mean_corrected = ds[dv].values/best_tol

# calculate ratios with applied correction and store them with the stations
ratio = model_vals_corrected/station_vals
matched_df = matched_df.assign(ratio=ratio)

0.07631750039103402 <= 0.8226638467373804 <= 1.676317500391034


In [52]:
# Append the stations whose design value is exactly zero (excluded from the
# matching above) below the matched stations.
final_df = pd.concat(
    [matched_df, df_zero.loc[df_zero[station_dv] == 0.0]],
    sort=False,
)

In [53]:
# Persist the processed stations, then display the frame as the cell output.
final_df.to_csv(path_or_buf='./data/processed/processed_station.csv')
final_df

Unnamed: 0,ilocy,ilocx,RL50 (kPa),rlat,rlon,lat,lon,station_name,ratio,id,province,elev (m),SL50 (kPa)
0,558.0,1009.0,0.402,-4.286849,10.373390,42.275600,-82.955600,WINDSOR A,0.853905,,,,
1,559.0,1009.0,0.484,-4.227125,10.379271,42.333300,-82.933300,WINDSOR RIVERSIDE,0.707769,,,,
2,576.0,1017.0,0.389,-3.494586,10.713492,42.992200,-82.304700,SARNIA AIRPORT,0.894242,,,,
3,577.0,1048.0,0.470,-3.434479,12.106219,42.774390,-80.429060,HAGERSVILLE 2 composite,0.830162,,,,
4,579.0,1046.0,0.587,-3.362845,11.999925,42.866700,-80.550000,DELHI CDA,0.656625,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,,,0.000,6.264631,-10.568321,52.576342,-114.457750,ROCKY MTN HOUSE A composite,,1002.0,AB,964.0,2.0
516,,,0.000,7.123156,-9.770329,53.586200,-113.479294,OLIVER TREE NURSERY composite,,1124.0,AB,676.0,1.8
517,,,0.000,9.005042,-10.025082,55.367700,-114.610000,WAGNER composite,,1734.0,AB,584.0,2.8
518,,,0.000,15.939087,13.391752,61.066650,-69.591650,CAPE HOPES ADVANCE composite,,2075.0,QC,52.0,4.2


In [35]:
# Preview the first five matched stations.
matched_df.head(n=5)

Unnamed: 0,ilocy,ilocx,RL50 (kPa),rlat,rlon,lat,lon,station_name,ratio
0,558,1009,0.402,-4.286849,10.37339,42.2756,-82.9556,WINDSOR A,0.853905
1,559,1009,0.484,-4.227125,10.379271,42.3333,-82.9333,WINDSOR RIVERSIDE,0.707769
2,576,1017,0.389,-3.494586,10.713492,42.9922,-82.3047,SARNIA AIRPORT,0.894242
3,577,1048,0.47,-3.434479,12.106219,42.77439,-80.42906,HAGERSVILLE 2 composite,0.830162
4,579,1046,0.587,-3.362845,11.999925,42.8667,-80.55,DELHI CDA,0.656625


# NRC Locations

In [26]:
nrc_path = '/home/nannau/nrc_data/NBCC_2020_new_coords.xlsm'
df_nrc = pd.read_excel(nrc_path).dropna()

# fill problem values with better values from 2015
id_typo = df_nrc[(df_nrc['2020 Longitude'] > 0) | (df_nrc['2020 Latitude'] < 40)].index
df_nrc.loc[id_typo, '2020 Longitude'] = df_nrc['2015 Long.'].values[id_typo]
df_nrc.loc[id_typo, '2020 Latitude'] = df_nrc['2015 Lat.'].values[id_typo]

# get rotated pole coordinates
rlon_st, rlat_st = transform_coords(df_nrc['2020 Longitude'].values, df_nrc['2020 Latitude'].values)
df_nrc = df_nrc.assign(
        rlon=rlon_st, 
        rlat=rlat_st
)

In [31]:
# Nearest model grid indices for every NRC location.
# Note: this rebinds `ix` and `iy`, replacing the station indices.
ix, iy = find_element_wise_nearest_pos(
    ds.rlon.values, ds.rlat.values, df_nrc.rlon.values, df_nrc.rlat.values
)

# New frame: grid indices first, followed by the location columns carried
# over unchanged from `df_nrc`.
df_nrc_matched = pd.DataFrame(
    {
        'ilocy': iy,
        'ilocx': ix,
        **{
            col: df_nrc[col]
            for col in ('Location', 'rlat', 'rlon', '2020 Longitude', '2020 Latitude')
        },
    }
)

# Persist the matched NRC locations.
df_nrc_matched.to_csv('./data/processed/nrc_locations.csv')