# Process Stations

In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

from climpyrical.gridding import transform_coords, find_element_wise_nearest_pos
from scipy.interpolate import NearestNDInterpolator



### Load station data for processing

In [2]:
# df_path = '/home/nannau/nrc_data/Interim_snow_rain_load_LR_composite_stations_delivered_v3.csv'
# df_path = '/home/nannau/nrc_data/Interim_hdd_Tmax_Tmin_delivered.csv'
df_path = '/home/nannau/nrc_data/Interim_snow_rain_load_LR_composite_stations_tbd_v4.csv'

df = pd.read_csv(df_path, index_col=None).dropna()
df_zero = pd.read_csv(df_path, index_col=None)


# station_dv = 'RL50 (kPa)'
station_dv = 'SL50 (kPa)'

# station_dv = 'HDD (degC-day)'

# transform into rotated pole and assign as new columns
rlon_st, rlat_st = transform_coords(df['lon'].values, df['lat'].values)
df = df.assign(
        rlon=rlon_st, 
        rlat=rlat_st
)

df_zero = df.copy()

df = df[df[station_dv] != 0.0]

  return _prepare_from_string(" ".join(pjargs))


In [3]:
df

Unnamed: 0,id,station_name,province,lon,lat,elev (m),SL50 (kPa),RL50 (kPa),rlon,rlat
0,15,CHEMAINUS,BC,-123.742000,48.935000,75,2.2,0.794,-17.245771,4.407283
1,78,SAANICHTON CDA,BC,-123.419000,48.621700,61,1.4,0.594,-17.148280,4.041212
2,91,SHAWNIGAN LAKE,BC,-123.626000,48.646900,159,2.4,0.818,-17.269400,4.110152
3,109,VICTORIA GONZALES HTS,BC,-123.325000,48.413100,70,1.3,0.387,-17.157965,3.823732
4,149,ALERT BAY,BC,-126.933000,50.583300,59,1.6,0.587,-18.602266,6.674747
...,...,...,...,...,...,...,...,...,...,...
515,1002,ROCKY MTN HOUSE A composite,AB,-114.457750,52.576342,964,2.0,0.000,-10.568321,6.264631
516,1124,OLIVER TREE NURSERY composite,AB,-113.479294,53.586200,676,1.8,0.000,-9.770329,7.123156
517,1734,WAGNER composite,AB,-114.610000,55.367700,584,2.8,0.000,-10.025082,9.005042
518,2075,CAPE HOPES ADVANCE composite,QC,-69.591650,61.066650,52,4.2,0.000,13.391752,15.939087


In [4]:
df.rlat.max()

36.222662646628834

### Open the pre-processed model

In [6]:
# dv = 'Rain-RL50'
# dv = 'heating_degree_days_per_time_period'
dv = 'snw'
ds = xr.open_dataset('./data/processed/pre_processed.nc')
ds

### Match stations to grids in model
Find the nearest grid cells to the station locations

In [7]:
ix, iy = find_element_wise_nearest_pos(
            ds.rlon.values, 
            ds.rlat.values, 
            df.rlon.values, 
            df.rlat.values
)

matched_df = pd.DataFrame({'ilocy': iy, 'ilocx': ix, station_dv: df[station_dv], 'rlat':df.rlat, 'rlon':df.rlon, 'lat': df.lat, 'lon':df.lon, 'station_name':df.station_name})
matched_df = matched_df.groupby(['ilocy', 'ilocx'], as_index=False).agg({station_dv: 'mean', 'rlat':'mean', 'rlon':'mean', 'lat': 'mean', 'lon': 'mean', 'station_name': 'first'})

iy = matched_df.ilocy.values
ix = matched_df.ilocx.values
station_vals = matched_df[station_dv].values

In [8]:
print(station_vals.shape, df.shape)

(518,) (520, 10)


Identify locations where the closest grid cell falls on a invalid model value

In [9]:
nan_index = np.isnan(ds[dv].values[iy, ix])
ixnan, iynan = matched_df.ilocx.values[nan_index], matched_df.ilocy.values[nan_index]
ixnan, iynan

(array([1254, 1313,  377,  343,  345,  310]),
 array([680, 752, 771, 818, 850, 915]))

Create an interpolator that can find the nearest valid model value at these locations

In [10]:
rlon, rlat = np.meshgrid(ds.rlon, ds.rlat)

model_nan = ~np.isnan(ds[dv])
model_vals = ds[dv].values[iy, ix]

f = NearestNDInterpolator(np.stack([rlon[model_nan], rlat[model_nan]]).T, ds[dv].values[model_nan])

Replace those invalid values with their nearest neighbor based on above interpolator.

In [11]:
model_vals[nan_index] = f(np.stack([rlon[iynan, ixnan], rlat[iynan, ixnan]]).T)

Now each station should be matched with a value grid cell value

In [12]:
ratio = matched_df[station_dv]/model_vals
assert not np.any(np.isnan(ratio))

### Normalize the model mean to match that of the station distribution
Find a factor, $\beta$ such that the model mean $$\frac{\mu_m}{\beta} \approx \mu_s$$ where $\mu_s$ is the station mean. 

In [13]:
# choose starting value
station_vals_no_0 = station_vals[station_vals != 0.0]
model_vals_no_0 = model_vals[station_vals != 0.0]
start = np.nanmean(model_vals_no_0/station_vals_no_0)
# enter tolerances
tol = np.linspace(0.0, start+2, 10000)

if np.nanmean(station_vals_no_0 - model_vals_no_0) >= 0:
    rmtols = np.array([np.nanmean(station_vals_no_0 - (model_vals_no_0/t)) for t in tol])
    best_tol = tol[np.where(np.diff(np.sign(rmtols)) != 0.)[0][0]]

if np.nanmean(matched_df[station_dv] - model_vals) < 0:
    rmtols = np.array([np.nanmean(matched_df[station_dv] - (model_vals/t)) for t in tol])
    best_tol = tol[np.where(np.diff(np.sign(rmtols)) != 0.)[0][0]]

print(tol.min(), "<=", best_tol, "<=", tol.max())

# apply correction
model_vals_corrected = model_vals/best_tol
mean_corrected = ds[dv].values/best_tol
ratio = station_vals/model_vals_corrected



# calculate ratios with applied correction
matched_df = matched_df.assign(ratio=ratio)

  rmtols = np.array([np.nanmean(station_vals_no_0 - (model_vals_no_0/t)) for t in tol])


0.0 <= 0.713541841372574 <= 2.7957307491709904


In [14]:
matched_df = matched_df.assign(model_vals = model_vals, best_tol = best_tol)

In [15]:
# final_df = pd.concat([matched_df, df_zero[df_zero[station_dv] == 0.0]], sort=False)

In [16]:
# final_df.to_csv('./data/processed/processed_station.csv')
matched_df.to_csv('./data/processed/processed_station.csv')

matched_df

Unnamed: 0,ilocy,ilocx,SL50 (kPa),rlat,rlon,lat,lon,station_name,ratio,model_vals,best_tol
0,558,1009,1.0,-4.286849,10.373390,42.27560,-82.95560,WINDSOR A,1.184540,0.602379,0.713542
1,559,1009,1.2,-4.227125,10.379271,42.33330,-82.93330,WINDSOR RIVERSIDE,1.413389,0.605814,0.713542
2,576,1017,1.5,-3.494586,10.713492,42.99220,-82.30470,SARNIA AIRPORT,1.448106,0.739112,0.713542
3,577,1048,1.4,-3.434479,12.106219,42.77439,-80.42906,HAGERSVILLE 2 composite,1.192637,0.837605,0.713542
4,579,1046,1.5,-3.362845,11.999925,42.86670,-80.55000,DELHI CDA,1.342166,0.797452,0.713542
...,...,...,...,...,...,...,...,...,...,...,...
513,1269,788,1.5,27.224763,0.601810,74.71690,-94.96940,RESOLUTE CARS,0.688837,1.553797,0.713542
514,1322,640,1.2,29.596811,-5.969242,76.23330,-119.33300,MOULD BAY A,0.650864,1.315559,0.713542
515,1362,741,1.6,31.345756,-1.484925,78.78330,-103.53300,ISACHSEN,0.686823,1.662243,0.713542
516,1391,826,0.9,32.645419,2.272431,79.98330,-85.93330,EUREKA A,0.329829,1.947032,0.713542


In [17]:
matched_df.head()

Unnamed: 0,ilocy,ilocx,SL50 (kPa),rlat,rlon,lat,lon,station_name,ratio,model_vals,best_tol
0,558,1009,1.0,-4.286849,10.37339,42.2756,-82.9556,WINDSOR A,1.18454,0.602379,0.713542
1,559,1009,1.2,-4.227125,10.379271,42.3333,-82.9333,WINDSOR RIVERSIDE,1.413389,0.605814,0.713542
2,576,1017,1.5,-3.494586,10.713492,42.9922,-82.3047,SARNIA AIRPORT,1.448106,0.739112,0.713542
3,577,1048,1.4,-3.434479,12.106219,42.77439,-80.42906,HAGERSVILLE 2 composite,1.192637,0.837605,0.713542
4,579,1046,1.5,-3.362845,11.999925,42.8667,-80.55,DELHI CDA,1.342166,0.797452,0.713542


# NRC Locations

In [18]:
nrc_path = '/home/nannau/nrc_data/NBCC_2020_new_coords.xlsm'
df_nrc = pd.read_excel(nrc_path).dropna()

# fill problem values with better values from 2015
id_typo = df_nrc[(df_nrc['2020 Longitude'] > 0) | (df_nrc['2020 Latitude'] < 40)].index
df_nrc.loc[id_typo, '2020 Longitude'] = df_nrc['2015 Long.'].values[id_typo]
df_nrc.loc[id_typo, '2020 Latitude'] = df_nrc['2015 Lat.'].values[id_typo]

# get rotated pole coordinates
rlon_st, rlat_st = transform_coords(df_nrc['2020 Longitude'].values, df_nrc['2020 Latitude'].values)
df_nrc = df_nrc.assign(
        rlon=rlon_st, 
        rlat=rlat_st
)

  return _prepare_from_string(" ".join(pjargs))


In [19]:
# find indices of the station locations
ix, iy = find_element_wise_nearest_pos(
            ds.rlon.values, 
            ds.rlat.values, 
            df_nrc.rlon.values, 
            df_nrc.rlat.values
)
# write to new dataframe
df_nrc_matched = pd.DataFrame(
    {'ilocy': iy, 
     'ilocx': ix, 
     'Location': df_nrc.Location, 
     'rlat':df_nrc.rlat, 
     'rlon':df_nrc.rlon, 
     'lon': df_nrc['2020 Longitude'], 
     'lat': df_nrc['2020 Latitude']
     }
)

df_nrc_matched.to_csv('./data/processed/nrc_locations.csv')