# calculates the seasonal anomalies and the seasonal percentile categories from the gridded VCSN monthly files, using the NZ 6 regions shapefiles and the [salem](https://salem.readthedocs.io/en/stable/) library for spatial extraction

In [1]:
# Parameters

# var_name = 'Rain_bc'
# var_name = 'Tmin_N'
# var_name = 'Tmax_N'
# var_name = 'Tmean_N'
var_name = 'SoilM'
# var_name = 'Wind'
# var_name = 'Rad'

in ['Agent', 'Lat', 'Longt', 'Date', 'MSLP', 'PET', 'Rain', 'RH', 'SoilM',
       'ETmp', 'Rad', 'TMax', 'Tmin', 'VP', 'Wind', 'Rain_bc', 'Tmax_N',
       'Tmin_N']

### number of quantiles 

In [2]:
num_quantiles = 3

In [3]:
import os
import sys
import pathlib

In [4]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from itertools import product

In [5]:
import salem
import geopandas as gpd

In [6]:
salem.__version__

'0.3.0-1-g697762b'

In [7]:
import xarray as xr

### function to calculate the anomalies with respect to the 1981 - 2010 climatology 

In [8]:
def demean(x, clim_start='1981', clim_end='2010'):
    """Return the anomalies of `x` with respect to its climatological mean.

    Parameters
    ----------
    x : pandas.Series
        Time-series indexed by a (sorted) DatetimeIndex.
    clim_start, clim_end : str
        First and last year (both inclusive) of the climatological period,
        used as label-based slice bounds on the index.
        Defaults to the 1981 - 2010 normal.

    Returns
    -------
    pandas.Series
        `x` minus the mean of `x` over the climatological period,
        with the same index as `x`.
    """
    return x - x.loc[clim_start:clim_end].mean()

In [9]:
var_name

'SoilM'

### big_var is the simplified version of the variable, so Rain_bc --> RAIN

In [10]:
big_var = var_name.split('_')[0].upper()

In [11]:
HOME = pathlib.Path.home()

In [12]:
dpath = HOME / 'operational/VCSN/data/NC/MONTHLY/' / var_name.upper()

In [13]:
var_name.upper()

'SOILM'

In [14]:
dpath

PosixPath('/home/nicolasf/operational/VCSN/data/NC/MONTHLY/SOILM')

In [15]:
dset = salem.open_xr_dataset(dpath / f'VCSN_gridded_{var_name}_1979-01_2019-12.nc') 

In [16]:
dset

### calculates the seasonal (3-month rolling) average (or sum if `Rain_bc` is the variable)

In [17]:
# 3-month rolling window along time; a value is only produced when all 3 months are available
seasonal_window = dset.rolling(time=3, min_periods=3)

# the bias-corrected rainfall is summed over the window, every other variable is averaged
dset = seasonal_window.sum() if var_name == 'Rain_bc' else seasonal_window.mean()

In [18]:
# drop the first two time steps: with a 3-month window and min_periods=3 they have no complete window
dset = dset.isel(time=slice(2,None))

In [19]:
nz_regions = gpd.read_file(HOME / 'research' / 'Smart_Ideas' / 'data' / 'shapefiles' / 'NZ_regions' / 'NZ_6_regions' / 'NZ_regions_corrected.shp') 

In [20]:
nz_regions

Unnamed: 0,OBJECTID,Id,gridcode,Shape_Leng,Shape_Area,Location,geometry
0,1,1,1,85.215338,5.032753,NNI,"MULTIPOLYGON (((174.70530 -38.17377, 174.70545..."
1,2,2,2,12.336015,2.994028,WNI,"MULTIPOLYGON (((175.13516 -41.37745, 175.13507..."
2,3,3,3,14.235493,3.775388,ENI,"MULTIPOLYGON (((175.85595 -41.35970, 175.85595..."
3,4,4,4,34.656463,3.06628,NSI,"MULTIPOLYGON (((171.32620 -42.12355, 171.32602..."
4,5,5,6,20.191504,4.827228,ESI,"MULTIPOLYGON (((170.21675 -46.05955, 170.21609..."
5,6,6,5,42.941379,9.05741,WSI,"MULTIPOLYGON (((169.20749 -46.66371, 169.20742..."


#### checks that the crs is correct: should be epsg 4272 

In [21]:
nz_regions.crs

<Geographic 2D CRS: EPSG:4272>
Name: NZGD49
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: New Zealand - onshore and nearshore
- bounds: (165.87, -47.65, 179.27, -33.89)
Datum: New Zealand Geodetic Datum 1949
- Ellipsoid: International 1924
- Prime Meridian: Greenwich

In [22]:
opath_root = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [23]:
# create the output root directory (and any missing parents); no-op if it already exists
opath_root.mkdir(parents=True, exist_ok=True)

### get the quantiles values 

In [24]:
quant_values = np.linspace(0, 1, num_quantiles + 1, endpoint=True)

In [25]:
# keep only the interior boundaries: drop the first (0) and last (1) values
quant_values = quant_values[1:-1]

In [26]:
quant_values

array([0.33333333, 0.66666667])

In [27]:
# column labels for the quantile boundaries (e.g. Q33 and Q66 for terciles).
# The percentage is truncated to an integer, but rounded to 6 decimals first so that
# floating-point noise (e.g. 0.29 * 100 == 28.999999999999996) cannot shift a label down by one
# and create duplicate labels.
col_labs = [f"Q{int(np.round(x, 6))}" for x in (quant_values*100)]

In [28]:
col_labs

['Q33', 'Q66']

In [29]:
# For each of the 6 NZ regions: average the seasonal field over the grid points
# selected by the region polygon, assign each time step to a quantile category
# relative to the 1981 - 2010 climatology of the same calendar month, compute
# the anomalies, and save everything to CSV.

# region name -> array of shape (12, num_quantiles - 1) holding the
# climatological quantile values for each calendar month
quantiles_dict = {}

cat_col = f'cat_{num_quantiles}'
cat_labels = list(range(1, num_quantiles + 1))

for region_name in ['NNI','ENI','WNI','NSI','WSI','ESI']:

    shape = nz_regions.query(f"Location == '{region_name}'")

    # subset the dataset to the region, then apply the polygon as a region of interest
    region = dset.salem.subset(shape=shape)
    region = region.salem.roi(shape=shape, all_touched=True)

    # regional average time-series
    ts = region.mean(dim=['lat','lon'])
    ts_df = ts[var_name].to_dataframe()
    ts_series = ts_df.loc[:,var_name]

    ts_series_cat = []
    quantiles_list = []

    for month in range(1, 13):

        # all time steps falling on this calendar month, and their climatological period
        ts_series_m = ts_series[ts_series.index.month == month]
        clim = ts_series_m.loc['1981':'2010']

        quantiles = [clim.quantile(q=q) for q in quant_values.tolist()]
        quantiles_list.append(quantiles)

        # open-ended outer bins, so that values outside the climatological range still get a category
        bins = [-np.inf, *quantiles, np.inf]

        ts_series_cat.append(pd.cut(ts_series_m, bins, labels=cat_labels))

    quantiles_dict[region_name] = np.array(quantiles_list)

    # put the per-month categories back in chronological order
    ts_df.loc[:, cat_col] = pd.concat(ts_series_cat, axis=0).sort_index()

    # `transform` always returns a result indexed like its input, whereas
    # `apply` prepends the group keys to the index in pandas >= 2.0, which
    # breaks the index-aligned assignment below
    ts_df.loc[:,'anomalies'] = ts_df.loc[:,var_name].groupby(ts_df.index.month).transform(demean)

    opath = opath_root / big_var / region_name
    opath.mkdir(parents=True, exist_ok=True)

    ts_df.to_csv(opath / f'TS_NZ_region_{region_name}_{big_var}_{num_quantiles}_quantiles_anoms_salem.csv')

    # descriptive statistics per quantile category
    ts_df.groupby(ts_df.loc[:, cat_col]).describe().to_csv(opath / f'descriptive_stats_{region_name}_{big_var}_{num_quantiles}_salem.csv')

    print(f"region {region_name} processed for variable {big_var}")
    

region NNI processed for variable SOILM
region ENI processed for variable SOILM
region WNI processed for variable SOILM
region NSI processed for variable SOILM
region WSI processed for variable SOILM
region ESI processed for variable SOILM


### saves the climatological quantiles (terciles when `num_quantiles` is 3) calculated from the VCSN regional aggregates

In [30]:
# One DataFrame per region: the 12 rows are indexed 1..12 (index named 'season'),
# the columns are a (region, quantile label) MultiIndex.
# Iterating over `quantiles_dict` (which preserves insertion order) keeps this cell
# in sync with the regions that were actually processed, instead of repeating the list.
quantiles_list = []
for region_name, region_quantiles in quantiles_dict.items():
    df = pd.DataFrame(
        region_quantiles,
        index=pd.RangeIndex(1, 13, name='season'),
        columns=pd.MultiIndex.from_product([[region_name], col_labs]),
    )
    quantiles_list.append(df)

In [31]:
quantiles_df = pd.concat(quantiles_list, axis=1)

In [32]:
quantiles_df

Unnamed: 0_level_0,NNI,NNI,ENI,ENI,WNI,WNI,NSI,NSI,WSI,WSI,ESI,ESI
Unnamed: 0_level_1,Q33,Q66,Q33,Q66,Q33,Q66,Q33,Q66,Q33,Q66,Q33,Q66
season,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-59.760434,-49.329732,-71.049419,-63.478599,-46.43345,-36.407573,-48.200922,-34.984786,-33.104555,-20.340846,-94.213616,-78.157611
2,-76.957195,-64.998489,-85.692645,-66.955892,-57.709398,-52.246125,-61.134943,-44.607541,-38.667973,-28.786073,-107.721089,-90.494181
3,-83.644839,-66.125541,-81.544635,-74.477451,-69.640383,-57.527364,-63.92252,-45.421097,-40.122126,-32.072099,-107.673662,-95.994379
4,-70.746852,-54.781437,-71.031433,-55.809062,-67.245428,-50.254658,-52.121334,-43.727266,-30.649353,-25.728731,-99.816814,-87.308703
5,-49.126759,-35.107259,-49.421125,-35.710147,-44.53037,-33.660237,-35.89036,-25.282803,-18.388743,-10.571315,-83.71466,-70.25801
6,-23.099807,-16.003428,-28.856783,-13.019453,-19.648744,-13.63877,-18.414458,-11.153985,-4.572314,0.673412,-59.429168,-48.459241
7,-5.568182,-1.945869,-9.365873,-2.793691,-3.857242,-0.587252,-5.406856,-1.886923,1.721164,6.591637,-39.724923,-26.030122
8,1.189106,2.369429,-1.183007,1.475655,0.705796,3.841611,-0.102472,1.346436,3.997687,6.629726,-25.381036,-12.08625
9,-1.094848,0.848821,-3.580632,-1.508376,-0.359593,2.211302,-1.311551,0.721184,2.565583,4.857299,-22.480722,-12.367352
10,-7.29313,-4.571284,-13.823314,-9.300914,-3.985524,-1.125532,-5.838418,-2.244981,-0.359021,2.545731,-32.587097,-23.098277


In [33]:
# build the output directory explicitly from `opath_root` and `big_var` rather than relying on
# the `opath` variable leaking out of the regions loop (same location: the parent of the region folders)
quantiles_df.to_csv(opath_root / big_var / f'Climatological_quantiles_{num_quantiles}_cat_{big_var}_salem.csv')