In [1]:
# Parameters: the values a reader is expected to tune for a run of this notebook.

# GCM variable; upper-cased (`var_X.upper()`) when handed to `concat_GCMs`,
# and used verbatim in the file name of the pickled scalers.
var_X = 'precip'

# Target variable; used as a directory / file-name component when reading the
# regional target CSVs, and in the names of the exported files.
target_var = 'RAIN'

# Target type: anomalies ('anomalies') or tercile categories ('cat_3').
# NOTE(review): not referenced by any later cell visible here; both target
# types are exported regardless of this value.
target_type = 'cat_3'

# step: 3 = one month lead time on the next 3 months aggregated statistic
step = 3

### load external modules 

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
import itertools

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
# Current user's home directory; the targets and output paths further down are built from it.
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
from sklearn.preprocessing import StandardScaler

### load local modules 

In [11]:
# Make the project's local modules (`utils`, `GCM`) importable. The path is
# relative, so it is resolved against the kernel's current working directory.
sys.path.append('../../ml4seas/')

In [12]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time, concat_GCMs

### domain definitions 

In [13]:
# Named bounding boxes, one list of four numbers per domain.
# NOTE(review): the numbers look like [lon_min, lon_max, lat_min, lat_max]
# in degrees — confirm against the GCM helpers.
domain_def = {
    'local': [150, 200, -50, -10],
    'regional': [90, 300, -65, 50],
    'ext_regional': [70, 300, -70, 60],
    # commented-out alternative for 'ext_regional': [50, 300, -75, 60]
    'global': [0, 360, -70, 70],
    'tropics': [0, 360, -40, 40],
}

### set the root path for the `data` folder 

In [14]:
# Root of the `data` folder for the 'gdata' location; the hindcasts are read from it below.
rpath = set_root_dir(root='gdata')

In [15]:
# Names of the GCMs to load, in the order they are passed to `concat_GCMs`.
GCMs = [
    'ECMWF',
    'UKMO',
    'METEO_FRANCE',
    'DWD',
    'CMCC',
    'NCEP_CFSv2',
    'CanCM4i',
    'GEM_NEMO',
    'NASA_GEOSS2S',
    'CanSIPSv2',
    'JMA',
]

In [16]:
# NOTE(review): repeats the identical `set_root_dir(root='gdata')` call made two
# cells earlier. `rpath` is what the hindcast `concat_GCMs` call in the next cell receives.
rpath = set_root_dir(root='gdata')

In [17]:
# Load the hindcast period for every model listed in `GCMs`. With
# `standardize=True` the result is unpacked into six values: the raw data, its
# standardized counterpart, the row index, the per-row GCM labels, the per-GCM
# coordinates and a dict of fitted scalers keyed by GCM name.
# The lead-time step is taken from the `step` notebook parameter instead of a
# repeated literal, so editing the parameter cell actually changes what is loaded.
(
    X_data_train,
    X_data_train_std,
    X_index_train,
    GCM_records_train,
    GCM_coords_train,
    scalers_dict,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='hindcasts',
    rpath=rpath,
    domain='ext_regional',
    standardize=True,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP
number of files in the archive: 288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2016_12.nc

-----------------   getting UKMO
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP
number of files in the archive: 287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2016_12.nc

-----------------   getting METEO_FRANCE
reading files 

### Now get the forecasts from the local drive 

In [18]:
# Switch the data root to the 'local' location, from which the forecasts are read next.
rpath = set_root_dir(root='local')

In [19]:
# Load the forecast period for every model listed in `GCMs`. With
# `standardize=False` the result is unpacked into four values: the raw data,
# the row index, the per-row GCM labels and the per-GCM coordinates.
# The lead-time step is taken from the `step` notebook parameter instead of a
# repeated literal, so editing the parameter cell actually changes what is loaded.
(
    X_data_test,
    X_index_test,
    GCM_records_test,
    GCM_coords_test,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='forecasts',
    rpath=rpath,
    domain='ext_regional',
    standardize=False,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP
number of files in the archive: 36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2019_12.nc

-----------------   getting UKMO
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP
number of files in the archive: 28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2019_12.nc

-----------------   getting METEO_FRANCE
reading files from /home/nicolasf/

### coordinates 

In [20]:
GCM_coords_train['ECMWF']['lat'].shape

(4929,)

In [21]:
GCM_coords_train['ECMWF']['lon'].shape

(4929,)

### create a multi-index containing the lat and lon for each column 

In [22]:
# Two arrays (latitudes, then longitudes) taken from the ECMWF coordinates;
# passed as `columns=` below so that each data column is labelled by a (lat, lon) pair.
latlon = [GCM_coords_train['ECMWF'][axis_name].data for axis_name in ('lat', 'lon')]

### casts the training data into a DataFrame 

In [23]:
# Raw training data: one row per entry of X_index_train, one column per (lat, lon) pair.
df_train = pd.DataFrame(
    X_data_train,
    index=X_index_train,
    columns=latlon,
)

### add the column containing the GCM names 

In [24]:
# Label every training row with the GCM it came from. The columns are a
# two-level index at this point, so the new column key ends up as the tuple
# ('GCM', '') — it is flattened to 'GCM' further down.
df_train.loc[:,'GCM'] = GCM_records_train

### casts the testing data into a DataFrame 

In [25]:
# Raw test data, laid out like the training frame (same `latlon` column labels).
df_test = pd.DataFrame(
    X_data_test,
    index=X_index_test,
    columns=latlon,
)

In [26]:
# Label every test row with the GCM it came from.
df_test.loc[:,'GCM'] = GCM_records_test

### Now do the same as above, but for the standardized version of the training dataset 

### standardized training set 

In [27]:
# Standardized training data, sharing the index and column labels of the raw training frame.
df_train_std = pd.DataFrame(
    X_data_train_std,
    index=X_index_train,
    columns=latlon,
)

In [28]:
# Label every standardized training row with the GCM it came from.
df_train_std.loc[:,'GCM'] = GCM_records_train

### apply the scalers ('trained' over the training set) to the test set 

In [29]:
scalers_dict

{'ECMWF': StandardScaler(copy=True, with_mean=True, with_std=True),
 'UKMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'METEO_FRANCE': StandardScaler(copy=True, with_mean=True, with_std=True),
 'DWD': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CMCC': StandardScaler(copy=True, with_mean=True, with_std=True),
 'NCEP_CFSv2': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CanCM4i': StandardScaler(copy=True, with_mean=True, with_std=True),
 'GEM_NEMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'NASA_GEOSS2S': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CanSIPSv2': StandardScaler(copy=True, with_mean=True, with_std=True),
 'JMA': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [30]:
np.unique(GCM_records_test)

array(['CMCC', 'CanCM4i', 'CanSIPSv2', 'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
       'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO'], dtype='<U12')

### create an empty numpy array with the same shape as X_data_test

In [31]:
# Pre-allocate the standardized test array filled with NaN rather than with
# uninitialised memory: any row the per-GCM loop below does not overwrite stays
# NaN (visible, and removable with `dropna`) instead of holding arbitrary values.
# NOTE(review): assumes X_data_test has a floating-point dtype — confirm.
X_data_test_std = np.full_like(X_data_test, np.nan)

In [32]:
# Standardize the test rows one GCM at a time, using the scaler stored for
# that GCM in `scalers_dict`.
for gcm_name in GCMs:
    # Boolean mask of the test rows that belong to this GCM.
    idx_gcm = (GCM_records_test == gcm_name)
    # A GCM with no rows in the test period would hand an empty selection to
    # `transform`, which scikit-learn rejects; skip it instead of failing.
    if not np.any(idx_gcm):
        continue
    X_data_test_std[idx_gcm, :] = scalers_dict[gcm_name].transform(X_data_test[idx_gcm, :])

In [33]:
# Standardized test data, sharing the index and column labels of the raw test frame.
df_test_std = pd.DataFrame(
    X_data_test_std,
    index=X_index_test,
    columns=latlon,
)

In [34]:
# Label every standardized test row with the GCM it came from.
df_test_std.loc[:,'GCM'] = GCM_records_test

#### pickle the standard scalers 

### the means and standard deviations of the scalers are accessed through the `mean_` and `scale_` attributes

In [35]:
scalers_dict['ECMWF'].mean_.shape

(4929,)

In [36]:
scalers_dict['ECMWF'].scale_.shape

(4929,)

In [37]:
import pickle

# Save the dict of per-GCM scalers to a pickle file in the current working
# directory; the GCM variable name is part of the file name.
scalers_file = f'./GCMs_StandardScalers_{var_X}.pickle'
with open(scalers_file, mode='wb') as fh:
    pickle.dump(scalers_dict, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
# to open 
# with open(f'./GCMs_StandardScalers_{var_X}.pickle', 'rb') as f: 
#     dict_scalers = pickle.load(f)

### Now fix the last column, from tuple `('GCM', '')` to just 'GCM'

In [39]:
# Column labels of the training frame as a plain Python list, so the last one can be edited.
cols = df_train.columns.tolist()

In [40]:
# The GCM label column was added last; replace its tuple label with the plain string 'GCM'.
cols[-1] = 'GCM'

In [41]:
# Apply the corrected labels to all four frames, which share the same column layout.
for frame in (df_train, df_train_std, df_test, df_test_std):
    frame.columns = cols

In [42]:
df_train.shape

(4071, 4930)

In [43]:
df_train_std.shape

(4071, 4930)

In [44]:
GCM_records_train.shape

(4071,)

## TARGETS 

In [45]:
# Directory holding the target time series for the six NZ regions.
dpath_target = HOME.joinpath(
    'research', 'Smart_Ideas', 'outputs', 'targets', 'NZ_regions', 'NZ_6_regions'
)

In [46]:
# Read the target series of each region and prefix that region's columns with
# its code, giving two-level (region, variable) column labels.
targets = []
for reg in ('NNI', 'WNI', 'ENI', 'NSI', 'WSI', 'ESI'):
    region_csv = dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv'
    region_targets = pd.read_csv(region_csv, index_col=0, parse_dates=True)
    region_targets.columns = pd.MultiIndex.from_product([[reg], region_targets.columns])
    targets.append(region_targets)

In [47]:
# Put the per-region frames side by side; `targets` is rebound from the list
# of frames to the single combined DataFrame.
targets = pd.concat(targets, axis='columns')

In [48]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,470.822673,3,176.364977,357.294404,3,49.163323,472.045206,3,180.723225,498.174979,3,66.560252,783.318801,3,143.904216,221.08315,3,49.95124
1979-04-30,546.72,3,229.631158,440.482174,3,121.697362,492.918286,3,181.818099,564.466858,3,129.367183,690.960327,3,94.298941,215.2389,3,51.450578
1979-05-31,493.002426,3,134.029467,523.61263,3,167.590461,514.622032,3,174.868212,652.528876,3,163.604985,730.426131,3,78.714666,294.27635,3,126.515812
1979-06-30,429.186337,2,13.302417,404.922302,2,-9.359534,306.615937,1,-71.251663,579.009926,2,25.933292,662.720245,2,1.927881,165.33315,2,-0.559937
1979-07-31,549.57599,3,77.774812,407.60008,1,-53.43867,368.72946,1,-65.591395,602.019278,2,18.614145,606.331144,2,-34.887512,187.18665,3,12.60062


In [49]:
# Keep, for every region (first column level), only the 'anomalies' variable (second level).
target_anomalies = targets.loc[:, pd.IndexSlice[:, ["anomalies"]]]

In [50]:
# Keep, for every region (first column level), only the 'cat_3' variable (second level).
target_terciles = targets.loc[:, pd.IndexSlice[:, ["cat_3"]]]

In [51]:
# Only one variable is left per region, so keep just the region level of the two-level column labels.
target_anomalies.columns = target_anomalies.columns.get_level_values(0)

In [52]:
# Only one variable is left per region, so keep just the region level of the two-level column labels.
target_terciles.columns = target_terciles.columns.get_level_values(0)

In [53]:
target_terciles.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,3,3,3
1979-04-30,3,3,3,3,3,3
1979-05-31,3,3,3,3,3,3
1979-06-30,2,2,1,2,2,2
1979-07-31,3,1,1,2,2,3


In [54]:
target_anomalies.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,176.364977,49.163323,180.723225,66.560252,143.904216,49.95124
1979-04-30,229.631158,121.697362,181.818099,129.367183,94.298941,51.450578
1979-05-31,134.029467,167.590461,174.868212,163.604985,78.714666,126.515812
1979-06-30,13.302417,-9.359534,-71.251663,25.933292,1.927881,-0.559937
1979-07-31,77.774812,-53.43867,-65.591395,18.614145,-34.887512,12.60062


### rename the columns for the target anomalies 

In [55]:
# Suffix each region code so anomaly columns stay distinguishable once merged with the tercile columns.
target_anomalies.columns = ["{}_anoms".format(region) for region in target_anomalies.columns]

In [56]:
# Suffix each region code so tercile columns stay distinguishable once merged with the anomaly columns.
target_terciles.columns = ["{}_cat3_categories".format(region) for region in target_terciles.columns]

In [57]:
target_terciles.head()

Unnamed: 0_level_0,NNI_cat3_categories,WNI_cat3_categories,ENI_cat3_categories,NSI_cat3_categories,WSI_cat3_categories,ESI_cat3_categories
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,3,3,3
1979-04-30,3,3,3,3,3,3
1979-05-31,3,3,3,3,3,3
1979-06-30,2,2,1,2,2,2
1979-07-31,3,1,1,2,2,3


In [58]:
target_anomalies.head()

Unnamed: 0_level_0,NNI_anoms,WNI_anoms,ENI_anoms,NSI_anoms,WSI_anoms,ESI_anoms
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,176.364977,49.163323,180.723225,66.560252,143.904216,49.95124
1979-04-30,229.631158,121.697362,181.818099,129.367183,94.298941,51.450578
1979-05-31,134.029467,167.590461,174.868212,163.604985,78.714666,126.515812
1979-06-30,13.302417,-9.359534,-71.251663,25.933292,1.927881,-0.559937
1979-07-31,77.774812,-53.43867,-65.591395,18.614145,-34.887512,12.60062


### apply the GCM indices, which will have the effect of duplicating the dates ...

#### training samples 

In [59]:
# Align the tercile targets on the training index (one target row per GCM row).
target_terciles_train = target_terciles.reindex(index=X_index_train)

In [60]:
# Align the anomaly targets on the training index (one target row per GCM row).
target_anomalies_train = target_anomalies.reindex(index=X_index_train)

#### test period. Note that there will be missing values in the targets ...

In [61]:
# Align the tercile targets on the test index; entries with no matching target row come back as NaN.
target_terciles_test = target_terciles.reindex(index=X_index_test)

In [62]:
# Align the anomaly targets on the test index; entries with no matching target row come back as NaN.
target_anomalies_test = target_anomalies.reindex(index=X_index_test)

### Now concatenate the GCM outputs, and the target terciles and anomalies, along the axis=1 (column)

#### 'raw' anomalies (non-standardized)

In [63]:
# Raw GCM training data followed by the anomaly and tercile target columns.
df_train_targets = pd.concat(
    [df_train, target_anomalies_train, target_terciles_train],
    axis='columns',
)

In [64]:
# Raw GCM test data followed by the anomaly and tercile target columns.
df_test_targets = pd.concat(
    [df_test, target_anomalies_test, target_terciles_test],
    axis='columns',
)

#### GCM anomalies standardized (per GCM)

In [65]:
# Standardized GCM training data followed by the anomaly and tercile target columns.
df_train_std_targets = pd.concat(
    [df_train_std, target_anomalies_train, target_terciles_train],
    axis='columns',
)

In [66]:
# Standardized GCM test data followed by the anomaly and tercile target columns.
df_test_std_targets = pd.concat(
    [df_test_std, target_anomalies_test, target_terciles_test],
    axis='columns',
)

In [67]:
# Directory the CSV and parquet exports below are written to.
opath = HOME.joinpath('research', 'Smart_Ideas', 'outputs', 'CSVs')

In [68]:
# Create the output directory together with any missing parents.
# `exist_ok=True` makes the call a no-op when the directory is already there,
# so no separate existence check (which could race with another process) is needed.
opath.mkdir(parents=True, exist_ok=True)

### saves in CSVs 

#### 'raw' GCM anomalies version 

In [69]:
# Keep only the rows that have no missing value in any column.
df_train_targets = df_train_targets.dropna(axis='index')

In [70]:
# Export the raw training set (GCM data + targets) as CSV.
df_train_targets.to_csv(
    opath.joinpath(f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')
)

In [71]:
# Keep only the rows that have no missing value in any column (test rows without targets are dropped).
df_test_targets = df_test_targets.dropna(axis='index')

In [72]:
# Export the raw test set (GCM data + targets) as CSV.
df_test_targets.to_csv(
    opath.joinpath(f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')
)

#### standardized GCM anomalies version 

In [73]:
# Keep only the rows that have no missing value in any column.
df_train_std_targets = df_train_std_targets.dropna(axis='index')

In [74]:
# Export the standardized training set (GCM data + targets) as CSV.
df_train_std_targets.to_csv(
    opath.joinpath(f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')
)

In [75]:
# Keep only the rows that have no missing value in any column (test rows without targets are dropped).
df_test_std_targets = df_test_std_targets.dropna(axis='index')

In [76]:
# Export the standardized test set (GCM data + targets) as CSV.
df_test_std_targets.to_csv(
    opath.joinpath(f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')
)

### saves in PARQUET format 

### transform the columns (multiindex) to string 

In [77]:
# Current column labels of the training frame: a mix of tuples (grid points)
# and plain strings ('GCM' and the target columns).
tuple_columns = df_train_targets.columns

In [78]:
# Flatten every tuple label into a single "first | second" string; labels that
# are already plain (non-tuple) values are kept unchanged.
str_columns = [
    label if not isinstance(label, tuple) else "{} | {}".format(*label)
    for label in tuple_columns
]

In [79]:
# Use the flattened string labels on the raw training frame.
df_train_targets.columns = str_columns

In [80]:
# Use the flattened string labels on the raw test frame (labels derived from the training frame).
df_test_targets.columns = str_columns

In [81]:
# Use the flattened string labels on the standardized training frame (labels derived from the raw training frame).
df_train_std_targets.columns = str_columns

In [82]:
# Use the flattened string labels on the standardized test frame (labels derived from the raw training frame).
df_test_std_targets.columns = str_columns

### Now saves 

In [83]:
# Export the raw training set (GCM data + targets) as parquet.
df_train_targets.to_parquet(
    opath.joinpath(f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')
)

In [84]:
# Export the raw test set (GCM data + targets) as parquet.
df_test_targets.to_parquet(
    opath.joinpath(f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')
)

#### standardized GCM anomalies version 

In [85]:
# Export the standardized training set (GCM data + targets) as parquet.
df_train_std_targets.to_parquet(
    opath.joinpath(f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')
)

In [86]:
# Export the standardized test set (GCM data + targets) as parquet.
df_test_std_targets.to_parquet(
    opath.joinpath(f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')
)

In [87]:
target_var

'RAIN'

In [88]:
var_X

'precip'