### load external modules 

In [1]:
# Parameters
# Notebook-level settings; edit (or override) these before running the notebook.

# target variable: used below to build the input path of the target CSVs
# and the names of the output CSV / parquet files
target_var = 'TMEAN'

# target type (anomalies ('anomalies') or terciles categories ('cat_3'))
# NOTE(review): `target_type` is not referenced anywhere else in the visible
# part of this notebook -- confirm whether it is still needed.
target_type = 'cat_3'

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
import itertools

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
from sklearn.preprocessing import StandardScaler

### load local modules 

In [11]:
# Make the local modules in `ml4seas` importable. The entry is a relative
# path, so it is resolved against the current working directory of the kernel.
# The membership test keeps `sys.path` free of duplicate entries when this
# cell is executed more than once.
ml4seas_path = '../../ml4seas/'
if ml4seas_path not in sys.path:
    sys.path.append(ml4seas_path)

In [12]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time, concat_GCMs

### domain definitions 

In [13]:
# Named spatial domains, each stored as a list of four numbers.
# NOTE(review): the order looks like [lon_min, lon_max, lat_min, lat_max] --
# confirm against the code that consumes these lists.
domain_def = {
    'local': [150, 200, -50, -10],
    'regional': [90, 300, -65, 50],
    # alternative extent kept for reference: [50, 300, -75, 60]
    'ext_regional': [70, 300, -70, 60],
    'global': [0, 360, -70, 70],
    'tropics': [0, 360, -40, 40],
}

### set the root path for the `data` folder 

In [14]:
rpath = set_root_dir(root='gdata')

In [15]:
# Names of the GCMs to read, in the order they are processed.
GCMs = [
    'ECMWF',
    'UKMO',
    'METEO_FRANCE',
    'DWD',
    'CMCC',
    'NCEP_CFSv2',
    'CanCM4i',
    'GEM_NEMO',
    'NASA_GEOSS2S',
    'CanSIPSv2',
    'JMA',
]

In [16]:
step = 3

In [17]:
# NOTE(review): this is the same call as in the earlier
# `rpath = set_root_dir(root='gdata')` cell; the repeat looks redundant.
rpath = set_root_dir(root='gdata')

In [18]:
var_X = 't2m'

In [19]:
# Read the 'hindcasts' period for every model listed in `GCMs` through
# `concat_GCMs`. With `standardize=True` the call returns six objects,
# including the standardized data and the dictionary of per-GCM scalers.
# `step` is taken from the configuration cell above rather than being repeated
# here as a literal, so the lead time is defined in a single place.
(
    X_data_train,
    X_data_train_std,
    X_index_train,
    GCM_records_train,
    GCM_coords_train,
    scalers_dict,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='hindcasts',
    rpath=rpath,
    domain='ext_regional',
    standardize=True,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M
number of files in the archive: 288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting UKMO
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M
number of files in the archive: 287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting METEO_FRANCE
reading files from /media/nicolasf/GDATA/END

### Now get the forecasts from the local drive 

In [20]:
rpath = set_root_dir(root='local')

In [21]:
# Read the 'forecasts' period for every model listed in `GCMs` through
# `concat_GCMs`. With `standardize=False` only four objects are returned
# (no standardized data, no scalers).
# `step` is taken from the configuration cell above rather than being repeated
# here as a literal, so the lead time is defined in a single place.
(
    X_data_test,
    X_index_test,
    GCM_records_test,
    GCM_coords_test,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='forecasts',
    rpath=rpath,
    domain='ext_regional',
    standardize=False,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M
number of files in the archive: 36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting UKMO
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M
number of files in the archive: 28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting METEO_FRANCE
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs

### coordinates 

In [22]:
GCM_coords_train['ECMWF']['lat'].shape

(4929,)

In [23]:
GCM_coords_train['ECMWF']['lon'].shape

(4929,)

### create a multi-index containing the lat and lon for each column 

In [24]:
# Latitude and longitude arrays (one value per data column) used as the two
# levels of the DataFrame columns below.
# NOTE(review): only the ECMWF coordinates are used -- this presumably relies
# on every GCM sharing the same grid; confirm.
ecmwf_coords = GCM_coords_train['ECMWF']
latlon = [ecmwf_coords['lat'].data, ecmwf_coords['lon'].data]

### casts the training data into a DataFrame 

In [25]:
df_train = pd.DataFrame(data=X_data_train, index=X_index_train, columns=latlon)

### add the column containing the GCM names 

In [26]:
df_train.loc[:,'GCM'] = GCM_records_train

### casts the testing data into a DataFrame 

In [27]:
df_test = pd.DataFrame(data=X_data_test, index=X_index_test, columns=latlon)

In [28]:
df_test.loc[:,'GCM'] = GCM_records_test

### Now do the same as above, but for the standardized version of the training dataset 

### standardized training set 

In [29]:
df_train_std = pd.DataFrame(data=X_data_train_std, index=X_index_train, columns=latlon)

In [30]:
df_train_std.loc[:,'GCM'] = GCM_records_train

### apply the scalers ('trained' over the training set) to the test set 

In [31]:
scalers_dict

{'ECMWF': StandardScaler(copy=True, with_mean=True, with_std=True),
 'UKMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'METEO_FRANCE': StandardScaler(copy=True, with_mean=True, with_std=True),
 'DWD': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CMCC': StandardScaler(copy=True, with_mean=True, with_std=True),
 'NCEP_CFSv2': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CanCM4i': StandardScaler(copy=True, with_mean=True, with_std=True),
 'GEM_NEMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'NASA_GEOSS2S': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CanSIPSv2': StandardScaler(copy=True, with_mean=True, with_std=True),
 'JMA': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [32]:
np.unique(GCM_records_test)

array(['CMCC', 'CanCM4i', 'CanSIPSv2', 'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
       'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO'], dtype='<U12')

### create an empty numpy array with the same shape as X_data_test

In [33]:
X_data_test_std = np.empty_like(X_data_test)

In [34]:
# Standardize the test data model by model, using the scaler stored under
# each model's name in `scalers_dict`.
# The loop runs over the model names actually present in `GCM_records_test`
# (rather than over the full `GCMs` list), so that:
#   - a model without any test row is skipped instead of handing an empty
#     selection to `transform`;
#   - a test row whose model has no scaler raises a KeyError instead of
#     silently keeping the uninitialised values of `X_data_test_std`.
for GCM in np.unique(GCM_records_test):
    idx_gcm = (GCM_records_test == GCM)
    X_data_test_std[idx_gcm, :] = scalers_dict[GCM].transform(X_data_test[idx_gcm, :])

In [35]:
df_test_std = pd.DataFrame(data=X_data_test_std, index=X_index_test, columns=latlon)

In [36]:
df_test_std.loc[:,'GCM'] = GCM_records_test

#### pickle the standard scalers 

### the means and standard deviations of the scalers are accessed through the `mean_` and `scale_` attributes 

In [37]:
scalers_dict['ECMWF'].mean_.shape

(4929,)

In [38]:
scalers_dict['ECMWF'].scale_.shape

(4929,)

In [39]:
import pickle

# Serialise the dictionary of per-GCM scalers next to the notebook so the
# same scaling can be re-applied later.
scalers_pickle = './GCMs_StandardScalers.pickle'
with open(scalers_pickle, 'wb') as handle:
    pickle.dump(scalers_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Now fix the last column, from tuple `('GCM', '')` to just 'GCM'

In [40]:
cols = df_train.columns.to_list()

In [41]:
cols[-1] = 'GCM'

In [42]:
# Apply the corrected column labels (last label set to plain 'GCM') to the
# raw and standardized versions of both the training and the test frames.
for frame in (df_train, df_train_std, df_test, df_test_std):
    frame.columns = cols

In [43]:
# to open 
# with open('./GCMs_StandardScalers.pickle', 'rb') as f: 
#     dict_scalers = pickle.load(f)

In [44]:
df_train.shape

(3982, 4930)

In [45]:
df_train_std.shape

(3982, 4930)

In [46]:
GCM_records_train.shape

(3982,)

In [47]:
X_index_train.shape

(3982,)

## TARGETS 

In [48]:
# `target_var` is defined once, in the Parameters cell at the top of the
# notebook. It is deliberately NOT re-assigned here: a second assignment would
# silently override whatever value was set there. The name is only echoed so
# the value in use is visible at this point of the notebook.
target_var

In [49]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [50]:
# Read one target CSV per region and prefix its columns with the region
# code, so that the per-region tables can be placed side by side afterwards.
regions = ['NNI','WNI','ENI','NSI','WSI','ESI']
targets = []
for reg in regions:
    csv_path = dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv'
    target = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    # two-level columns: (region, original column name)
    target.columns = pd.MultiIndex.from_product([[reg], target.columns])
    targets.append(target)

In [51]:
targets = pd.concat(targets, axis=1)

In [52]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.278555,3,0.462528,16.051472,3,0.317965,16.732249,3,0.62154,13.811438,2,-0.093327,11.848419,1,-0.400334,13.728706,2,-0.222255
1979-04-30,16.794408,2,0.227319,14.586906,3,0.248038,14.953599,3,0.299057,12.18945,1,-0.249176,10.58958,2,-0.110858,12.033578,2,-0.208919
1979-05-31,14.695903,2,0.282907,12.52232,3,0.425773,12.716266,2,0.314655,9.888897,1,-0.215657,8.099501,1,-0.202497,9.232035,1,-0.470303
1979-06-30,12.093823,2,-0.001099,9.888909,2,0.117671,9.929897,1,-0.065854,7.19898,1,-0.300772,5.457298,1,-0.197458,6.634168,1,-0.254247
1979-07-31,10.290536,2,-0.061355,8.182231,2,0.120974,8.208954,2,-0.063564,5.534868,2,-0.058724,3.763353,2,0.085515,4.916423,2,0.112719


In [53]:
target_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [54]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [55]:
target_anomalies.columns = target_anomalies.columns.droplevel(1)

In [56]:
target_terciles.columns = target_terciles.columns.droplevel(1)

In [57]:
target_terciles.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,2,1,2
1979-04-30,2,3,3,1,2,2
1979-05-31,2,3,2,1,1,1
1979-06-30,2,2,1,1,1,1
1979-07-31,2,2,2,2,2,2


In [58]:
target_anomalies.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,0.462528,0.317965,0.62154,-0.093327,-0.400334,-0.222255
1979-04-30,0.227319,0.248038,0.299057,-0.249176,-0.110858,-0.208919
1979-05-31,0.282907,0.425773,0.314655,-0.215657,-0.202497,-0.470303
1979-06-30,-0.001099,0.117671,-0.065854,-0.300772,-0.197458,-0.254247
1979-07-31,-0.061355,0.120974,-0.063564,-0.058724,0.085515,0.112719


### rename the columns for the target anomalies 

In [59]:
target_anomalies.columns = [f"{x}_anoms" for x in target_anomalies.columns]

In [60]:
target_terciles.columns = [f"{x}_cat3_categories" for x in target_terciles.columns]

In [61]:
target_terciles.head()

Unnamed: 0_level_0,NNI_cat3_categories,WNI_cat3_categories,ENI_cat3_categories,NSI_cat3_categories,WSI_cat3_categories,ESI_cat3_categories
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,2,1,2
1979-04-30,2,3,3,1,2,2
1979-05-31,2,3,2,1,1,1
1979-06-30,2,2,1,1,1,1
1979-07-31,2,2,2,2,2,2


In [62]:
target_anomalies.head()

Unnamed: 0_level_0,NNI_anoms,WNI_anoms,ENI_anoms,NSI_anoms,WSI_anoms,ESI_anoms
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,0.462528,0.317965,0.62154,-0.093327,-0.400334,-0.222255
1979-04-30,0.227319,0.248038,0.299057,-0.249176,-0.110858,-0.208919
1979-05-31,0.282907,0.425773,0.314655,-0.215657,-0.202497,-0.470303
1979-06-30,-0.001099,0.117671,-0.065854,-0.300772,-0.197458,-0.254247
1979-07-31,-0.061355,0.120974,-0.063564,-0.058724,0.085515,0.112719


### apply the GCM indices, which will have the effect of duplicating the dates ...

#### training samples 

In [63]:
# Pick the tercile rows for every date in `X_index_train`; dates that appear
# several times in that index are repeated in the result.
# NOTE(review): unlike the `reindex` used for the test period, `.loc` is
# expected to fail (recent pandas) on dates absent from the targets -- confirm
# that every training date is covered.
target_terciles_train = target_terciles.loc[X_index_train,:]

In [64]:
target_anomalies_train = target_anomalies.loc[X_index_train,:]

#### test period. Note that there will be missing values in the targets ... 

In [65]:
target_terciles_test = target_terciles.reindex(X_index_test)

In [66]:
target_anomalies_test = target_anomalies.reindex(X_index_test)

### Now concatenate the GCM outputs, and the target terciles and anomalies, along the axis=1 (column)

#### 'raw' anomalies (non-standardized)

In [67]:
df_train_targets = pd.concat([df_train, target_anomalies_train, target_terciles_train], axis=1)

In [68]:
df_test_targets = pd.concat([df_test, target_anomalies_test, target_terciles_test], axis=1)

#### GCM anomalies standardized (per GCM)

In [69]:
df_train_std_targets = pd.concat([df_train_std, target_anomalies_train, target_terciles_train], axis=1)

In [70]:
df_test_std_targets = pd.concat([df_test_std, target_anomalies_test, target_terciles_test], axis=1)

In [71]:
opath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [72]:
# Create the output directory together with any missing parents.
# `exist_ok=True` makes the call a no-op when the directory is already there,
# which removes the separate existence check (and the window between the
# check and the creation).
opath.mkdir(parents=True, exist_ok=True)

### saves in CSVs 

#### 'raw' GCM anomalies version 

In [73]:
df_train_targets.to_csv(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')

In [75]:
# Drop every row that holds at least one missing value. The check covers all
# columns (GCM values as well as targets), not only the target columns.
df_test_targets = df_test_targets.dropna(axis=0)

In [77]:
df_test_targets.to_csv(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')

#### standardized GCM anomalies version 

In [78]:
df_train_std_targets.to_csv(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')

In [80]:
# Drop every row that holds at least one missing value. The check covers all
# columns (standardized GCM values as well as targets), not only the targets.
df_test_std_targets = df_test_std_targets.dropna(axis=0)

In [82]:
df_test_std_targets.to_csv(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')

### saves in PARQUET format 

### transform the columns (multiindex) to string 

In [83]:
tuple_columns = df_train_targets.columns

In [84]:
# Flatten the column labels to plain strings: tuple labels become
# "first | second", every other label is kept unchanged.
str_columns = [
    col if not isinstance(col, tuple) else "{} | {}".format(*col)
    for col in tuple_columns
]

In [85]:
df_train_targets.columns = str_columns

In [86]:
df_test_targets.columns = str_columns

In [87]:
df_train_std_targets.columns = str_columns

In [88]:
df_test_std_targets.columns = str_columns

### Now saves 

In [89]:
df_train_targets.to_parquet(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')

In [90]:
df_test_targets.to_parquet(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')

#### standardized GCM anomalies version 

In [91]:
df_train_std_targets.to_parquet(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')

In [92]:
df_test_std_targets.to_parquet(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')