### first make sure we are in the right environment 

In [1]:
import sys

In [2]:
env = 'climlab'

In [3]:
# Warn (without stopping the run) when the path of the running interpreter
# does not contain the name of the expected environment
if env not in sys.executable:
    print(f"Please ensure this notebook is run in the {env} environment")

In [4]:
# Parameters: notebook-level settings read by the cells below

# GCM variable (upper-cased before being passed as `var_name` to the GCM readers,
# and also used in the file name of the pickled scalers)
var_X = 't2m'

# target variable (used to build the paths of the target CSV files
# and the names of the output CSV / parquet files)
target_var = 'TMEAN'

# target type (anomalies ('anomalies') or terciles categories ('cat_3'))
# NOTE(review): not referenced again in the visible part of this notebook,
# where both target types are exported side by side -- confirm whether it is still needed
target_type = 'cat_3'

# step: 3 = one month lead time on the next 3 months aggregated statistic
step = 3

### load external modules 

In [5]:
%matplotlib inline

In [6]:
import os
import pathlib
import itertools

In [7]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [8]:
HOME = pathlib.Path.home()

In [9]:
from matplotlib import pyplot as plt

In [10]:
import proplot as plot

In [11]:
import numpy as np
import pandas as pd

In [12]:
import xarray as xr

In [13]:
from sklearn.preprocessing import StandardScaler

### load local modules 

In [14]:
sys.path.append('../../ml4seas/')

In [15]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time, concat_GCMs

xesmf is not installed, using method `interp_like` for interpolation


### domain definitions 

In [16]:
# Bounding box of each spatial domain, keyed by domain name.
# The boxes are handed unchanged to cartopy's `set_extent` in the optional plotting cell,
# so presumably [lon_min, lon_max, lat_min, lat_max] -- TODO confirm against `get_GCM_outputs`.
domain_def = {
    'local': [150, 200, -50, -10],
    'regional': [90, 300, -65, 50],
    # alternative (wider) extended regional box, currently disabled: [50, 300, -75, 60]
    'ext_regional': [70, 300, -70, 60],
    'global': [0, 360, -70, 70],
    'tropics': [0, 360, -40, 40],
}

In [17]:
domain_def.keys()

dict_keys(['local', 'regional', 'ext_regional', 'global', 'tropics'])

In [18]:
plot_domains = False

In [19]:
# Optionally save one map per domain, to check the bounding boxes visually.
# cartopy is only imported when the maps are actually requested.
if plot_domains:
    import cartopy.crs as ccrs
    import cartopy.feature as cfeature

    for name, extent in domain_def.items():
        # Pacific-centred map, cropped to the extent of the domain
        fig, ax = plt.subplots(
            figsize=(10, 8),
            subplot_kw={'projection': ccrs.PlateCarree(central_longitude=180)},
        )
        ax.set_extent(extent, crs=ccrs.PlateCarree())
        ax.coastlines(resolution='50m')
        # only OCEAN and BORDERS are drawn (LAND and COASTLINE features are left out)
        ax.add_feature(cfeature.OCEAN)
        ax.add_feature(cfeature.BORDERS, linestyle=':')
        fig.savefig(f'../../../figures/domain_{name}.png', dpi=200, bbox_inches='tight')
        # release the figure so that looping over the domains does not accumulate open figures
        plt.close(fig)

### list of the GCMs to process (the root path of the `data` folder is set per GCM below) 

In [20]:
GCMs = ['ECMWF', 'UKMO', 'METEO_FRANCE', 'DWD', 'CMCC', 'NCEP_CFSv2', 'CanCM4i', 'GEM_NEMO', 'NASA_GEOSS2S', 'CanSIPSv2', 'JMA']

### GCM providers 

In [21]:
# Data provider of each GCM (passed as `provider` to `get_GCM_outputs`):
# five GCMs come from 'CDS', five from 'IRI', and JMA is its own provider
GCM_provider = {
    **dict.fromkeys(['ECMWF', 'UKMO', 'METEO_FRANCE', 'DWD', 'CMCC'], 'CDS'),
    **dict.fromkeys(['NCEP_CFSv2', 'CanCM4i', 'GEM_NEMO', 'NASA_GEOSS2S', 'CanSIPSv2'], 'IRI'),
    'JMA': 'JMA',
}

### GCM paths 

In [22]:
# Root-directory key ('gdata' or 'local') under which the files of each GCM are read;
# a key is turned into an actual path by `set_root_dir`
GCM_path = {
    'ECMWF': 'gdata',
    'UKMO': 'gdata',
    'METEO_FRANCE': 'gdata',
    'DWD': 'gdata',
    'CMCC': 'local',
    'NCEP_CFSv2': 'gdata',
    'CanCM4i': 'local',
    'GEM_NEMO': 'gdata',
    'NASA_GEOSS2S': 'gdata',
    'CanSIPSv2': 'gdata',
    'JMA': 'gdata',
}

### check first if there are no issues with each of the GCMs 

In [23]:
test_ind_GCM = False

In [24]:
# Optional sanity check: read every GCM on its own and verify that its time coordinate
# has as many steps as a month-start ('MS') range spanning its first and last dates
if test_ind_GCM:
    coords_dict = {}
    for GCM in GCMs:
        rpath = set_root_dir(GCM_path[GCM])
        provider = GCM_provider[GCM]
        print('-------------------------------------------------------------------------------')
        print(f"\n\nGCM {GCM}\nrpath set to {str(rpath)}")
        print(f"provider {provider}")
        GCM_dset, coords = get_GCM_outputs(
            provider=provider,
            GCM=GCM,
            var_name=var_X.upper(),
            rpath=rpath,
            domain=domain_def['ext_regional'],
            flatten=False,
        )
        # keep the coordinates of each GCM for later inspection
        coords_dict[GCM] = coords
        time_index = GCM_dset.time.to_index()
        n_months_expected = len(pd.date_range(start=time_index[0], end=time_index[-1], freq='MS'))
        if len(coords['time']) != n_months_expected:
            print(f'issue with time coordinate not matching expected length for {GCM}')
        else:
            print(f'time coordinate matching for {GCM}')
        GCM_dset.close()

### It seems all OK, now concatenate 

In [25]:
# Read and concatenate the hindcasts of all the GCMs into the training arrays.
# With `standardize=True` the call also returns the standardized data and the dictionary
# of per-GCM scalers (`scalers_dict`), which are applied to the forecasts further down.
# `step` is taken from the parameters cell instead of being hard-coded to 3, so that
# changing the parameter is actually honoured here.
(
    X_data_train,
    X_data_train_std,
    X_index_train,
    GCM_records_train,
    GCM_coords_train,
    scalers_dict,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='hindcasts',
    rpath=GCM_path,
    domain='ext_regional',
    standardize=True,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M
number of files in the archive: 288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting UKMO
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M
number of files in the archive: 287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting METEO_FRANCE
reading files from /media/nicolasf/GDATA/END

### Now get the forecasts from the local drive 

In [26]:
rpath = set_root_dir(root='local')

In [27]:
# Read and concatenate the forecasts of all the GCMs into the test arrays.
# No standardization here (`standardize=False`, hence four return values only):
# the forecasts are standardized later with the scalers obtained from the hindcasts.
# `step` is taken from the parameters cell instead of being hard-coded to 3, so that
# changing the parameter is actually honoured here.
(
    X_data_test,
    X_index_test,
    GCM_records_test,
    GCM_coords_test,
) = concat_GCMs(
    GCMs,
    var_name=var_X.upper(),
    period='forecasts',
    rpath=rpath,
    domain='ext_regional',
    standardize=False,
    flatten=True,
    ensmean=True,
    step=step,
)


-----------------   getting ECMWF
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M
number of files in the archive: 36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting UKMO
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M
number of files in the archive: 28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting METEO_FRANCE
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs

### coordinates 

In [28]:
GCM_coords_train['ECMWF']['lat'].shape

(4929,)

In [29]:
GCM_coords_train['ECMWF']['lon'].shape

(4929,)

### create a multi-index containing the lat and lon for each column 

In [30]:
latlon = [GCM_coords_train['ECMWF']['lat'].data, GCM_coords_train['ECMWF']['lon'].data]

### cast the training data into a DataFrame 

In [31]:
df_train = pd.DataFrame(data=X_data_train, index=X_index_train, columns=latlon)

### add the column containing the GCM names 

In [32]:
df_train.loc[:,'GCM'] = GCM_records_train

### cast the testing data into a DataFrame 

In [33]:
df_test = pd.DataFrame(data=X_data_test, index=X_index_test, columns=latlon)

In [34]:
df_test.loc[:,'GCM'] = GCM_records_test

### Now do the same as above, but for the standardized version of the training dataset 

### standardized training set 

In [35]:
df_train_std = pd.DataFrame(data=X_data_train_std, index=X_index_train, columns=latlon)

In [36]:
df_train_std.loc[:,'GCM'] = GCM_records_train

### apply the scalers ('trained' over the training set) to the test set 

In [37]:
scalers_dict

{'ECMWF': StandardScaler(),
 'UKMO': StandardScaler(),
 'METEO_FRANCE': StandardScaler(),
 'DWD': StandardScaler(),
 'CMCC': StandardScaler(),
 'NCEP_CFSv2': StandardScaler(),
 'CanCM4i': StandardScaler(),
 'GEM_NEMO': StandardScaler(),
 'NASA_GEOSS2S': StandardScaler(),
 'CanSIPSv2': StandardScaler(),
 'JMA': StandardScaler()}

In [38]:
np.unique(GCM_records_test)

array(['CMCC', 'CanCM4i', 'CanSIPSv2', 'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
       'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO'], dtype='<U12')

### create an empty numpy array with the same shape as X_data_test

In [39]:
# Pre-allocate the standardized test array, filled with NaN instead of being left uninitialised:
# a row that no per-GCM scaler overwrites afterwards then shows up as missing
# (and is removed by the `dropna` applied before saving) rather than holding arbitrary memory content.
# assumes X_data_test has a floating-point dtype (NaN cannot be stored in an integer array) -- TODO confirm
X_data_test_std = np.full_like(X_data_test, np.nan)

In [40]:
# Standardize the forecasts GCM by GCM, re-using the scaler of the same GCM
# returned by the hindcast (training) call -- the scalers are only applied here, not re-fitted
for GCM in GCMs:
    rows = GCM_records_test == GCM  # boolean mask of the test rows belonging to this GCM
    X_data_test_std[rows] = scalers_dict[GCM].transform(X_data_test[rows])

In [41]:
df_test_std = pd.DataFrame(data=X_data_test_std, index=X_index_test, columns=latlon)

In [42]:
df_test_std.loc[:,'GCM'] = GCM_records_test

#### pickle the standard scalers 

### the means and standard deviations stored in the scalers are accessible through the `mean_` and `scale_` attributes 

In [43]:
scalers_dict['ECMWF'].mean_.shape

(4929,)

In [44]:
scalers_dict['ECMWF'].scale_.shape

(4929,)

In [45]:
import pickle

# Persist the dictionary of per-GCM scalers next to the notebook, one file per GCM variable,
# so that the same standardization can be re-applied later
scalers_file = f'./GCMs_StandardScalers_{var_X}.pickle'
with open(scalers_file, 'wb') as fout:
    pickle.dump(scalers_dict, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [46]:
# to open 
# with open(f'./GCMs_StandardScalers_{var_X}.pickle', 'rb') as f: 
#     dict_scalers = pickle.load(f)

### Now fix the last column, from tuple `('GCM', '')` to just 'GCM'

In [47]:
cols = df_train.columns.to_list()

In [48]:
cols[-1] = 'GCM'

In [49]:
# Give the four feature tables (raw / standardized, train / test) the same column labels,
# i.e. the (lat, lon) tuples followed by a plain 'GCM' label
for frame in (df_train, df_train_std, df_test, df_test_std):
    frame.columns = cols

In [50]:
df_train.shape

(4042, 4930)

In [51]:
df_train_std.shape

(4042, 4930)

In [52]:
GCM_records_train.shape

(4042,)

## TARGETS 

In [53]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [54]:
# Read the target time-series of each of the 6 NZ regions. The columns of every table are
# prefixed with the region code (2-level column MultiIndex) so that the tables
# can be concatenated side by side afterwards.
targets = []
for reg in ('NNI', 'WNI', 'ENI', 'NSI', 'WSI', 'ESI'):
    fname = f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv'
    target = pd.read_csv(dpath_target / target_var / reg / fname, index_col=0, parse_dates=True)
    target.columns = pd.MultiIndex.from_product([[reg], target.columns])
    targets.append(target)

In [55]:
targets = pd.concat(targets, axis=1)

In [56]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean,cat_3,anomalies,Tmean,cat_3,anomalies,Tmean,cat_3,anomalies,Tmean,cat_3,anomalies,Tmean,cat_3,anomalies,Tmean,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.334898,3,0.465166,16.082092,3,0.304931,16.679982,3,0.502118,13.732586,2,-0.217639,11.877792,1,-0.564083,13.713081,2,-0.283684
1979-04-30,16.843733,2,0.216447,14.603668,2,0.21687,14.91733,2,0.189383,12.03321,1,-0.446051,10.507519,1,-0.349023,11.984851,2,-0.320514
1979-05-31,14.74733,2,0.276076,12.512171,2,0.376263,12.63052,2,0.167765,9.707791,1,-0.433259,7.936031,1,-0.455407,9.166171,1,-0.602066
1979-06-30,12.155292,2,0.014663,9.86963,2,0.075591,9.890404,1,-0.153712,7.196983,1,-0.37233,5.447644,1,-0.275615,6.650281,1,-0.318733
1979-07-31,10.388794,2,0.012815,8.18276,2,0.123649,8.139223,2,-0.148967,5.561078,2,-0.07733,3.780598,2,0.103711,4.945369,2,0.0918


In [57]:
target_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [58]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [59]:
target_anomalies.columns = target_anomalies.columns.droplevel(1)

In [60]:
target_terciles.columns = target_terciles.columns.droplevel(1)

In [61]:
target_terciles.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,2,1,2
1979-04-30,2,2,2,1,1,2
1979-05-31,2,2,2,1,1,1
1979-06-30,2,2,1,1,1,1
1979-07-31,2,2,2,2,2,2


In [62]:
target_anomalies.head()

Unnamed: 0_level_0,NNI,WNI,ENI,NSI,WSI,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,0.465166,0.304931,0.502118,-0.217639,-0.564083,-0.283684
1979-04-30,0.216447,0.21687,0.189383,-0.446051,-0.349023,-0.320514
1979-05-31,0.276076,0.376263,0.167765,-0.433259,-0.455407,-0.602066
1979-06-30,0.014663,0.075591,-0.153712,-0.37233,-0.275615,-0.318733
1979-07-31,0.012815,0.123649,-0.148967,-0.07733,0.103711,0.0918


### rename the columns for the target anomalies 

In [63]:
target_anomalies.columns = [f"{x}_anoms" for x in target_anomalies.columns]

In [64]:
target_terciles.columns = [f"{x}_cat3_categories" for x in target_terciles.columns]

In [65]:
target_terciles.head()

Unnamed: 0_level_0,NNI_cat3_categories,WNI_cat3_categories,ENI_cat3_categories,NSI_cat3_categories,WSI_cat3_categories,ESI_cat3_categories
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,3,3,3,2,1,2
1979-04-30,2,2,2,1,1,2
1979-05-31,2,2,2,1,1,1
1979-06-30,2,2,1,1,1,1
1979-07-31,2,2,2,2,2,2


In [66]:
target_anomalies.head()

Unnamed: 0_level_0,NNI_anoms,WNI_anoms,ENI_anoms,NSI_anoms,WSI_anoms,ESI_anoms
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-03-31,0.465166,0.304931,0.502118,-0.217639,-0.564083,-0.283684
1979-04-30,0.216447,0.21687,0.189383,-0.446051,-0.349023,-0.320514
1979-05-31,0.276076,0.376263,0.167765,-0.433259,-0.455407,-0.602066
1979-06-30,0.014663,0.075591,-0.153712,-0.37233,-0.275615,-0.318733
1979-07-31,0.012815,0.123649,-0.148967,-0.07733,0.103711,0.0918


### apply the GCM indices, which will have the effect of duplicating the dates ...

#### training samples 

In [67]:
target_terciles_train = target_terciles.reindex(X_index_train)

In [68]:
target_anomalies_train = target_anomalies.reindex(X_index_train)

#### test period. Note that there will be missing values in the targets ... 

In [69]:
target_terciles_test = target_terciles.reindex(X_index_test)

In [70]:
target_anomalies_test = target_anomalies.reindex(X_index_test)

### Now concatenate the GCM outputs, and the target terciles and anomalies, along the axis=1 (column)

#### 'raw' anomalies (non-standardized)

In [71]:
df_train_targets = pd.concat([df_train, target_anomalies_train, target_terciles_train], axis=1)

In [72]:
df_test_targets = pd.concat([df_test, target_anomalies_test, target_terciles_test], axis=1)

#### GCM anomalies standardized (per GCM)

In [73]:
df_train_std_targets = pd.concat([df_train_std, target_anomalies_train, target_terciles_train], axis=1)

In [74]:
df_test_std_targets = pd.concat([df_test_std, target_anomalies_test, target_terciles_test], axis=1)

In [75]:
opath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [76]:
# Create the output directory together with any missing parent. `exist_ok=True` makes the call
# a no-op when the directory is already there, which replaces the separate `exists()` check
# (and the window between that check and the creation).
opath.mkdir(parents=True, exist_ok=True)

### truncate to avoid data leakage, the last season of the training should be NDJ 2016 - 2017, and the first season in the test set at the earliest FMA 2017

In [77]:
# Drop the last two rows of each GCM from the raw training table, and print the last date
# that is kept, to check that the training period ends where expected
gcm_labels = df_train_targets.loc[:, 'GCM']
ltr = []
for gcm in gcm_labels.unique():
    df = df_train_targets.loc[gcm_labels == gcm, :].iloc[:-2, :]
    print(df.index[-1])
    ltr.append(df)

2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00


In [78]:
df_train_targets.shape

(4042, 4942)

In [79]:
df_train_targets = pd.concat(ltr, axis=0)

In [80]:
df_train_targets.shape

(4020, 4942)

In [81]:
# Same truncation for the standardized training table: the last two rows of each GCM
# are dropped, and the last date that is kept is printed as a check
gcm_labels_std = df_train_std_targets.loc[:, 'GCM']
ltr = []
for gcm in gcm_labels_std.unique():
    df = df_train_std_targets.loc[gcm_labels_std == gcm, :].iloc[:-2, :]
    print(df.index[-1])
    ltr.append(df)

2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00
2017-01-31 00:00:00


In [82]:
df_train_std_targets.shape

(4042, 4942)

In [83]:
df_train_std_targets = pd.concat(ltr, axis=0)

In [84]:
df_train_std_targets.shape

(4020, 4942)

### saves in CSVs 

#### 'raw' GCM anomalies version 

In [85]:
df_train_targets = df_train_targets.dropna(axis=0)

In [86]:
df_train_targets.to_csv(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')

In [87]:
df_test_targets = df_test_targets.dropna(axis=0)

In [88]:
df_test_targets.to_csv(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')

#### standardized GCM anomalies version 

In [89]:
df_train_std_targets = df_train_std_targets.dropna(axis=0)

In [90]:
df_train_std_targets.to_csv(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.csv')

In [91]:
df_test_std_targets = df_test_std_targets.dropna(axis=0)

In [92]:
df_test_std_targets.to_csv(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.csv')

In [93]:
target_var

'TMEAN'

In [94]:
var_X

't2m'

### saves in PARQUET format 

### transform the columns (multiindex) to string 

In [95]:
tuple_columns = df_train_targets.columns

In [96]:
# Flatten the column labels to plain strings: tuple labels become "first | second",
# labels that are not tuples are kept unchanged
str_columns = [
    col if not isinstance(col, tuple) else "{} | {}".format(*col)
    for col in tuple_columns
]

In [97]:
df_train_targets.columns = str_columns

In [98]:
df_test_targets.columns = str_columns

In [99]:
df_train_std_targets.columns = str_columns

In [100]:
df_test_std_targets.columns = str_columns

### Now save 

In [101]:
df_train_targets.to_parquet(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')

In [102]:
df_test_targets.to_parquet(opath / f'GCMs_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')

#### standardized GCM anomalies version 

In [103]:
df_train_std_targets.to_parquet(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_training_set.parquet')

In [104]:
df_test_std_targets.to_parquet(opath / f'GCMs_std_and_targets_cat3_and_anomalies_{target_var}_test_set.parquet')