In [21]:
# Parameters 

provider = 'CDS'  # GCM provider, passed to concat_GCMs
var_X = 't2m'  # presumably the name of the predictor field -- not referenced in the cells visible here, TODO confirm
domain = 'ext_regional'  # expected to be one of the keys of domain_def
target_var = 'TMEAN'  # target variable, used to build the folder and file names of the target CSV files
target_type = 'cat_3'  # 'cat_3' selects the tercile categories as target, 'anomalies' the anomalies
region_name = 'NNI'  # region whose column is extracted from the targets to build y

### load external modules 

In [22]:
%matplotlib inline

In [23]:
import os
import sys 
import pathlib
from shutil import copytree, rmtree
import itertools

In [24]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [25]:
HOME = pathlib.Path.home()

In [26]:
from matplotlib import pyplot as plt

In [27]:
import proplot as plot

In [28]:
import numpy as np
import pandas as pd

In [29]:
import xarray as xr

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

### load local modules 

In [31]:
sys.path.append('../../../ml4seas/')

In [32]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time

### domain definitions 

In [33]:
# each domain is described by 4 bounds, presumably [lon_min, lon_max, lat_min, lat_max] -- TODO confirm against get_GCM_outputs
domain_def = {
    'local': [150, 200, -50, -10],
    'regional': [90, 300, -65, 50],
    'ext_regional': [70, 300, -70, 60],  # alternative kept for reference: [50, 300, -75, 60]
    'global': [0, 360, -70, 70],
    'tropics': [0, 360, -40, 40],
}

### set the root path for the `data` folder 

In [34]:
rpath = set_root_dir(root='gdata')

In [35]:
provider = 'CDS'

In [36]:
GCMs = ['ECMWF','UKMO','METEO_FRANCE']
# GCMs = ['ECMWF']

In [37]:
step = 3

In [38]:
# %%writefile ../../../ml4seas/GCM/concat_GCMs.py 
def concat_GCMs(provider, GCMs, var_name='T2M', period='hindcasts', rpath=None, domain='ext_regional', standardize=True, flatten=True, ensmean=True, step=3): 
    """
    Returns the outputs of several GCMs concatenated along the time dimension
    
    Parameters
    ----------
    
    - provider : str, the provider in ['CDS','IRI','JMA'], no default 
    - GCMs : list, a list of GCMs in the provider 
    - var_name : str, the variable name passed to `get_GCM_outputs`, default to 'T2M'. 
      The data variable read from each dataset is `var_name.lower()` when the dataset contains it, 't2m' otherwise
    - period : the period to extract, in ['hindcasts','forecasts']
    - rpath : str or pathlib.Path, the path to the 'data' folder (a str is converted to a pathlib.Path)
    - domain : the domain, in ['local','regional','ext_regional', 'global', 'tropics']
    - standardize : Boolean, whether or not to fit (and apply) one scikit-learn StandardScaler per GCM, 
      intended to be True for 'hindcasts' and False for 'forecasts'
    - flatten : Boolean, whether or not to flatten the outputs along the spatial (and optionally members) dimension, default to True
    - ensmean : Boolean, whether or not to calculate the ensemble mean, default to True
    - step : the number of steps by which to shift the time index, to align with the observed target, default to 3 (assumes seasonal anomalies)
    
    Return
    ------
    
    if standardize=True, the tuple (X_data_l, X_data_l_std, X_index_l, GCM_records, scalers_dict), 
    if standardize=False, the tuple (X_data_l, X_index_l, GCM_records), where: 
    
    - X_data_l : numpy.array containing the data concatenated along the time dimension (axis=0)
    - X_data_l_std : numpy.array containing the standardized data concatenated along the time dimension (axis=0)
    - X_index_l : numpy.array of Python datetimes, containing the index (note that repeated values will be present)
    - GCM_records : numpy.array of len(X_index_l) containing the string for the corresponding GCM
    - scalers_dict : dictionary, with each item (key = GCM) corresponding to the fitted scikit-learn StandardScaler() object
    
    Raises
    ------
    
    - KeyError if `domain` is not one of the domains listed above
    
    """
    
    import sys
    import pathlib
    import numpy as np
    
    # sys.path must only contain strings: a pathlib.Path entry is ignored by the import system
    code_path = str(pathlib.Path.home() / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas')
    
    if code_path not in sys.path: 
        sys.path.append(code_path)
    
    from GCM import get_GCM_outputs, shift_dset_time
    
    def _concat(chunks): 
        """concatenates a list of arrays along axis 0, an empty list gives an empty array"""
        if not chunks: 
            return np.array([])
        # np.asarray makes sure that plain numpy arrays are returned, whatever the array type held by the dataset
        return np.concatenate([np.asarray(chunk) for chunk in chunks], axis=0)
    
    domain_def = {
        'local': [150, 200, -50, -10],
        'regional': [90, 300, -65, 50],
        'ext_regional': [70, 300, -70, 60],  # alternative kept for reference: [50, 300, -75, 60]
        'global': [0, 360, -70, 70],
        'tropics': [0, 360, -40, 40],
    }
    
    if isinstance(rpath, str): 
        rpath = pathlib.Path(rpath)
    
    if standardize: 
        
        # imported here so that the function does not rely on the namespace of the calling notebook
        from sklearn.preprocessing import StandardScaler
        
        scalers_dict = {}
    
    GCM_records = []
    X_index_l = []
    X_data_l = []
    X_data_l_std = []
    
    for GCM in GCMs: 
    
        print(f"getting {GCM}")
    
        # the second returned value (the coordinates) is not needed here
        dset, _ = get_GCM_outputs(provider=provider, GCM=GCM, var_name=var_name, period=period, rpath=rpath, domain=domain_def[domain], step=step, flatten=flatten, ensmean=ensmean)
        
        if 'valid_time' in dset.coords: 
            dset = dset.drop('valid_time')
            
        dset = shift_dset_time(dset, step=step)
        
        # use the variable that was requested, and fall back to the historical 't2m' if it is not in the dataset
        data_var = var_name.lower()
        
        if data_var not in dset.data_vars: 
            data_var = 't2m'
        
        X_data = dset[data_var].data
        
        X_index = dset['time'].to_index().to_pydatetime()
        
        # append and records 
        
        GCM_records.append(np.repeat([GCM], len(X_index)))
        
        X_index_l.append(X_index)
        
        X_data_l.append(X_data)
        
        if standardize: 
        
            # one scaler per GCM, kept so that it can be applied later to another period
            scaler = StandardScaler().fit(X_data)

            scalers_dict[GCM] = scaler

            X_data_l_std.append(scaler.transform(X_data))
        
    GCM_records = _concat(GCM_records)

    X_index_l = _concat(X_index_l)

    X_data_l = _concat(X_data_l)

    if standardize: 
    
        X_data_l_std = _concat(X_data_l_std)

        return X_data_l, X_data_l_std, X_index_l, GCM_records, scalers_dict
    
    return X_data_l, X_index_l, GCM_records

In [39]:
rpath = set_root_dir(root='gdata')

In [40]:
# hindcasts (training set): `domain` and `step` come from the notebook variables instead of being repeated as literals, 
# so that changing the parameters actually changes what is extracted
X_data_train, X_data_train_std, X_index_train, GCM_records_train, scalers_dict = concat_GCMs(provider, GCMs, var_name='T2M', period='hindcasts', rpath=rpath, domain=domain, standardize=True, flatten=True, ensmean=True, step=step)

getting ECMWF
/media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M
288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc
getting UKMO
/media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M
287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2016_12.nc
getting METEO_FRANCE
/media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/METEO_FRANCE/T2M
288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/METEO_FRANCE/T2M/METEO_FRANC

In [41]:
rpath = set_root_dir(root='local')

In [42]:
# forecasts (test set), not standardized here: `domain` and `step` come from the notebook variables 
# instead of being repeated as literals, so that changing the parameters actually changes what is extracted
X_data_test, X_index_test, GCM_records_test = concat_GCMs(provider, GCMs, var_name='T2M', period='forecasts', rpath=rpath, domain=domain, standardize=False, flatten=True, ensmean=True, step=step)

getting ECMWF
/home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M
36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc
getting UKMO
/home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M
28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2019_12.nc
getting METEO_FRANCE
/home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/METEO_FRANCE/T2M
36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/METEO_FRANCE/T2M/METEO_FRANCE_T2M_seasonal_anomalies_in

In [43]:
X_data_test.shape

(100, 4929)

In [44]:
X_data_train.shape

(863, 4929)

### apply the standard scalers to the test dataset 

In [45]:
scalers_dict

{'ECMWF': StandardScaler(copy=True, with_mean=True, with_std=True),
 'UKMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'METEO_FRANCE': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [46]:
# pd.unique returns the GCMs in their order of first appearance, so the standardized blocks are stored 
# in the same order as the rows of X_data_test / X_index_test. 
# np.unique sorts the names alphabetically, which would misalign the standardized rows with the targets.
X_data_test_std = []
for GCM in pd.unique(GCM_records_test): 
    # rows of the test set belonging to this GCM, transformed with the scaler fitted on its hindcasts
    X_sub = X_data_test[GCM_records_test == GCM,:]
    X_sub_std = scalers_dict[GCM].transform(X_sub)
    X_data_test_std.append(X_sub_std)

### flatten

In [74]:
# stack the per-GCM blocks along the first (time) axis. 
# np.vstack returns an already stacked 2D array unchanged, so running this cell twice is harmless
X_data_test_std = np.vstack(X_data_test_std)

In [75]:
X_data_test_std.shape

(100, 4929)

## TARGETS 

In [47]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [48]:
# one DataFrame per region, the columns of each are prefixed by the region name (first level of a MultiIndex)
targets = []
for reg in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    fname = dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv'
    target = pd.read_csv(fname, index_col=0, parse_dates=True)
    target.columns = pd.MultiIndex.from_product([[reg], target.columns])
    targets.append(target)

In [49]:
targets = pd.concat(targets, axis=1)

In [50]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.278555,3,0.462528,16.051472,3,0.317965,16.732249,3,0.62154,13.811438,2,-0.093327,11.848419,1,-0.400334,13.728706,2,-0.222255
1979-04-30,16.794408,2,0.227319,14.586906,3,0.248038,14.953599,3,0.299057,12.18945,1,-0.249176,10.58958,2,-0.110858,12.033578,2,-0.208919
1979-05-31,14.695903,2,0.282907,12.52232,3,0.425773,12.716266,2,0.314655,9.888897,1,-0.215657,8.099501,1,-0.202497,9.232035,1,-0.470303
1979-06-30,12.093823,2,-0.001099,9.888909,2,0.117671,9.929897,1,-0.065854,7.19898,1,-0.300772,5.457298,1,-0.197458,6.634168,1,-0.254247
1979-07-31,10.290536,2,-0.061355,8.182231,2,0.120974,8.208954,2,-0.063564,5.534868,2,-0.058724,3.763353,2,0.085515,4.916423,2,0.112719


In [51]:
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [52]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [53]:
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [54]:
target_terciles.columns = target_terciles.columns.droplevel(1)

### target choice here (anomalies or tercile class, and region_name) 

In [55]:
# select the target series (one column, the region) according to the type of target
if target_type == 'cat_3': 
    y = target_terciles.loc[:,region_name]
elif target_type == 'anomalies': 
    y = targets_anomalies.loc[:,region_name]
else: 
    # fail early rather than leaving y undefined (or holding a value from a previous run)
    raise ValueError(f"target_type must be 'cat_3' or 'anomalies', got {target_type!r}")

In [56]:
print(f"\n\n\nNow going agead with TARGET {region_name}, {target_type} ------------------------ \n\n")




Now going agead with TARGET NNI, cat_3 ------------------------ 




### use the list of repeated index to select the target 

In [57]:
y_train = y.loc[X_index_train]

In [58]:
y_train.shape

(863,)

In [59]:
X_index_test.shape

(100,)

In [60]:
y_test = y.reindex(X_index_test)

In [61]:
y_test.shape

(100,)

### Now randomize 

#### get the shuffled indices 

In [76]:
np.random.seed(42)

In [77]:
# random permutation of the row positions: each training sample appears exactly once 
# (np.random.randint draws with replacement, which duplicates some samples and drops others)
shuffled_train_indexes = np.random.permutation(len(X_index_train))

In [78]:
# random permutation of the row positions: each test sample appears exactly once 
# (np.random.randint draws with replacement, which duplicates some samples and drops others)
shuffled_test_indexes = np.random.permutation(len(X_index_test))

#### apply the shuffled indices 

In [79]:
X_data_train_std_shuffled = X_data_train_std[shuffled_train_indexes,:]

In [80]:
X_data_test_std_shuffled = X_data_test_std[shuffled_test_indexes,:]

In [82]:
# the shuffled indexes are integer positions, not labels of the datetime index: select explicitly by position
y_train_shuffled = y_train.iloc[shuffled_train_indexes]

In [83]:
# the shuffled indexes are integer positions, not labels of the datetime index: select explicitly by position
y_test_shuffled = y_test.iloc[shuffled_test_indexes]

### pipeline 

In [None]:
from sklearn.pipeline import make_pipeline

--- 
---
---

### TARGETS 

In [None]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

#### MEAN TEMPERATURE (`TMEAN`) 

In [None]:
target_var = 'TMEAN'

In [None]:
# one DataFrame per region, the columns of each are prefixed by the region name (first level of a MultiIndex). 
# the loop variable is `reg` so that the `region_name` parameter of the notebook is not overwritten
targets = []
for reg in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    target = pd.read_csv(dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv', index_col=0, parse_dates=True)
    target.columns = pd.MultiIndex.from_product([[reg],target.columns])
    targets.append(target)

In [None]:
targets = pd.concat(targets, axis=1)

In [None]:
targets.head()

In [None]:
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [None]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [None]:
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [None]:
targets_anomalies.corr()

In [None]:
target_terciles.columns = target_terciles.columns.droplevel(1)

In [None]:
target_terciles.corr()

In [None]:
# line plot of the anomalies of the NNI region, drawn on an explicitly created axes
f, ax = plt.subplots()
targets_anomalies.NNI.plot(ax=ax, lw=2)
ax.grid(ls=':', color='w')  # dotted, white grid lines

### reduce the dimensionality of the hindcasts / forecasts using PCA 

### Get the data in a numpy array 

#### loads in case the underlying data structures are dask arrays 

In [None]:
dset_t2m_ecmwf_hindcasts.load()

In [None]:
X_t2m_train = dset_t2m_ecmwf_hindcasts['t2m'].data

### initialise the standard scaler 

In [None]:
scaler_t2m = StandardScaler()

### fit and transform 

In [None]:
X_t2m_train = scaler_t2m.fit_transform(X_t2m_train)

### verify that mean ~= 0 and std ~= 1 for all features (grid points)

In [None]:
X_t2m_train.mean(0)

In [None]:
X_t2m_train.std(0)

### initialise the PCA, 

#### percentage of variance we want to keep, scikit-learn will automatically select the number of PCs 

In [None]:
percent_variance = 0.8

#### initialisation 

In [None]:
# PCA is imported directly from sklearn.decomposition at the top of the notebook (there is no `pca` name in scope). 
# a float n_components in (0, 1) is the fraction of variance to retain
skpca_t2m = PCA(n_components=percent_variance)

#### fit AND transform, returns the PCs 

In [None]:
skpca_t2m_PCs_train = skpca_t2m.fit_transform(X_t2m_train)

### shape, number of pcs, do not forget that the 'member' dimension is also included in the z stacked dimension (member, lat, lon)

In [None]:
skpca_t2m_PCs_train.shape

In [None]:
n_pcs = skpca_t2m_PCs_train.shape[1]

In [None]:
print(n_pcs)

### gets the EOFs 

In [None]:
eofs_t2m_train = skpca_t2m.components_

In [None]:
eofs_t2m_train.shape

### reshape

In [None]:
dset_t2m_ecmwf_hindcasts.coords['z']

### Now calculate (project) the corresponding PCs in the forecast period

#### prior to that, transform using the Standard Scaler fitted over the hindcast period  

In [None]:
X_t2m_test =  dset_t2m_ecmwf_forecasts['t2m'].data

In [None]:
X_t2m_test = scaler_t2m.transform(X_t2m_test)

#### check that the mean and the std are not too far off 0 and 1 respectively; note that due to temperature trends, we expect an increase in the mean 

In [None]:
X_t2m_test.mean()

In [None]:
X_t2m_test.std()

#### now transforms using the pca object fitted previously on the training data 

In [None]:
skpca_t2m_PCs_test = skpca_t2m.transform(X_t2m_test)

### plots the PCs, casts these into a dataframe, with the correct time index 

In [None]:
df_skpca_t2m_PCs_train = pd.DataFrame(skpca_t2m_PCs_train, index=dset_t2m_ecmwf_hindcasts['time'].to_index())

In [None]:
df_skpca_t2m_PCs_test = pd.DataFrame(skpca_t2m_PCs_test, index=dset_t2m_ecmwf_forecasts['time'].to_index())

In [None]:
df_skpca_t2m_PCs_train.loc[:,0:10].plot(legend=None); 

In [None]:
df_skpca_t2m_PCs_test.loc[:,0:10].plot(legend=None); 

In [None]:
eofs_t2m_train.shape

In [None]:
coords_hindcasts.dims

In [None]:
eofs_t2m_train = eofs_t2m_train.reshape((n_pcs, coords_hindcasts.dims['member'], coords_hindcasts.dims['lat'], coords_hindcasts.dims['lon']))

In [None]:
eofs_t2m_train.shape

### put this into a dataset with the right dimensions

In [None]:
# content of the EOFs dataset: the coordinates, and the 'eof' array with dimensions (pc, member, lat, lon)
d = {
    'pc': ('pc', np.arange(n_pcs)),
    'member': coords_hindcasts['member'],
    'lat': coords_hindcasts['lat'],
    'lon': coords_hindcasts['lon'],
    'eof': (('pc', 'member', 'lat', 'lon'), eofs_t2m_train),
}

In [None]:
eofs_train_dset = xr.Dataset(d)

### plots all the EOFs along the member dimension 

In [None]:
eofs_train_dset.sel(pc=0)['eof'].plot(x='lon',y='lat', col='member', col_wrap=5, add_colorbar=False)

### Principal component associated with the first EOF 

In [None]:
df_skpca_t2m_PCs_train.loc[:,0].plot()

In [None]:
target_terciles

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [None]:
from sklearn import metrics

In [None]:
# accuracy and per-class F1 score over the forecast period, one entry per region
acc_score = {}
f1_score = {}

for region_name in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    
    # the tercile class of the region is joined (on the time index) as the last column of the PCs
    region_terciles = target_terciles.loc[:,region_name]
    
    train = df_skpca_t2m_PCs_train.merge(region_terciles, left_index=True, right_index=True)
    test = df_skpca_t2m_PCs_test.merge(region_terciles, left_index=True, right_index=True)
    
    # features = all the columns but the last, target = last column
    X_train, y_train = train.iloc[:,:-1].values, train.iloc[:,-1].values
    X_test, y_test = test.iloc[:,:-1].values, test.iloc[:,-1].values
    
    knn = KNN(n_neighbors=5, metric='minkowski', weights='distance', p=1)
    knn.fit(X_train, y_train)
    
    acc_score[region_name] = knn.score(X_test, y_test)
    
    # class probabilities, kept for inspection
    y_test_pred_prob = knn.predict_proba(X_test)
    
    y_test_pred = knn.predict(X_test)
    
    # average=None returns one F1 score per class
    f1_score[region_name] = metrics.f1_score(y_test, y_test_pred, average=None)

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# for each region: array of the accuracies obtained over the 10 stratified folds
acc_score_dict = {}

for region_name in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    
    # the tercile class of the region is joined (on the time index) as the last column of the PCs
    train = df_skpca_t2m_PCs_train.copy()
    train = train.merge(target_terciles.loc[:,region_name], left_index=True, right_index=True)

    test = df_skpca_t2m_PCs_test.copy()
    test = test.merge(target_terciles.loc[:,region_name], left_index=True, right_index=True)

    X_train = train.iloc[:,:-1].values
    y_train = train.iloc[:,-1].values

    X_test = test.iloc[:,:-1].values
    y_test = test.iloc[:,-1].values
    
    # hindcasts and forecasts are pooled, the folds are then drawn from the pooled samples
    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    
    skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    
    # dedicated name: `acc_score` is the dictionary of accuracies built in the previous cell
    fold_scores = []

    for train_index, test_index in skf.split(X, y):
        
        # dedicated names for the folds, so that the hindcast / forecast arrays above are not overwritten
        X_fold_train, X_fold_test = X[train_index], X[test_index]
        y_fold_train, y_fold_test = y[train_index], y[test_index]

        knn = KNN(n_neighbors=1, metric='minkowski', weights='distance', p=3)

        knn.fit(X_fold_train, y_fold_train)

        fold_scores.append(knn.score(X_fold_test, y_fold_test))
    
    acc_score_dict[region_name] = np.array(fold_scores) 
    

In [None]:
acc_score_df = pd.DataFrame(acc_score_dict)

In [None]:
acc_score_df.describe().loc[['min','mean','max']]

In [None]:
region_name = 'NNI'

In [None]:
# the tercile class of the selected region is joined (on the time index) as the last column of the PCs
region_terciles = target_terciles.loc[:,region_name]

train = df_skpca_t2m_PCs_train.merge(region_terciles, left_index=True, right_index=True)
test = df_skpca_t2m_PCs_test.merge(region_terciles, left_index=True, right_index=True)

# features = all the columns but the last, target = last column
X_train, y_train = train.iloc[:,:-1].values, train.iloc[:,-1].values
X_test, y_test = test.iloc[:,:-1].values, test.iloc[:,-1].values

# hindcast rows first, forecast rows after
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)


In [None]:
X.shape


In [None]:
y.shape

In [None]:
import autogluon as ag
from autogluon import TabularPrediction as task

In [None]:
df = pd.DataFrame(X)

In [None]:
df.loc[:,'y'] = y

In [None]:
df.info()

In [None]:
train_data = df.iloc[:270,:]

In [None]:
test_data = df.iloc[270:,:]

In [None]:
train_data.shape[0] + test_data.shape[0]

In [None]:
train_data = task.Dataset(train_data)

In [None]:
train_data.columns