In [1]:
# Parameters 

provider = 'CDS'
var_X = 'precip'
domain = 'ext_regional'
target_var = 'RAIN'
target_type = 'cat_3'
region_name = 'WNI'

### load external modules 

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
from shutil import copytree, rmtree
import itertools

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

### load local modules 

In [11]:
sys.path.append('../../../ml4seas/')

In [12]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time

### domain definitions 

In [13]:
domain_def = {}
domain_def['local'] = [150, 200, -50, -10]
domain_def['regional'] = [90, 300, -65, 50]
domain_def['ext_regional'] = [70, 300, -70, 60]
# domain_def['ext_regional'] = [50, 300, -75, 60]
domain_def['global'] = [0, 360, -70, 70]
domain_def['tropics'] = [0, 360, -40, 40]

### set the root path for the `data` folder 

In [14]:
rpath = set_root_dir(root='gdata')

In [15]:
provider = 'CDS'

In [16]:
GCMs = ['ECMWF','UKMO','METEO_FRANCE','DWD','CMCC']
# GCMs = ['ECMWF']

In [17]:
step = 3

In [18]:
# %%writefile ../../../ml4seas/GCM/concat_GCMs.py 
def concat_GCMs(provider, GCMs, var_name='T2M', period='hindcasts', rpath=None, domain='ext_regional', standardize=True, flatten=True, ensmean=True, step=3): 
    """
    Returns many GCM outputs concatenated along the time dimension
    
    Parameters
    ----------
    
    - provider : str, the provider in ['CDS','IRI','JMA'], no default 
    - GCMs : list, a list of GCMs in the provider 
    - period : the period to extract, in ['hindcasts','forecasts']
    - rpath : str or pathlib.Path, the path to the 'data' folder 
    - domain : the domain, in ['local','regional','ext_regional', 'global', 'tropics']
    - standardize : Boolean, must be True for 'hindcasts', False for 'forecasts'
    - flatten : Boolean, whether or not to flatten the outputs along the spatial (and optionally members) dimension, default to True
    - ensmean : Boolean, whether or not to calculate the ensemble mean, default to True
    - step : the number of step by which to shift the time index, to align with observed target, default to 3 (assumes seasonal anomalies)
    
    Return
    ------
    
    - X_data_l : numpy.array containing the data concatenated along the time dimension (axis=0)
    - X_data_l_std : if standardized=True, numpy.array containing the standardized data concatenated along the time dimension (axis=0)
    - X_index_l :  numpy.array of Python datatimes, containing the index (note that repeated values will be present)
    - GCM_records : numpy.array of len(X_index_l) containing the string for the corresponding GCM
    - scalers_dict : if standardized=True, dictionnary, with each item (key = GCM) corresponding to fitted scikit-learn StandardScaler() object
    
    
    """
    
    import sys
    import pathlib
    import itertools
    import numpy as np
    
    HOME = pathlib.Path.home()
    
    sys.path.append(HOME / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas')
    
    from utils import set_root_dir
    from GCM import get_GCM_outputs, shift_dset_time
    
    GCM_records = []
    X_index_l = []
    X_data_l = []
    
    if standardize:
        X_data_l_std = []

    domain_def = {}
    domain_def['local'] = [150, 200, -50, -10]
    domain_def['regional'] = [90, 300, -65, 50]
    domain_def['ext_regional'] = [70, 300, -70, 60]
    # domain_def['ext_regional'] = [50, 300, -75, 60]
    domain_def['global'] = [0, 360, -70, 70]
    domain_def['tropics'] = [0, 360, -40, 40]    

    if standardize: 
        
        scalers_dict = {}
    
    if isinstance(rpath, str): 
        rpath = pathlib.Path(rpath)
    
    for GCM in GCMs: 
    
        print(f"\n-----------------   getting {GCM}")
    
        dset, coords = get_GCM_outputs(provider=provider, GCM=GCM, var_name=var_name, period=period, rpath=rpath, domain=domain_def[domain], step=step, flatten=flatten, ensmean=ensmean)
        
        if 'valid_time' in dset.coords: 
            dset = dset.drop('valid_time')        
            
        dset = shift_dset_time(dset, step=step)
        
        X_data = dset[var_name.lower()].data
        
        X_index = dset['time'].to_index().to_pydatetime()
        
        if standardize: 
        
            scaler = StandardScaler() 

            scaler = scaler.fit(X_data)

            scalers_dict[GCM] = scaler

            X_data_std = scaler.transform(X_data)

        # append and records 
        
        GCM_records.append(np.repeat([GCM], len(X_index)))
        
        X_index_l.append(X_index)
        
        X_data_l.append(X_data)

        if standardize: 
        
            X_data_l_std.append(X_data_std)
        
    GCM_records = np.array(list(itertools.chain(*GCM_records)))

    X_index_l = np.array(list(itertools.chain(*X_index_l)))

    X_data_l = np.array(list(itertools.chain(*X_data_l)))

    if standardize: 
    
        X_data_l_std = np.array(list(itertools.chain(*X_data_l_std)))

        return X_data_l, X_data_l_std, X_index_l, GCM_records, scalers_dict
    
    else: 
        
        return X_data_l, X_index_l, GCM_records

In [19]:
rpath = set_root_dir(root='gdata')

In [20]:
X_data_train, X_data_train_std, X_index_train, GCM_records_train, scalers_dict = concat_GCMs(provider, GCMs, var_name=var_X.upper(), period='hindcasts', rpath=rpath, domain='ext_regional', standardize=True, flatten=True, ensmean=True, step=3)


-----------------   getting ECMWF
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP
number of files in the archive: 288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2016_12.nc

-----------------   getting UKMO
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP
number of files in the archive: 287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2016_12.nc

-----------------   getting METEO_FRANCE
reading files 

In [21]:
rpath = set_root_dir(root='local')

In [22]:
X_data_test, X_index_test, GCM_records_test = concat_GCMs(provider, GCMs, var_name=var_X.upper(), period='forecasts', rpath=rpath, domain='ext_regional', standardize=False, flatten=True, ensmean=True, step=3)


-----------------   getting ECMWF
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP
number of files in the archive: 36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2019_12.nc

-----------------   getting UKMO
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP
number of files in the archive: 28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/PRECIP/UKMO_PRECIP_seasonal_anomalies_interp_2019_12.nc

-----------------   getting METEO_FRANCE
reading files from /home/nicolasf/

In [23]:
X_data_test.shape

(150, 4929)

In [24]:
X_data_train.shape

(1439, 4929)

### apply the standard scalers to the test dataset 

In [25]:
scalers_dict

{'ECMWF': StandardScaler(copy=True, with_mean=True, with_std=True),
 'UKMO': StandardScaler(copy=True, with_mean=True, with_std=True),
 'METEO_FRANCE': StandardScaler(copy=True, with_mean=True, with_std=True),
 'DWD': StandardScaler(copy=True, with_mean=True, with_std=True),
 'CMCC': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [26]:
X_data_test_std = []
for GCM in np.unique(GCM_records_test): 
    X_sub = X_data_test[GCM_records_test == GCM,:]
    X_sub_std = scalers_dict[GCM].transform(X_sub)
    X_data_test_std.append(X_sub_std)

### flatten

In [27]:
X_data_test_std = np.array(list(itertools.chain(*X_data_test_std)))

In [28]:
X_data_test_std.shape

(150, 4929)

## TARGETS 

In [29]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [30]:
targets = []
for reg in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    target = pd.read_csv(dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv', index_col=0, parse_dates=True)
    target.columns = pd.MultiIndex.from_product([[reg],target.columns])
    targets.append(target)

In [31]:
targets = pd.concat(targets, axis=1)

In [32]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies,Rain_bc,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,470.822673,3,176.364977,357.294404,3,49.163323,472.045206,3,180.723225,498.174979,3,66.560252,783.318801,3,143.904216,221.08315,3,49.95124
1979-04-30,546.72,3,229.631158,440.482174,3,121.697362,492.918286,3,181.818099,564.466858,3,129.367183,690.960327,3,94.298941,215.2389,3,51.450578
1979-05-31,493.002426,3,134.029467,523.61263,3,167.590461,514.622032,3,174.868212,652.528876,3,163.604985,730.426131,3,78.714666,294.27635,3,126.515812
1979-06-30,429.186337,2,13.302417,404.922302,2,-9.359534,306.615937,1,-71.251663,579.009926,2,25.933292,662.720245,2,1.927881,165.33315,2,-0.559937
1979-07-31,549.57599,3,77.774812,407.60008,1,-53.43867,368.72946,1,-65.591395,602.019278,2,18.614145,606.331144,2,-34.887512,187.18665,3,12.60062


In [33]:
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [34]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [35]:
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [36]:
target_terciles.columns = target_terciles.columns.droplevel(1)

### target choice here (anomalies or tercile class, and region_name) 

In [37]:
if target_type == 'cat_3': 
    y = target_terciles.loc[:,region_name]
elif target_type == 'anomalies': 
    y = target_anomalies.loc[:,region_name]

In [38]:
print(f"\n\n\nNow going agead with TARGET {region_name}, {target_type} ------------------------ \n\n")




Now going agead with TARGET WNI, cat_3 ------------------------ 




### use the list of repeated index to select the target 

In [39]:
y_train = y.loc[X_index_train]

In [40]:
y_train.shape

(1439,)

In [41]:
X_index_test.shape

(150,)

In [42]:
y_test = y.reindex(X_index_test)

In [43]:
y_test.shape

(150,)

### Now randomize 

#### get the shuffled indices 

In [44]:
np.random.seed(42)

In [45]:
shuffled_train_indexes = np.random.randint(0, len(X_data_train_std), len(X_data_train_std))

In [46]:
shuffled_test_indexes = np.random.randint(0, len(X_data_test_std), len(X_data_test_std))

#### apply the shuffled indices 

In [47]:
X_data_train_std_shuffled = X_data_train_std[shuffled_train_indexes,:]

In [48]:
X_data_test_std_shuffled = X_data_test_std[shuffled_test_indexes,:]

In [49]:
y_train_shuffled = y_train[shuffled_train_indexes]

In [50]:
y_test_shuffled = y_test[shuffled_test_indexes]

In [51]:
shuffled_test_indexes.shape

(150,)

### KNN now with stratified k fold 

In [52]:
from sklearn.model_selection import StratifiedKFold

In [53]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [54]:
X = X_data_train_std_shuffled
y = y_train_shuffled

In [55]:
X.shape

(1439, 4929)

In [56]:
y.shape

(1439,)

In [57]:
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

skf.get_n_splits(X, y)

10

In [58]:
n_neighbors = 1

In [59]:
acc_score = []

for train_index, test_index in skf.split(X, y):
    
    X_tr, X_te = X[train_index], X[test_index]
    
    y_tr, y_te = y[train_index], y[test_index]

    knn = KNN(n_neighbors=n_neighbors, metric='minkowski', weights='distance', p=3, n_jobs=-1)

    knn.fit(X_tr, y_tr)

    acc_score.append(knn.score(X_te, y_te))

In [60]:
acc_score

[0.8275862068965517,
 0.8896551724137931,
 0.8758620689655172,
 0.8344827586206897,
 0.8055555555555556,
 0.8251748251748252,
 0.8671328671328671,
 0.8601398601398601,
 0.8671328671328671,
 0.7902097902097902]

In [61]:
knn = KNN(n_neighbors=n_neighbors, metric='minkowski', weights='distance', p=3, n_jobs=-1)

In [62]:
knn.fit(X_data_train_std_shuffled, y_train_shuffled)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=1, p=3,
                     weights='distance')

In [63]:
y_test_pred = knn.predict(X_data_test_std_shuffled)

In [64]:
y_test_pred_probs = knn.predict_proba(X_data_test_std_shuffled)

In [65]:
y_test_pred.shape

(150,)

In [66]:
y_test_shuffled

time
2019-08-31    2.0
2019-10-31    1.0
2017-12-31    1.0
2019-05-31    2.0
2018-10-31    1.0
             ... 
2020-01-31    NaN
2019-08-31    2.0
2018-06-30    3.0
2017-06-30    3.0
2019-07-31    1.0
Name: WNI, Length: 150, dtype: float64

In [67]:
y_test_shuffled = y_test_shuffled.to_frame()

In [68]:
y_test_shuffled.loc[:,'y_hat'] = y_test_pred

In [69]:
y_test_shuffled = y_test_shuffled.dropna(axis=0)

In [70]:
(y_test_shuffled.loc[:,region_name].values == y_test_shuffled.y_hat.values).sum() / len(y_test_shuffled)

0.36231884057971014

### simulate a model that is re-trained every month 

In [71]:
X_data_train.shape

(1439, 4929)

In [72]:
y_test.shape

(150,)

In [73]:
X_mat_train = X_data_train.copy()

In [74]:
X_mat_test = X_data_test.copy()

In [75]:
len(X_mat_test)

150

In [76]:
len(y_test)

150

In [77]:
y_mat_train = y_train.values.copy()

In [78]:
y_mat_train.shape

(1439,)

In [79]:
X_mat_train.shape

(1439, 4929)

In [80]:
y_test.shape

(150,)

In [81]:
y_hat = []

In [82]:
y_test

time
2017-04-30    3.0
2017-05-31    3.0
2017-06-30    3.0
2017-07-31    1.0
2017-08-31    2.0
             ... 
2019-11-30    1.0
2019-12-31    1.0
2020-01-31    NaN
2020-02-29    NaN
2020-03-31    NaN
Name: WNI, Length: 150, dtype: float64

In [83]:
X_mat_test = X_mat_test[~np.isnan(y_test)]

In [84]:
X_mat_test.shape

(135, 4929)

In [85]:
y_test = y_test[~np.isnan(y_test)]

In [86]:
y_hat = []
y_hat_probs = []

In [None]:
for i in range(len(y_test)): 
    
    scaler = StandardScaler()
    
    scaler.fit(X_mat_train)
    
    X_mat_train_std = scaler.transform(X_mat_train)
    
    X_mat_test_std = scaler.transform(X_mat_test[i,:].reshape(1, -1))
    
    knn = KNN(n_neighbors=n_neighbors, metric='minkowski', weights='distance', p=3, n_jobs=-1)
    
    knn.fit(X_mat_train_std, y_mat_train)
    
    y_hat.append(knn.predict(X_mat_test_std))
    
    y_hat_probs.append(knn.predict_proba(X_mat_test_std))
    
    X_mat_train = np.r_[X_mat_train, X_mat_test[i,:].reshape(1,-1)]
    
    y_mat_train = np.append(y_mat_train, y_test[i])    

In [None]:
y_mat_train.shape

In [None]:
y_hat = np.array(y_hat)

In [None]:
y_hat_probs = np.array(y_hat_probs)

In [None]:
len(y_test)

In [None]:
y_test

In [None]:
y_all = y_test.to_frame(name='y')

In [None]:
y_all.loc[:,'y_hat'] = y_hat

In [None]:
# y_all.loc[:,'y_hat_probs'] = y_hat_probs

In [None]:
(y_all.y_hat.values == y_all.y.values).sum() / len(y_hat)

In [None]:
y_all.index.unique()[0]

In [None]:
date =  y_all.index.unique()[0]

In [None]:
y_all.loc[date,:].mode()

In [None]:
maj_ = []
for date in y_all.index.unique(): 
    print(f"{date:%Y-%m} number of available GCMs: {len(y_all.loc[date,:])}")
    maj_.append(y_all.loc[date,:].mode())

In [None]:
maj = pd.concat(maj_)

In [None]:
(maj.y_hat.values == maj.y.values).sum() / len(maj)