In [1]:
# Parameters 

provider = 'CDS'
var_X = 't2m'
domain = 'ext_regional'
target_var = 'TMEAN'
target_type = 'cat_3'
region_name = 'NNI'

### load external modules 

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
from shutil import copytree, rmtree
import itertools

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
from sklearn.preprocessing import StandardScaler

### load local modules 

In [11]:
sys.path.append('../../../ml4seas/')

In [12]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time, concat_GCMs

### domain definitions 

In [13]:
domain_def = {}
domain_def['local'] = [150, 200, -50, -10]
domain_def['regional'] = [90, 300, -65, 50]
domain_def['ext_regional'] = [70, 300, -70, 60]
# domain_def['ext_regional'] = [50, 300, -75, 60]
domain_def['global'] = [0, 360, -70, 70]
domain_def['tropics'] = [0, 360, -40, 40]

### set the root path for the `data` folder 

In [14]:
rpath = set_root_dir(root='gdata')

In [15]:
GCMs = ['ECMWF', 'UKMO', 'METEO_FRANCE', 'DWD', 'CMCC', 'NCEP_CFSv2', 'CanCM4i', 'GEM_NEMO', 'NASA_GEOSS2S', 'CanSIPSv2']

In [16]:
step = 3

In [17]:
rpath = set_root_dir(root='gdata')

In [18]:
X_data_train, X_data_train_std, X_index_train, GCM_records_train, scalers_dict = concat_GCMs(GCMs, var_name=var_X.upper(), period='hindcasts', rpath=rpath, domain='ext_regional', standardize=True, flatten=True, ensmean=True, step=3)


-----------------   getting ECMWF
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M
number of files in the archive: 288
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting UKMO
reading files from /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M
number of files in the archive: 287
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_1993_02.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2016_12.nc

-----------------   getting METEO_FRANCE
reading files from /media/nicolasf/GDATA/END

In [19]:
rpath = set_root_dir(root='local')

In [20]:
X_data_test, X_index_test, GCM_records_test = concat_GCMs(GCMs, var_name=var_X.upper(), period='forecasts', rpath=rpath, domain='ext_regional', standardize=False, flatten=True, ensmean=True, step=3)


-----------------   getting ECMWF
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M
number of files in the archive: 36
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting UKMO
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M
number of files in the archive: 28
first file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2017_09.nc
last file is /home/nicolasf/research/Smart_Ideas/data/GCMs/processed/forecasts/CDS/UKMO/T2M/UKMO_T2M_seasonal_anomalies_interp_2019_12.nc

-----------------   getting METEO_FRANCE
reading files from /home/nicolasf/research/Smart_Ideas/data/GCMs

In [21]:
X_data_test.shape

(300, 4929)

In [22]:
X_data_train.shape

(3526, 4929)

## TARGETS 

In [23]:
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [24]:
targets = []
for reg in ['NNI','WNI','ENI','NSI','WSI','ESI']: 
    target = pd.read_csv(dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv', index_col=0, parse_dates=True)
    target.columns = pd.MultiIndex.from_product([[reg],target.columns])
    targets.append(target)

In [25]:
targets = pd.concat(targets, axis=1)

In [26]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.278555,3,0.462528,16.051472,3,0.317965,16.732249,3,0.62154,13.811438,2,-0.093327,11.848419,1,-0.400334,13.728706,2,-0.222255
1979-04-30,16.794408,2,0.227319,14.586906,3,0.248038,14.953599,3,0.299057,12.18945,1,-0.249176,10.58958,2,-0.110858,12.033578,2,-0.208919
1979-05-31,14.695903,2,0.282907,12.52232,3,0.425773,12.716266,2,0.314655,9.888897,1,-0.215657,8.099501,1,-0.202497,9.232035,1,-0.470303
1979-06-30,12.093823,2,-0.001099,9.888909,2,0.117671,9.929897,1,-0.065854,7.19898,1,-0.300772,5.457298,1,-0.197458,6.634168,1,-0.254247
1979-07-31,10.290536,2,-0.061355,8.182231,2,0.120974,8.208954,2,-0.063564,5.534868,2,-0.058724,3.763353,2,0.085515,4.916423,2,0.112719


In [27]:
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [28]:
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [29]:
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [30]:
target_terciles.columns = target_terciles.columns.droplevel(1)

### target choice here (anomalies or tercile class, and region_name) 

In [31]:
if target_type == 'cat_3': 
    y = target_terciles.loc[:,region_name]
elif target_type == 'anomalies': 
    y = target_anomalies.loc[:,region_name]

In [32]:
print(f"\n\n\nNow going agead with TARGET {region_name}, {target_type} ------------------------ \n\n")




Now going agead with TARGET NNI, cat_3 ------------------------ 




### use the list of repeated index to select the target 

In [33]:
y_train = y.loc[X_index_train]

In [34]:
y_train.shape

(3526,)

In [35]:
X_index_test.shape

(300,)

In [36]:
y_test = y.reindex(X_index_test)

In [37]:
y_test.shape

(300,)

#### get the shuffled indices 

In [38]:
np.random.seed(42)

### KNN now with stratified k fold 

In [39]:
from sklearn.neighbors import KNeighborsClassifier as KNN

### simulate a model that is re-trained every month 

In [40]:
X_data_train.shape

(3526, 4929)

In [41]:
y_test.shape

(300,)

In [42]:
X_mat_train = X_data_train.copy()

In [43]:
X_mat_test = X_data_test.copy()

In [44]:
len(X_mat_test)

300

In [45]:
len(y_test)

300

In [46]:
y_mat_train = y_train.values.copy()

In [47]:
y_mat_train.shape

(3526,)

In [48]:
y_test.shape

(300,)

In [49]:
X_mat_test = X_mat_test[~np.isnan(y_test)]

In [50]:
X_mat_test.shape

(270, 4929)

In [51]:
y_test = y_test[~np.isnan(y_test)]

In [52]:
y_hat = []

In [53]:
n_neighbors = 10

from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)   
from sklearn.datasets import load_iris  
from sklearn.model_selection import train_test_split  
from sklearn.pipeline import Pipeline  
X, y = load_iris(return_X_y=True)   
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)  
nca = NeighborhoodComponentsAnalysis(random_state=42)  
knn = KNeighborsClassifier(n_neighbors=3)   nca_pipe = Pipeline([('nca', nca), ('knn', knn)])  
nca_pipe.fit(X_train, y_train)  
print(nca_pipe.score(X_test, y_test))  

### note: need to only train on unique dates ! 

In [54]:
len(y_test.index.drop_duplicates())

33

In [55]:
unique_dates = y_test.index.drop_duplicates()

In [56]:
unique_dates = unique_dates.sort_values()

In [57]:
dates_index_ytest = pd.Series(np.arange(len(y_test)), index=y_test.index)

In [58]:
y_test.dropna().shape

(270,)

In [59]:
np.isnan(y_mat_train).sum()

0

In [60]:
for date in unique_dates: 
    
    pos_index = dates_index_ytest.loc[date,].values
    
    scaler = StandardScaler()
    
    scaler.fit(X_mat_train)
    
    X_mat_train_std = scaler.transform(X_mat_train)
    
    X_mat_test_std = scaler.transform(X_mat_test[pos_index,:])
    
    knn = KNN(n_neighbors=n_neighbors, metric='minkowski', weights='distance', p=3, n_jobs=-1)
    
    knn.fit(X_mat_train_std[~np.isnan(y_mat_train),:], y_mat_train[~np.isnan(y_mat_train),])
    
    y_hat.append(knn.predict(X_mat_test_std).tolist())
    
    X_mat_train = np.r_[X_mat_train, X_mat_test[pos_index,:]]
        
    y_mat_train = np.append(y_mat_train, y_test[pos_index]) 
    
    

In [61]:
y_hat_flat  = list(itertools.chain(*y_hat))

In [62]:
y_hat_flat = np.array(y_hat_flat).astype(np.int32)

In [63]:
len(y_hat_flat)

270

In [64]:
len(y_test)

270

In [65]:
y_all = y_test.to_frame(name='y')

In [66]:
y_all.loc[:,'y_hat'] = y_hat_flat

In [67]:
y_all

Unnamed: 0_level_0,y,y_hat
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-04-30,3.0,3
2017-05-31,3.0,1
2017-06-30,3.0,3
2017-07-31,2.0,3
2017-08-31,3.0,3
...,...,...
2019-01-31,3.0,3
2019-02-28,3.0,3
2019-03-31,3.0,3
2019-11-30,3.0,3


In [68]:
y_all.index.unique()[0]

Timestamp('2017-04-30 00:00:00')

In [69]:
date =  y_all.index.unique()[0]

In [70]:
y_all.loc[date,:].mode()

Unnamed: 0,y,y_hat
0,3.0,3


In [71]:
maj_ = []
for date in y_all.index.unique(): 
    print(f"{date:%Y-%m} number of available GCMs: {len(y_all.loc[date,:])}")
    maj_.append(y_all.loc[date,:].mode())

2017-04 number of available GCMs: 8
2017-05 number of available GCMs: 7
2017-06 number of available GCMs: 7
2017-07 number of available GCMs: 7
2017-08 number of available GCMs: 7
2017-09 number of available GCMs: 7
2017-10 number of available GCMs: 7
2017-11 number of available GCMs: 7
2017-12 number of available GCMs: 8
2018-01 number of available GCMs: 8
2018-02 number of available GCMs: 9
2018-03 number of available GCMs: 9
2018-04 number of available GCMs: 9
2018-05 number of available GCMs: 9
2018-06 number of available GCMs: 9
2018-07 number of available GCMs: 9
2018-08 number of available GCMs: 9
2018-09 number of available GCMs: 9
2018-10 number of available GCMs: 9
2018-11 number of available GCMs: 9
2018-12 number of available GCMs: 9
2019-01 number of available GCMs: 9
2019-02 number of available GCMs: 10
2019-03 number of available GCMs: 10
2019-04 number of available GCMs: 7
2019-05 number of available GCMs: 7
2019-06 number of available GCMs: 7
2019-07 number of availabl

In [72]:
maj = pd.concat(maj_)

In [73]:
np.round((maj.y_hat.values == maj.y.values).sum() / len(maj), 2)

0.91

In [74]:
region_name

'NNI'