### use AUTOGLUON with PC (principal component) predictors derived from the regional domain 

In [None]:
# Parameters 
# Run configuration for the notebook; the values below are read by later cells.

# data provider and model name, passed to get_GCM_outputs and used in the output path
provider = 'CDS'
GCM = 'ECMWF'
# predictor variable: 't2m' or 'precip' (the two cases handled by the loading cell)
var_X = 't2m'
# target variable; used as a sub-directory and in the file name of the target CSVs
target_var = 'TMEAN'
# target type: 'cat_3' (tercile class) or 'anomalies'
target_type = 'cat_3'
# target region, one of 'NNI', 'WNI', 'ENI', 'NSI', 'WSI', 'ESI'
region_name = 'ESI'
# key into domain_def selecting the predictor domain
domain = 'ext_regional'
# NOTE(review): the name `skpca` is rebound to the PCA / kernel PCA estimator further down,
# so this flag is never read in the visible notebook — confirm whether it is still needed
skpca = True 
# if True use the RBF kernel PCA imported as KPCA, otherwise scikit-learn's PCA
kernel_PCA = False

### load external modules 

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
from shutil import copytree, rmtree

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
# current user's home directory; base of the target-data path built in the TARGETS section
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
from sklearn.decomposition import PCA, KernelPCA
from mlxtend.feature_extraction import RBFKernelPCA as KPCA

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
import autogluon as ag
from autogluon import TabularPrediction as task

  Optimizer.opt_registry[name].__name__))


### load local modules 

In [13]:
sys.path.append('../../../../ml4seas/')

In [14]:
from utils import set_root_dir
from GCM import get_GCM_outputs, shift_dset_time

### set the root path for the `data` folder 

In [15]:
# root path of the data folder, resolved by the local helper set_root_dir and passed to get_GCM_outputs
# NOTE(review): the meaning of root='gdata' is defined in utils.set_root_dir — not visible here
rpath = set_root_dir(root='gdata')

### define the geographical domain here 

In [16]:
# Extent of each candidate predictor domain, keyed by the name used in the `domain` parameter.
# NOTE(review): values look like [lon_min, lon_max, lat_min, lat_max] — confirm against get_GCM_outputs
domain_def = {
    'local': [150, 200, -50, -10],
    'regional': [90, 300, -65, 50],
    'ext_regional': [70, 300, -70, 60],
    # alternative 'ext_regional' extent kept for reference: [50, 300, -75, 60]
    'global': [0, 360, -70, 70],
    'tropics': [0, 360, -40, 40],
}

### get the HINDCASTS

In [17]:
# Load the hindcasts and the forecasts for the chosen predictor variable.
# The two supported predictors differ only in the variable name handed to
# get_GCM_outputs, so the name is looked up once instead of duplicating both calls.
gcm_var_names = {'t2m': 'T2M', 'precip': 'PRECIP'}

if var_X not in gcm_var_names:
    # fail here with a clear message rather than with a NameError on
    # dset_hindcasts / dset_forecasts in the following cells
    raise ValueError(f"unsupported var_X {var_X!r}, expected one of {sorted(gcm_var_names)}")

# keyword arguments shared by the hindcast and forecast calls
gcm_kwargs = dict(provider=provider, GCM=GCM, var_name=gcm_var_names[var_X], rpath=rpath,
                  domain=domain_def[domain], step=3, flatten=True, ensmean=True)

# get the hindcasts
dset_hindcasts, coords_hindcasts = get_GCM_outputs(period='hindcasts', **gcm_kwargs)

# get the forecasts
dset_forecasts, coords_forecasts = get_GCM_outputs(period='forecasts', **gcm_kwargs)
    

first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc
first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc


### delete valid time if present 

In [18]:
# strip the 'valid_time' coordinate from the hindcasts when present, otherwise leave them untouched
dset_hindcasts = dset_hindcasts.drop('valid_time') if 'valid_time' in dset_hindcasts.coords else dset_hindcasts

In [19]:
# strip the 'valid_time' coordinate from the forecasts when present, otherwise leave them untouched
dset_forecasts = dset_forecasts.drop('valid_time') if 'valid_time' in dset_forecasts.coords else dset_forecasts

### now shift the time index so that the time corresponds to the time of the forecast, not the initialisation time 

### also shift to the end of the month, to correspond to the convention used in the target time-series 

In [20]:
# NOTE(review): not referenced by any visible cell — the loading cell passes the literal step=3
# and shift_dset_time is called with its defaults; presumably the lead time, TODO confirm
step = 3

In [21]:
# re-index the hindcast time axis with the local helper shift_dset_time (called with its defaults)
dset_hindcasts = shift_dset_time(dset_hindcasts)

In [22]:
# same time shift applied to the forecasts
dset_forecasts = shift_dset_time(dset_forecasts)

### concatenate the training (hindcast) and test (forecast) data in one dataset 

In [23]:
# stack hindcasts then forecasts along 'time'; the train / test split is redone by date further down
dset = xr.concat([dset_hindcasts, dset_forecasts], dim='time')

## SELECT THE FEATURES (X) HERE 

In [24]:
var_X

't2m'

In [25]:
# select the predictor field named by var_X from the concatenated dataset
X = dset[var_X]

### get the time index 

In [26]:
# pandas index built from the time coordinate; used as the row index of X_df
time = X.time.to_index()

In [27]:
# underlying array of X (2-D: the shape displayed in the next cell is (324, 4929))
X_data = X.data

In [28]:
X_data.shape

(324, 4929)

In [29]:
# one row per time step, one integer-labelled column per feature (flattened grid, see flatten=True above)
X_df = pd.DataFrame(X_data, index=time)

In [30]:
X_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4919,4920,4921,4922,4923,4924,4925,4926,4927,4928
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-30,-0.5173,-0.399448,-0.249364,-0.185059,-0.182984,-0.20124,-0.213211,-0.169383,-0.127029,-0.070253,...,-0.871117,-0.857549,-0.929712,-0.921326,-0.923134,-0.921653,-0.788212,-0.832896,-0.759973,-1.476434
1993-05-31,-0.597781,-0.561207,-0.202372,-0.178136,-0.143629,-0.127254,-0.124171,-0.106091,-0.120592,-0.128927,...,-0.506239,-0.473424,-0.498079,-0.494656,-0.499675,-0.425676,-0.283974,-0.264218,-0.238144,-0.508478
1993-06-30,-0.520627,-0.545065,-0.34955,-0.291018,-0.192664,-0.157433,-0.151819,-0.108799,-0.132211,-0.158645,...,-0.670763,-0.764584,-0.991722,-1.08505,-1.047474,-0.842509,-0.708136,-0.910243,-0.444421,-0.222354
1993-07-31,-0.786031,-0.62355,-0.54254,-0.576415,-0.621286,-0.655927,-0.685573,-0.690092,-0.710755,-0.730662,...,-0.46775,-0.613002,-0.758718,-0.811393,-0.687868,-0.521426,-0.392894,-0.421498,-0.26317,0.035204
1993-08-31,0.455569,0.449003,0.428328,0.319741,0.291156,0.287573,0.276183,0.247528,0.199566,0.128505,...,-0.830166,-0.881459,-0.604143,-0.645533,-0.688021,-1.069209,-1.058966,-1.065983,-0.755628,-0.439965


In [31]:
X_df.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4919,4920,4921,4922,4923,4924,4925,4926,4927,4928
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-30,1.126797,0.94542,0.94381,0.852602,0.881548,0.95002,0.984079,0.991816,1.001341,1.00344,...,0.423549,0.523386,0.501374,0.59299,0.677869,0.603356,0.539171,0.624312,0.657935,0.719989
2019-12-31,0.562338,0.543603,0.684447,0.701487,0.740093,0.766206,0.770201,0.770737,0.80299,0.804869,...,1.524933,1.514174,1.628201,1.942694,2.07466,1.859437,1.939649,1.975425,1.896868,1.541503
2020-01-31,0.186597,0.226923,0.169335,0.201899,0.245831,0.300053,0.33834,0.354314,0.369161,0.372428,...,2.531335,2.458195,2.285583,2.217966,2.248682,2.081639,2.485571,2.217479,2.203683,1.698307
2020-02-29,0.610172,0.909672,0.637504,0.650917,0.656561,0.650874,0.651702,0.617513,0.598879,0.572038,...,0.536517,0.531743,0.336273,0.168436,0.140415,0.402606,0.907908,0.635701,0.920844,1.08174
2020-03-31,0.517694,0.69266,0.556423,0.550429,0.517887,0.45933,0.41769,0.369154,0.335975,0.299681,...,0.695286,0.658589,0.660272,0.723318,0.698862,0.827824,0.864633,0.676458,0.747384,1.747413


### TARGETS 

In [32]:
# directory holding the regional target time-series, one sub-directory per target variable and region
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [33]:
# Read the target CSV of every region and label its columns with the region name,
# so the per-region frames can be concatenated side by side afterwards.
targets = []
for reg in ('NNI', 'WNI', 'ENI', 'NSI', 'WSI', 'ESI'):
    target = pd.read_csv(
        dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv',
        index_col=0,
        parse_dates=True,
    )
    # two-level columns: (region, original column name)
    target.columns = pd.MultiIndex.from_product([[reg], target.columns])
    targets.append(target)

In [34]:
# join the per-region frames column-wise, aligned on the date index
# NOTE(review): rebinds `targets` from a list to a DataFrame, so this cell cannot be re-run on its own
targets = pd.concat(targets, axis=1)

In [35]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.278555,3,0.462528,16.051472,3,0.317965,16.732249,3,0.62154,13.811438,2,-0.093327,11.848419,1,-0.400334,13.728706,2,-0.222255
1979-04-30,16.794408,2,0.227319,14.586906,3,0.248038,14.953599,3,0.299057,12.18945,1,-0.249176,10.58958,2,-0.110858,12.033578,2,-0.208919
1979-05-31,14.695903,2,0.282907,12.52232,3,0.425773,12.716266,2,0.314655,9.888897,1,-0.215657,8.099501,1,-0.202497,9.232035,1,-0.470303
1979-06-30,12.093823,2,-0.001099,9.888909,2,0.117671,9.929897,1,-0.065854,7.19898,1,-0.300772,5.457298,1,-0.197458,6.634168,1,-0.254247
1979-07-31,10.290536,2,-0.061355,8.182231,2,0.120974,8.208954,2,-0.063564,5.534868,2,-0.058724,3.763353,2,0.085515,4.916423,2,0.112719


In [36]:
# keep, for every region, only the 'anomalies' column (second level of the column MultiIndex)
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [37]:
# keep, for every region, only the 'cat_3' (tercile class) column
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [38]:
# drop the second column level so that columns are just the region names
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [39]:
# drop the second column level so that columns are just the region names
target_terciles.columns = target_terciles.columns.droplevel(1)

### target choice here (anomalies or tercile class, and region_name) 

In [40]:
# Select the target series (y) for the chosen region, either tercile class or anomalies.
if target_type == 'cat_3': 
    y = target_terciles.loc[:,region_name]
elif target_type == 'anomalies': 
    # the anomalies frame is named `targets_anomalies` (plural) where it is created;
    # the previous `target_anomalies` raised a NameError on this branch
    y = targets_anomalies.loc[:,region_name]
else:
    # without this, `y` stays undefined and the merge further down fails with a NameError
    raise ValueError(f"unsupported target_type {target_type!r}, expected 'cat_3' or 'anomalies'")

In [41]:
print(f"\n\n\nNow going agead with TARGET {region_name}, {target_type} ------------------------ \n\n")




Now going agead with TARGET ESI, cat_3 ------------------------ 




In [42]:
X_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4919,4920,4921,4922,4923,4924,4925,4926,4927,4928
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-30,-0.5173,-0.399448,-0.249364,-0.185059,-0.182984,-0.20124,-0.213211,-0.169383,-0.127029,-0.070253,...,-0.871117,-0.857549,-0.929712,-0.921326,-0.923134,-0.921653,-0.788212,-0.832896,-0.759973,-1.476434
1993-05-31,-0.597781,-0.561207,-0.202372,-0.178136,-0.143629,-0.127254,-0.124171,-0.106091,-0.120592,-0.128927,...,-0.506239,-0.473424,-0.498079,-0.494656,-0.499675,-0.425676,-0.283974,-0.264218,-0.238144,-0.508478
1993-06-30,-0.520627,-0.545065,-0.34955,-0.291018,-0.192664,-0.157433,-0.151819,-0.108799,-0.132211,-0.158645,...,-0.670763,-0.764584,-0.991722,-1.08505,-1.047474,-0.842509,-0.708136,-0.910243,-0.444421,-0.222354
1993-07-31,-0.786031,-0.62355,-0.54254,-0.576415,-0.621286,-0.655927,-0.685573,-0.690092,-0.710755,-0.730662,...,-0.46775,-0.613002,-0.758718,-0.811393,-0.687868,-0.521426,-0.392894,-0.421498,-0.26317,0.035204
1993-08-31,0.455569,0.449003,0.428328,0.319741,0.291156,0.287573,0.276183,0.247528,0.199566,0.128505,...,-0.830166,-0.881459,-0.604143,-0.645533,-0.688021,-1.069209,-1.058966,-1.065983,-0.755628,-0.439965


In [43]:
region_name

'ESI'

In [44]:
# append the target as a last column named after the region, joining on the date index;
# merge defaults to an inner join, so only dates present in both the features and the target are kept
# NOTE(review): overwrites X_df, so re-running this cell merges the target a second time
X_df = X_df.merge(y, left_index=True, right_index=True)

In [45]:
# discard any date with a missing value in the features or in the target
X_df = X_df.dropna(axis=0)

In [46]:
X_df.columns

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
        4920,  4921,  4922,  4923,  4924,  4925,  4926,  4927,  4928, 'ESI'],
      dtype='object', length=4930)

In [47]:
X_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4920,4921,4922,4923,4924,4925,4926,4927,4928,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-30,-0.5173,-0.399448,-0.249364,-0.185059,-0.182984,-0.20124,-0.213211,-0.169383,-0.127029,-0.070253,...,-0.857549,-0.929712,-0.921326,-0.923134,-0.921653,-0.788212,-0.832896,-0.759973,-1.476434,1
1993-05-31,-0.597781,-0.561207,-0.202372,-0.178136,-0.143629,-0.127254,-0.124171,-0.106091,-0.120592,-0.128927,...,-0.473424,-0.498079,-0.494656,-0.499675,-0.425676,-0.283974,-0.264218,-0.238144,-0.508478,1
1993-06-30,-0.520627,-0.545065,-0.34955,-0.291018,-0.192664,-0.157433,-0.151819,-0.108799,-0.132211,-0.158645,...,-0.764584,-0.991722,-1.08505,-1.047474,-0.842509,-0.708136,-0.910243,-0.444421,-0.222354,2
1993-07-31,-0.786031,-0.62355,-0.54254,-0.576415,-0.621286,-0.655927,-0.685573,-0.690092,-0.710755,-0.730662,...,-0.613002,-0.758718,-0.811393,-0.687868,-0.521426,-0.392894,-0.421498,-0.26317,0.035204,3
1993-08-31,0.455569,0.449003,0.428328,0.319741,0.291156,0.287573,0.276183,0.247528,0.199566,0.128505,...,-0.881459,-0.604143,-0.645533,-0.688021,-1.069209,-1.058966,-1.065983,-0.755628,-0.439965,3


In [48]:
X_df.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4920,4921,4922,4923,4924,4925,4926,4927,4928,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-08-31,0.945966,0.934315,0.793528,0.802044,0.818481,0.857217,0.843843,0.834872,0.823083,0.80811,...,-0.160997,-0.163018,0.380902,0.543976,0.60082,0.479194,0.804666,0.347579,-0.681886,3
2019-09-30,0.756698,0.731192,0.662941,0.674524,0.657537,0.669284,0.664474,0.652051,0.648556,0.598272,...,0.154891,-0.176855,-0.080172,0.042225,0.254937,0.779242,0.617168,0.644395,0.159071,3
2019-10-31,1.742619,1.996949,1.637812,1.579129,1.611831,1.696773,1.753081,1.780183,1.807559,1.826306,...,0.48733,0.486621,0.427051,0.527889,0.57312,0.634355,0.620671,0.691358,0.599412,1
2019-11-30,1.126797,0.94542,0.94381,0.852602,0.881548,0.95002,0.984079,0.991816,1.001341,1.00344,...,0.523386,0.501374,0.59299,0.677869,0.603356,0.539171,0.624312,0.657935,0.719989,3
2019-12-31,0.562338,0.543603,0.684447,0.701487,0.740093,0.766206,0.770201,0.770737,0.80299,0.804869,...,1.514174,1.628201,1.942694,2.07466,1.859437,1.939649,1.975425,1.896868,1.541503,3


### select the training and test sets 

In [49]:
# training set: every row dated up to the end of 2016 (label-based slice, end year included)
X_df_train = X_df.loc[:'2016',:]

In [50]:
# test set: every row dated from the start of 2017 onwards
X_df_test = X_df.loc['2017':,:]

In [51]:
X_df_train.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4920,4921,4922,4923,4924,4925,4926,4927,4928,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-08-31,-0.400886,-0.194855,-0.170755,-0.020751,0.055898,0.112672,0.138634,0.129549,0.108267,0.074436,...,-0.168795,-0.510476,-0.145394,-0.06374,0.074233,-0.146568,-0.292147,0.038938,0.103167,3
2016-09-30,-0.404111,-0.585235,-0.339035,-0.27794,-0.265629,-0.250613,-0.207535,-0.147685,-0.080453,-0.036055,...,-0.450894,-0.204202,0.052091,0.100562,-0.068979,-0.123225,0.049135,-0.113715,0.190757,3
2016-10-31,0.377656,0.456079,0.409681,0.465798,0.52149,0.593665,0.651101,0.670112,0.664056,0.649585,...,0.555458,0.756791,0.642701,0.580304,0.620254,0.448266,0.499852,0.277036,0.568906,2
2016-11-30,-0.010468,0.01188,0.211807,0.292387,0.405732,0.547811,0.684374,0.813348,0.905063,0.999274,...,0.43705,0.447105,0.348345,0.398957,0.418101,0.327886,0.317032,0.31223,0.559025,3
2016-12-31,0.349731,0.408009,0.305282,0.291499,0.280654,0.256358,0.22627,0.205999,0.200144,0.213224,...,1.047905,1.050957,1.085172,1.167425,1.114036,1.161851,1.175292,1.069569,0.952905,2


In [52]:
X_df_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4920,4921,4922,4923,4924,4925,4926,4927,4928,ESI
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-31,0.166416,0.143415,0.21535,0.195776,0.239769,0.282853,0.324949,0.377748,0.425276,0.488059,...,2.127318,2.024101,2.034097,2.136661,1.949072,2.170647,2.079376,2.064971,1.563915,2
2017-02-28,0.484199,0.398931,0.400641,0.436194,0.470389,0.53172,0.572859,0.596169,0.623811,0.643028,...,1.344301,1.2702,1.172605,1.148846,0.937528,0.887998,1.132981,0.89379,0.664127,1
2017-03-31,0.263492,0.168555,0.157626,0.237386,0.320889,0.38105,0.414451,0.435301,0.441771,0.398879,...,0.763717,0.70945,0.540674,0.330399,0.273944,0.270933,0.159492,0.017497,-0.065061,1
2017-04-30,0.350412,0.609415,0.402756,0.494441,0.545367,0.577342,0.595652,0.551609,0.509443,0.442834,...,-0.187773,-0.346976,-0.421439,-0.445218,-0.466354,-0.518676,-0.528607,-0.416464,-0.623176,2
2017-05-31,0.167277,0.057477,-0.028681,0.093749,0.113651,0.133755,0.125145,0.153855,0.207044,0.253551,...,0.641738,0.679107,0.567584,0.559127,0.376234,0.25422,0.37983,0.270595,0.05505,2


### get the underlying numpy arrays for training and test sets

In [53]:
# Split both frames into a feature matrix (every column but the target) and a
# target vector (the column named after the region), as plain numpy arrays.

# training set: features, then target
X_train = X_df_train.drop(columns=[region_name]).values
y_train = X_df_train[region_name].values

# test set: features, then target
X_test = X_df_test.drop(columns=[region_name]).values
y_test = X_df_test[region_name].values

### standardize 

In [54]:
# per-feature standardization; the output below shows with_mean=True, with_std=True
scaler = StandardScaler() 

### fit on the training set 

In [55]:
# learn the scaling statistics from the training set only, so no test information leaks into the scaler
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

### transform the training and then the test set 

In [56]:
# standardized training features
X_train_std = scaler.transform(X_train)

In [57]:
# test features scaled with the statistics learned on the training set
X_test_std = scaler.transform(X_test)

### Principal Component Analysis 

In [58]:
# passed as n_components to PCA below: as a fraction it asks for enough components to explain 90% of the variance
percent_variance = 0.9

#### choose the decomposition: kernel PCA (mlxtend implementation) or standard PCA (scikit-learn)

In [59]:
# instantiate the decomposition: RBF kernel PCA when kernel_PCA is set, otherwise a
# standard PCA sized by percent_variance
skpca = KPCA(copy_X=True) if kernel_PCA else PCA(n_components=percent_variance)

### get the PCs on the training set 

In [60]:
# learn the decomposition from the standardized training set only
skpca = skpca.fit(X_train_std)

In [61]:
# principal components of the training set (shape displayed in the next cell: (285, 33))
X_train_PC = skpca.transform(X_train_std)

In [62]:
X_train_PC.shape

(285, 33)

### get the PCs on the test set by applying the learned transformation on the test set 

In [63]:
# project the standardized test set onto the components learned from the training set
X_test_PC = skpca.transform(X_test_std)

In [64]:
# test-set PCs indexed by date
# NOTE(review): `tmp` is not used by any later visible cell — candidate for removal
tmp = pd.DataFrame(X_test_PC, index=X_df_test.index)

### Now put back together the PCs and the target time-series in Dataframes, for AUTOGLUON 

In [65]:
# Build the frames handed to AutoGluon: the PCs as integer-labelled feature columns,
# followed by the target in a last column named after the region.

# training frame: training-set PCs + training targets
df_train = pd.DataFrame(X_train_PC).assign(**{region_name: y_train})

# test frame: test-set PCs + test targets
df_test = pd.DataFrame(X_test_PC).assign(**{region_name: y_test})

### output path 

In [66]:
# directory (relative to the notebook) where AutoGluon saves its models, named after the run parameters
# NOTE(review): the 'SKPCA_noK' part is hard-coded, so a run with kernel_PCA=True writes to the same directory
opath = pathlib.Path(f'./autogluon_exp_SKPCA_noK_{provider}_{GCM}_{var_X}pred_{domain}_pred_domain_{region_name}_reg_{target_var}_targetvar_{target_type}_target_type')

### train the predictor

In [67]:
# fit AutoGluon on the training frame with the region column as label; the problem type is
# inferred from the label (the log below reports multiclass for the cat_3 target)
# NOTE(review): no random seed is set anywhere visible, so scores may vary between runs
predictor = task.fit(train_data=df_train, label=region_name, auto_stack=True, output_directory=opath)

Beginning AutoGluon training ...
AutoGluon will save models to autogluon_exp_SKPCA_noK_CDS_ECMWF_t2mpred_ext_regional_pred_domain_ESI_reg_TMEAN_targetvar_cat_3_target_type/
Train Data Rows:    285
Train Data Columns: 34
Preprocessing data ...
Here are the first 10 unique label values in your data:  [1 2 3]
AutoGluon infers your prediction problem is: multiclass  (because dtype of label-column == int, but few unique label-values observed)
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Feature Generator processed 285 data points with 33 features
Original Features:
	float features: 33
Generated Features:
	int features: 0
All Features:
	float features: 33
	int features: 0
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will

In [68]:
# predict on the test features only (label column removed)
y_pred = predictor.predict(df_test.drop(labels=[region_name],axis=1))

In [69]:
predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True, silent=True)

OrderedDict([('accuracy', 0.6111111111111112),
             ('accuracy_score', 0.6111111111111112),
             ('balanced_accuracy_score', 0.4505050505050505),
             ('matthews_corrcoef', 0.19968327476189798),
             ('<lambda>',
              {'1': {'precision': 0.3333333333333333,
                'recall': 0.2,
                'f1-score': 0.25,
                'support': 5},
               '2': {'precision': 0.6,
                'recall': 0.3333333333333333,
                'f1-score': 0.42857142857142855,
                'support': 9},
               '3': {'precision': 0.6428571428571429,
                'recall': 0.8181818181818182,
                'f1-score': 0.7200000000000001,
                'support': 22},
               'accuracy': 0.6111111111111112,
               'macro avg': {'precision': 0.5253968253968254,
                'recall': 0.4505050505050505,
                'f1-score': 0.4661904761904762,
                'support': 36},
               'weighted 

In [70]:
# per-model leaderboard (validation score, fit / predict times, stack level), displayed in the next cell
d = predictor.leaderboard(silent=True)

In [71]:
d

Unnamed: 0,model,score_val,fit_time,pred_time_val,stack_level
10,weighted_ensemble_k0_l1,0.582456,0.403073,0.00104,1
8,NeuralNetClassifier_STACKER_l0,0.554386,10.648514,0.474649,0
7,CatboostClassifier_STACKER_l0,0.547368,4.381124,0.02989,0
9,LightGBMClassifierCustom_STACKER_l0,0.522807,5.289958,0.027971,0
2,ExtraTreesClassifierGini_STACKER_l0,0.519298,2.140531,0.54669,0
1,RandomForestClassifierEntr_STACKER_l0,0.515789,2.700881,0.731179,0
3,ExtraTreesClassifierEntr_STACKER_l0,0.515789,2.162441,0.698593,0
6,LightGBMClassifier_STACKER_l0,0.505263,1.380906,0.024874,0
0,RandomForestClassifierGini_STACKER_l0,0.501754,2.617137,0.966664,0
5,KNeighborsClassifierDist_STACKER_l0,0.501754,0.201173,0.586718,0


In [72]:
from evaluation import calc_accuracy_sco

In [73]:
# predicted probabilities on the test features (label column removed)
y_pred_probs = predictor.predict_proba(df_test.drop(labels=[region_name],axis=1))

In [74]:
# convert the probabilities to percentages
# NOTE(review): rescales in place of the previous value, so re-running this cell multiplies by 100 again
y_pred_probs = y_pred_probs * 100.

In [75]:
# column 0: observed class; remaining columns: predicted probabilities (in %), indexed by test date
df = pd.DataFrame(np.c_[y_test, y_pred_probs], index=X_df_test.index) 

In [76]:
# Column 0 holds the observed class; np.c_ stored it as float next to the
# probabilities, so cast it back to integer.
# - the builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20, removed in 1.24)
# - assigning through df[0] replaces the column, so the integer dtype is kept
#   (recent pandas writes `.loc[:, 0] = ...` into the existing float column instead)
df[0] = df[0].astype(int)

In [77]:
df.head()

Unnamed: 0_level_0,0,1,2,3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,2,16.465469,38.513417,45.021114
2017-02-28,1,33.845301,33.03665,33.118049
2017-03-31,1,22.320578,30.257379,47.422043
2017-04-30,2,15.921915,22.51404,61.564046
2017-05-31,2,15.142522,26.247085,58.610393


In [78]:
# option forwarded to calc_accuracy_sco below
# NOTE(review): its exact effect is defined in the local `evaluation` module — not visible here
tolerance = True

In [79]:
# accuracy score computed by the local helper from the observed classes and predicted probabilities in df
sco_acc = calc_accuracy_sco(df, tolerance=tolerance)

In [80]:
sco_acc

0.6666666666666666

In [81]:
d

Unnamed: 0,model,score_val,fit_time,pred_time_val,stack_level
10,weighted_ensemble_k0_l1,0.582456,0.403073,0.00104,1
8,NeuralNetClassifier_STACKER_l0,0.554386,10.648514,0.474649,0
7,CatboostClassifier_STACKER_l0,0.547368,4.381124,0.02989,0
9,LightGBMClassifierCustom_STACKER_l0,0.522807,5.289958,0.027971,0
2,ExtraTreesClassifierGini_STACKER_l0,0.519298,2.140531,0.54669,0
1,RandomForestClassifierEntr_STACKER_l0,0.515789,2.700881,0.731179,0
3,ExtraTreesClassifierEntr_STACKER_l0,0.515789,2.162441,0.698593,0
6,LightGBMClassifier_STACKER_l0,0.505263,1.380906,0.024874,0
0,RandomForestClassifierGini_STACKER_l0,0.501754,2.617137,0.966664,0
5,KNeighborsClassifierDist_STACKER_l0,0.501754,0.201173,0.586718,0


In [82]:
region_name

'ESI'