In [1]:
# Parameters

# application case and name of the predictand variable (both are used to build paths)
application = "Apple_and_Pears"
varname = "RAIN_BC"
# number of quantile categories; the target column name is derived from it
num_quantiles = 3
target_type = 'cat{}'.format(num_quantiles)

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
%matplotlib inline

In [5]:
import sys 
import pathlib

In [6]:
import matplotlib.pyplot as plt 

In [7]:
import numpy as np 
import pandas as pd 

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

In [10]:
import autogluon as ag
from autogluon import TabularPrediction as task

In [11]:
# seed NumPy's legacy global random number generator
np.random.seed(seed=42)

In [12]:
# reference directories: the user's home and the kernel's current working directory
HOME, CWD = pathlib.Path.home(), pathlib.Path.cwd()

### read the target variable 

In [13]:
# directory (under the home directory) holding the seasonal target files
# for this application / variable
ipath_target = HOME / f"research/Smart_Ideas/outputs/targets/application_cases/{application}/SEASONAL/{varname}"

In [14]:
# the first CSV column is used as the index and parsed as dates
target_file = ipath_target / f"Seasonal_{varname}_sum_anomalies_and_Q{num_quantiles}_categories.csv"
target = pd.read_csv(target_file, index_col=0, parse_dates=True)

In [15]:
# keep only the categorical target column, as a single-column DataFrame
target = target[[target_type]]

### read the SSTs

In [16]:
# Directory holding the processed ERSST files.
# The path is absolute: joining it onto HOME (as was done before) silently
# discards HOME, so it is written directly to make the real location explicit.
ipath_sst = pathlib.Path("/media/nicolasf/END19101/data/ERSST/processed")

In [17]:
# collect every netCDF file found directly in the SST directory
lfiles_sst = [*ipath_sst.glob("*.nc")]

In [18]:
# order the files by path (glob gives no ordering guarantee)
lfiles_sst = sorted(lfiles_sst)

In [19]:
# sanity check: first file of the sorted list
lfiles_sst[0]

PosixPath('/media/nicolasf/END19101/data/ERSST/processed/ERSST_seasonal_anomalies_1979-03.nc')

In [20]:
# sanity check: last file of the sorted list
lfiles_sst[-1]

PosixPath('/media/nicolasf/END19101/data/ERSST/processed/ERSST_seasonal_anomalies_2020-03.nc')

In [21]:
import xarray as xr

In [22]:
# Open all the files lazily and concatenate them along 'time', in the order of
# the (sorted) file list.
# `combine='nested'` must be given explicitly for `concat_dim` to be honoured:
# relying on the implicit combine mode is deprecated in older xarray releases
# and rejected by recent ones.
dset_sst = xr.open_mfdataset(lfiles_sst, combine='nested', concat_dim='time')

In [23]:
# display the (still lazy) multi-file dataset
dset_sst

Unnamed: 0,Array,Chunk
Bytes,31.59 MB,64.08 kB
Shape,"(493, 89, 180)","(1, 89, 180)"
Count,1972 Tasks,493 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 31.59 MB 64.08 kB Shape (493, 89, 180) (1, 89, 180) Count 1972 Tasks 493 Chunks Type float32 numpy.ndarray",180  89  493,

Unnamed: 0,Array,Chunk
Bytes,31.59 MB,64.08 kB
Shape,"(493, 89, 180)","(1, 89, 180)"
Count,1972 Tasks,493 Chunks
Type,float32,numpy.ndarray


In [24]:
# restrict the predictors to a latitude / longitude box
# NOTE(review): the `360-70` upper bound assumes longitudes run 0-360 - confirm
lat_bounds = slice(-60, 40)
lon_bounds = slice(120, 360 - 70)
dset_sst = dset_sst.sel(lat=lat_bounds, lon=lon_bounds)

In [25]:
# flatten the two spatial dimensions into a single stacked dimension 's'
dset_sst = dset_sst.stack({'s': ('lat', 'lon')})

In [26]:
# remove 'month' from the dataset; `drop_vars` replaces the deprecated,
# ambiguous `Dataset.drop`
dset_sst = dset_sst.drop_vars('month')

In [27]:
# display the dataset after subsetting and stacking
dset_sst

Unnamed: 0,Array,Chunk
Bytes,8.65 MB,17.54 kB
Shape,"(493, 4386)","(1, 4386)"
Count,2958 Tasks,493 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 8.65 MB 17.54 kB Shape (493, 4386) (1, 4386) Count 2958 Tasks 493 Chunks Type float32 numpy.ndarray",4386  493,

Unnamed: 0,Array,Chunk
Bytes,8.65 MB,17.54 kB
Shape,"(493, 4386)","(1, 4386)"
Count,2958 Tasks,493 Chunks
Type,float32,numpy.ndarray


### drop the land points 

In [28]:
# discard every stacked grid point that contains missing values
dset_sst = dset_sst.dropna(dim='s')

In [29]:
# bring the dask-backed arrays into memory; load() works in place, so the
# result does not need to be re-assigned
dset_sst.load()

In [30]:
# raw array backing the 'sst' variable
sst_data = dset_sst.sst.data

In [31]:
# sanity check: (number of time steps, number of retained grid points)
sst_data.shape

(493, 3981)

In [32]:
# one row per time step, one (integer-labelled) column per grid point
time_index = dset_sst.time.to_index()
df_sst = pd.DataFrame(data=sst_data, index=time_index)

In [33]:
# display the SST feature table (pandas truncates the rendering)
df_sst

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3971,3972,3973,3974,3975,3976,3977,3978,3979,3980
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1979-03-31,0.408074,0.393365,0.341133,0.272518,0.230934,0.213784,0.176597,0.125668,0.128945,0.103911,...,0.211047,0.039492,-0.132868,-0.375391,-0.581040,-0.658976,-0.299319,-0.297215,-0.267769,-0.302244
1979-04-30,0.322899,0.318706,0.287447,0.250925,0.246420,0.266295,0.262981,0.241939,0.268605,0.262676,...,0.066901,-0.050432,-0.151484,-0.316579,-0.452255,-0.487476,-0.701745,-0.680854,-0.619628,-0.596596
1979-05-31,0.229016,0.242395,0.237718,0.237025,0.271487,0.327504,0.354482,0.359286,0.407984,0.422589,...,0.134351,0.018488,-0.067689,-0.179603,-0.232039,-0.215237,-0.494565,-0.457495,-0.419294,-0.426657
1979-06-30,0.144429,0.168876,0.181095,0.201978,0.256738,0.329605,0.369319,0.384593,0.441541,0.463330,...,0.210085,0.048139,-0.067071,-0.187179,-0.239196,-0.226593,-0.735584,-0.626827,-0.475826,-0.346974
1979-07-31,0.116445,0.142913,0.156770,0.178012,0.230859,0.299646,0.333117,0.342831,0.397017,0.418095,...,0.282922,0.129267,0.012770,-0.084379,-0.098445,-0.077531,-0.845969,-0.696033,-0.487444,-0.335592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-30,0.001151,-0.014011,-0.008559,0.000645,0.006876,-0.007457,-0.055329,-0.103111,-0.129655,-0.146242,...,1.207734,1.081818,0.945542,0.797485,0.644805,0.556550,0.647884,0.801991,1.037703,1.139630
2019-12-31,0.123841,0.127031,0.154982,0.188440,0.216768,0.216269,0.172829,0.125530,0.107368,0.110442,...,0.870107,0.633703,0.421892,0.260812,0.157567,0.133620,0.290326,0.422558,0.668333,0.840658
2020-01-31,0.214608,0.220011,0.248689,0.281688,0.307191,0.300299,0.247288,0.190000,0.169344,0.178798,...,0.636421,0.412009,0.226782,0.120230,0.102698,0.162117,0.281559,0.414516,0.673264,0.895002
2020-02-29,0.286062,0.281482,0.291016,0.299659,0.302084,0.276531,0.209465,0.141861,0.114179,0.116492,...,0.473502,0.316649,0.184557,0.098353,0.069825,0.117501,0.758205,0.827671,0.977060,1.087330


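### shift the SST index forward by `step` months, so that each SST field is paired with the target `step` months later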

In [34]:
# number of monthly periods by which the SST index is shifted below
step = 4

In [35]:
# Shift the SST time index forward by `step` month-ends.
# The frame is rebuilt from the un-shifted dataset time axis rather than
# shifting `df_sst.index` in place: the in-place version shifted by another
# `step` months every time the cell was re-run, whereas this cell is idempotent.
df_sst = pd.DataFrame(
    sst_data,
    index=dset_sst.time.to_index().shift(periods=step, freq='M'),
)

In [36]:
# declare the target index as month-end frequency
# NOTE(review): pandas is expected to raise if the dates do not conform to 'M' - confirm
target.index.freq = 'M'

In [37]:
# sanity check: time span and frequency of the target index
target.index

DatetimeIndex(['1979-03-31', '1979-04-30', '1979-05-31', '1979-06-30',
               '1979-07-31', '1979-08-31', '1979-09-30', '1979-10-31',
               '1979-11-30', '1979-12-31',
               ...
               '2019-03-31', '2019-04-30', '2019-05-31', '2019-06-30',
               '2019-07-31', '2019-08-31', '2019-09-30', '2019-10-31',
               '2019-11-30', '2019-12-31'],
              dtype='datetime64[ns]', name='time', length=490, freq='M')

In [38]:
# place the (shifted) SST features and the target side by side, aligned on the time index
df = pd.concat((df_sst, target), axis='columns')

In [39]:
# keep only the dates for which both the features and the target are available
df = df.dropna(axis='index', how='any')

In [40]:
# training period: everything up to and including December 2015
train_data = df.loc[:'2015-12']

In [41]:
# hold-out period: January 2016 onwards
test_data = df.loc['2016-01':]

### saved models 

In [42]:
# root directory for the saved models, relative to the current working directory
saved_models = pathlib.Path('./saved_models/AUTOGLUON_v3/')

In [43]:
# experiment-specific output directory, named after the application, variable and target type
run_name = f'./autogluon_exp_SKPCA_SSTobs_1981_2010_pred_{application}_reg_{varname}_targetvar_{target_type}_target_type'
opath = saved_models / run_name

In [44]:
# Create the output directory (and any missing parents). `exist_ok=True`
# replaces the separate exists() check, which was racy (the directory could
# appear between the check and the mkdir) and silently accepted a regular
# file sitting at that path.
opath.mkdir(parents=True, exist_ok=True)

### initialise repeated stratified cross-validation

In [45]:
# Build the (train, test) positional-index pairs for 5-fold stratified
# cross-validation repeated twice (10 folds in total).
# - The pairs are materialised in a list: `.split()` returns a one-shot
#   generator, so re-running the cross-validation cell would otherwise
#   iterate over nothing without any error.
# - `random_state` is set explicitly so the folds do not depend on the state of
#   NumPy's global random generator at the time this cell is executed.
cv_features = train_data.drop(labels=[target_type], axis=1).values
cv_labels = train_data.loc[:, target_type].values
kfold = list(
    RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42).split(cv_features, cv_labels)
)

In [46]:
# passed as `n_components` to the PCA below: with a float in (0, 1), scikit-learn
# keeps as many components as needed to explain that fraction of the variance
percent_variance = 0.8

In [None]:
# Per-fold records, one entry appended for each cross-validation fold
test_indices = []   # positional indices (into train_data) of the held-out rows
y_preds = []        # predicted classes for the held-out rows
leader_board = []   # AutoGluon leaderboard DataFrame
perfs = []          # output of predictor.evaluate_predictions on the held-out rows

# The full feature matrix (grid points) and target vector do not depend on the
# fold, so they are extracted once instead of four times per fold
X_all = train_data.drop(labels=[target_type], axis=1).values
y_all = train_data.loc[:, target_type].values

for k, (train, test) in enumerate(kfold):

    print(f"ENTERING FOLD {k} ---- ")

    # saves the test indices
    test_indices.append(test)

    # training / held-out features and targets for this fold
    X_train, y_train = X_all[train], y_all[train]
    X_test, y_test = X_all[test], y_all[test]

    # -----------
    # standardize

    # the scaler is fitted on the training rows only, then applied unchanged to
    # the held-out rows, so no held-out statistics enter the transformation
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)

    # -----------------------------
    # Principal Component Analysis

    # keep as many components as needed to reach `percent_variance`; as for the
    # scaler, the PCA is fitted on the training rows only
    skpca = PCA(n_components=percent_variance)
    X_train_PC = skpca.fit_transform(X_train_std)
    X_test_PC = skpca.transform(X_test_std)

    # training DataFrame: the PCs plus the target column
    df_train = pd.DataFrame(X_train_PC)
    df_train.loc[:, target_type] = y_train

    # held-out DataFrame: the PCs plus the target column
    df_test = pd.DataFrame(X_test_PC)
    df_test.loc[:, target_type] = y_test

    # fit the task predictor on the training set DataFrame
    # NOTE(review): every fold writes to the same `opath`, so the models saved
    # by a fold are presumably overwritten by the next one - confirm this is intended
    predictor = task.fit(train_data=df_train, label=target_type, auto_stack=True, output_directory=opath)

    # predict the class for the held-out rows (target column dropped from the features)
    y_pred = predictor.predict(df_test.drop(labels=[target_type], axis=1))

    # records the predicted classes (not probabilities) for the held-out rows
    y_preds.append(y_pred)

    # get and record the leaderboard DataFrame
    d = predictor.leaderboard(silent=True)
    leader_board.append(d)

    # records the evaluation of the predictions against the held-out targets
    perfs.append(predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True, silent=True))

    print(f"EXITING FOLD {k} ---- ")

Beginning AutoGluon training ...
AutoGluon will save models to saved_models/AUTOGLUON_v3/autogluon_exp_SKPCA_SSTobs_1981_2010_pred_Apple_and_Pears_reg_RAIN_BC_targetvar_cat3_target_type/
AutoGluon Version:  0.0.12
Train Data Rows:    350
Train Data Columns: 15
Preprocessing data ...
Here are the 3 unique label values in your data:  [1.0, 3.0, 2.0]
AutoGluon infers your prediction problem is: multiclass  (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])



ENTERING FOLD 0 ---- 


Train Data Class Count: 3
Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5543	 = Validation accuracy score
	1.8s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.5543	 = Validation accuracy score
	1.79s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5857	 = Validation accuracy score
	1.4s	 = Training runt

EXITING FOLD 0 ---- 
ENTERING FOLD 1 ---- 


Beginning AutoGluon training ...
AutoGluon will save models to saved_models/AUTOGLUON_v3/autogluon_exp_SKPCA_SSTobs_1981_2010_pred_Apple_and_Pears_reg_RAIN_BC_targetvar_cat3_target_type/
AutoGluon Version:  0.0.12
Train Data Rows:    350
Train Data Columns: 15
Preprocessing data ...
Here are the 3 unique label values in your data:  [1.0, 2.0, 3.0]
AutoGluon infers your prediction problem is: multiclass  (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Train Data Class Count: 3
Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features:

EXITING FOLD 1 ---- 
ENTERING FOLD 2 ---- 


Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5743	 = Validation accuracy score
	1.75s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.5686	 = Validation accuracy score
	1.75s	 = Training runtime
	0.18s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5829	 = Validation accuracy score
	1.35s	 = Training runtime
	0.16s	 = Validation

EXITING FOLD 2 ---- 
ENTERING FOLD 3 ---- 


Feature Generator processed 351 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5556	 = Validation accuracy score
	1.76s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.5613	 = Validation accuracy score
	1.77s	 = Training runtime
	0.16s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5613	 = Validation accuracy score
	1.36s	 = Training runtime
	0.16s	 = Validation

EXITING FOLD 3 ---- 
ENTERING FOLD 4 ---- 


Feature Generator processed 351 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5584	 = Validation accuracy score
	1.82s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.547	 = Validation accuracy score
	1.75s	 = Training runtime
	0.16s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5413	 = Validation accuracy score
	1.35s	 = Training runtime
	0.17s	 = Validation 

EXITING FOLD 4 ---- 
ENTERING FOLD 5 ---- 


Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5914	 = Validation accuracy score
	1.75s	 = Training runtime
	0.18s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.58	 = Validation accuracy score
	1.78s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5743	 = Validation accuracy score
	1.33s	 = Training runtime
	0.17s	 = Validation ru

EXITING FOLD 5 ---- 
ENTERING FOLD 6 ---- 


Train Data Class Count: 3
Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini_STACKER_l0 ...
	0.5514	 = Validation accuracy score
	1.83s	 = Training runtime
	0.17s	 = Validation runtime
Fitting model: RandomForestClassifierEntr_STACKER_l0 ...
	0.56	 = Validation accuracy score
	1.82s	 = Training runtime
	0.16s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini_STACKER_l0 ...
	0.5629	 = Validation accuracy score
	1.38s	 = Training runt

EXITING FOLD 6 ---- 
ENTERING FOLD 7 ---- 


Here are the 3 unique label values in your data:  [1.0, 3.0, 2.0]
AutoGluon infers your prediction problem is: multiclass  (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Train Data Class Count: 3
Feature Generator processed 350 data points with 14 features
Original Features (raw dtypes):
	float32 features: 14
Original Features (inferred dtypes):
	float features: 14
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 14
Final Features:
	float features: 14
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model

In [None]:
!pwd