In [1]:
# Parameters 

provider = 'CDS'
GCM = 'ECMWF'
var_X = 'precip'
target_var = 'RAIN'
target_type = 'cat_3'
region_name = 'NNI'

### load external modules 

In [2]:
%matplotlib inline

In [3]:
import os
import sys 
import pathlib
from shutil import copytree

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [5]:
# Home directory of the current user; used further down as the base of the target-data path.
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import proplot as plot

In [8]:
import numpy as np
import pandas as pd

In [9]:
import xarray as xr

In [10]:
import autogluon as ag
from autogluon import TabularPrediction as task

  Optimizer.opt_registry[name].__name__))


### load local modules 

In [11]:
# Put the project's `ml4seas` folder on the import path so its helper modules can be imported.
# The path is relative, so it depends on the working directory the notebook is run from.
sys.path.append('../../../../ml4seas/')

In [12]:
from set_root_dir import set_root_dir
from get_GCM_outputs import get_GCM_outputs
from shift_dset_time import shift_dset_time 

### set the root path for the `data` folder 

In [13]:
# Resolve the root of the `data` folder through the project helper.
# NOTE(review): 'gdata' presumably selects the GDATA drive seen in the file listings below — confirm in set_root_dir.
rpath = set_root_dir(root='gdata')

### get the HINDCASTS

#### temperatures 

In [14]:
# T2M hindcasts for the configured provider / GCM.
# The helper returns the dataset plus a coordinates object, kept for inspection.
dset_t2m_ecmwf_hindcasts, coords_hindcasts = get_GCM_outputs(
    provider=provider,
    GCM=GCM,
    var_name='T2M',
    period='hindcasts',
    rpath=rpath,
    domain=[90, 300, -65, 50],
    step=3,
    flatten=True,
    ensmean=True,
)

first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2016_12.nc


In [15]:
coords_hindcasts.dims

Frozen(SortedKeysDict({'lat': 47, 'lon': 85, 'time': 288}))

#### precipitation 

In [16]:
# PRECIP hindcasts for the configured provider / GCM.
# `coords_hindcasts` is overwritten with the coordinates of this second call.
dset_precip_ecmwf_hindcasts, coords_hindcasts = get_GCM_outputs(
    provider=provider,
    GCM=GCM,
    var_name='PRECIP',
    period='hindcasts',
    rpath=rpath,
    domain=[90, 300, -65, 50],
    step=3,
    flatten=True,
    ensmean=True,
)

first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_1993_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/hindcasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2016_12.nc


In [17]:
coords_hindcasts.dims

Frozen(SortedKeysDict({'lat': 47, 'lon': 85, 'time': 288}))

### delete valid time if present 

In [18]:
# Remove the `valid_time` coordinate from the precipitation hindcasts, but only when it is present.
# NOTE(review): newer xarray releases recommend `drop_vars` over `drop` — consider switching once the xarray version is confirmed.
if 'valid_time' in dset_precip_ecmwf_hindcasts.coords: 
    dset_precip_ecmwf_hindcasts = dset_precip_ecmwf_hindcasts.drop('valid_time')

In [19]:
# Remove the `valid_time` coordinate from the temperature hindcasts, but only when it is present.
# NOTE(review): newer xarray releases recommend `drop_vars` over `drop` — consider switching once the xarray version is confirmed.
if 'valid_time' in dset_t2m_ecmwf_hindcasts.coords: 
    dset_t2m_ecmwf_hindcasts = dset_t2m_ecmwf_hindcasts.drop('valid_time')

### Now get the forecasts datasets (test period)

#### temperature

In [20]:
# T2M forecasts (test period) for the configured provider / GCM,
# requested with the same domain and options as the hindcasts.
dset_t2m_ecmwf_forecasts, coords_forecasts = get_GCM_outputs(
    provider=provider,
    GCM=GCM,
    var_name='T2M',
    period='forecasts',
    rpath=rpath,
    domain=[90, 300, -65, 50],
    step=3,
    flatten=True,
    ensmean=True,
)

first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2017_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/T2M/ECMWF_T2M_seasonal_anomalies_interp_2019_12.nc


In [21]:
coords_forecasts.dims

Frozen(SortedKeysDict({'lat': 47, 'lon': 85, 'time': 36}))

#### precipitation

In [22]:
# PRECIP forecasts (test period) for the configured provider / GCM.
# `coords_forecasts` is overwritten with the coordinates of this second call.
dset_precip_ecmwf_forecasts, coords_forecasts = get_GCM_outputs(
    provider=provider,
    GCM=GCM,
    var_name='PRECIP',
    period='forecasts',
    rpath=rpath,
    domain=[90, 300, -65, 50],
    step=3,
    flatten=True,
    ensmean=True,
)

first file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2017_01.nc
last file is /media/nicolasf/GDATA/END19101/Working/data/GCMs/processed/forecasts/CDS/ECMWF/PRECIP/ECMWF_PRECIP_seasonal_anomalies_interp_2019_12.nc


In [23]:
dset_precip_ecmwf_forecasts

### now shift the time index so that it corresponds to the time of the forecast, not the initialisation time

### also shift to the end of the month, to match the convention used in the target time-series

In [24]:
# Same value as the `step=3` passed to get_GCM_outputs when the datasets were loaded.
# NOTE(review): `step` is assigned here but is not passed to the shift_dset_time calls
# below, which therefore rely on that helper's defaults — confirm they match step = 3.
step = 3

In [25]:
dset_t2m_ecmwf_hindcasts = shift_dset_time(dset_t2m_ecmwf_hindcasts)

In [26]:
dset_precip_ecmwf_hindcasts = shift_dset_time(dset_precip_ecmwf_hindcasts)

In [27]:
dset_t2m_ecmwf_forecasts = shift_dset_time(dset_t2m_ecmwf_forecasts)

In [28]:
dset_precip_ecmwf_forecasts = shift_dset_time(dset_precip_ecmwf_forecasts)

In [29]:
dset_precip_ecmwf_forecasts

In [30]:
dset_precip_ecmwf_forecasts

### concatenate the training (hindcast) and test (forecast) data in one dataset 

In [31]:
# Join hindcasts (training period) and forecasts (test period) along `time` into one temperature dataset.
dset_t2m = xr.concat([dset_t2m_ecmwf_hindcasts, dset_t2m_ecmwf_forecasts], dim='time')

In [32]:
# Join hindcasts (training period) and forecasts (test period) along `time` into one precipitation dataset.
dset_precip = xr.concat([dset_precip_ecmwf_hindcasts, dset_precip_ecmwf_forecasts], dim='time')

## SELECT THE FEATURES (X) HERE 

In [33]:
var_X

't2m'

In [34]:
# Pick the predictor field matching `var_X`; the variable inside each dataset
# is looked up under the same name as `var_X`.
if var_X == 'precip':
    X = dset_precip[var_X]
elif var_X == 't2m':
    X = dset_t2m[var_X]
else:
    # Fail here instead of later with a NameError (or, worse, a stale `X` left
    # in the kernel by a previous run).
    raise ValueError(f"unsupported var_X {var_X!r}: expected 'precip' or 't2m'")

In [35]:
X

### get the time index 

In [36]:
time = X.time.to_index()

In [37]:
X_data = X.data

In [38]:
X_data.shape

(324, 3995)

In [39]:
# Tabular view of the predictors: one row per time step, one integer-labelled column per flattened grid point.
X_df = pd.DataFrame(X_data, index=time)

In [40]:
X_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3985,3986,3987,3988,3989,3990,3991,3992,3993,3994
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-30,0.104904,0.176188,0.003983,-0.258628,-0.380725,-0.416544,-0.236770,-0.050384,0.047283,0.070338,...,-0.864625,-0.861614,-0.819140,-0.769956,-0.766006,-0.760540,-0.747155,-0.630586,-0.749156,-0.969845
1993-05-31,0.296425,0.352735,0.085273,-0.187171,-0.288131,-0.273714,-0.015459,0.324265,0.341323,0.375038,...,-0.467805,-0.464814,-0.457526,-0.453158,-0.396818,-0.380876,-0.367281,-0.433148,-0.373899,-0.676584
1993-06-30,-0.006445,0.161534,0.055442,0.028961,0.068204,0.125794,0.313595,0.607362,0.720145,0.726299,...,-0.897543,-0.943162,-0.976824,-0.960717,-0.937779,-0.865079,-0.817661,-0.741128,-0.824112,-1.325585
1993-07-31,-0.326852,-0.296365,-0.332528,-0.361938,-0.433313,-0.400285,-0.340890,-0.113637,0.064995,0.077836,...,-0.682955,-0.700439,-0.600338,-0.515830,-0.556718,-0.582272,-0.563884,-0.646257,-0.665005,-1.004136
1993-08-31,0.455479,0.503849,0.538331,0.536914,0.579548,0.541192,0.612661,0.740587,0.733461,0.646303,...,-0.811840,-0.834109,-0.762850,-0.676893,-0.674586,-0.690679,-0.739978,-0.690477,-0.867860,-1.180453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-30,0.210933,0.295718,0.406168,0.386706,0.501200,0.513379,0.548448,0.531270,0.507983,0.521302,...,0.339889,0.305024,0.343109,0.272847,0.270044,0.258088,0.261839,0.194630,0.227543,0.327937
2019-12-31,0.033188,0.095548,0.178470,0.108059,0.093378,0.106850,0.093414,0.087330,0.077367,0.083428,...,1.348291,1.271084,1.240580,1.123667,1.138107,1.108988,1.023416,0.759377,0.883280,0.993853
2020-01-31,-0.174839,-0.116303,-0.057346,-0.038480,-0.033943,-0.052710,-0.085817,-0.158022,-0.173884,-0.128167,...,1.463364,1.443479,1.444998,1.460079,1.419208,1.344592,1.209959,0.925137,0.889003,0.792083
2020-02-29,-0.071110,-0.052347,-0.024904,0.038628,0.049778,0.105771,0.115735,0.125555,0.149270,0.210484,...,0.321287,0.262588,0.259017,0.206882,0.253304,0.182341,0.082825,0.037076,0.131967,0.106256


### TARGETS 

In [41]:
# Folder holding the regional target time-series; the target variable and region sub-folders are appended when the CSVs are read.
dpath_target = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'targets' / 'NZ_regions' / 'NZ_6_regions'

In [42]:
# Read the target CSV of each of the six regions and tag its columns with the
# region code, giving (region, variable) column pairs.
region_codes = ['NNI','WNI','ENI','NSI','WSI','ESI']
targets = []
for reg in region_codes:
    csv_path = dpath_target / target_var / reg / f'TS_NZ_region_{reg}_{target_var}_3_quantiles_anoms.csv'
    target = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    # first level: region code, second level: the original column names
    target.columns = pd.MultiIndex.from_product([[reg], target.columns])
    targets.append(target)

In [43]:
# Put the per-region frames side by side, aligned on their date index; `targets` changes from a list to a single DataFrame here.
targets = pd.concat(targets, axis=1)

In [44]:
targets.head()

Unnamed: 0_level_0,NNI,NNI,NNI,WNI,WNI,WNI,ENI,ENI,ENI,NSI,NSI,NSI,WSI,WSI,WSI,ESI,ESI,ESI
Unnamed: 0_level_1,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies,Tmean_N,cat_3,anomalies
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1979-03-31,18.278555,3,0.462528,16.051472,3,0.317965,16.732249,3,0.62154,13.811438,2,-0.093327,11.848419,1,-0.400334,13.728706,2,-0.222255
1979-04-30,16.794408,2,0.227319,14.586906,3,0.248038,14.953599,3,0.299057,12.18945,1,-0.249176,10.58958,2,-0.110858,12.033578,2,-0.208919
1979-05-31,14.695903,2,0.282907,12.52232,3,0.425773,12.716266,2,0.314655,9.888897,1,-0.215657,8.099501,1,-0.202497,9.232035,1,-0.470303
1979-06-30,12.093823,2,-0.001099,9.888909,2,0.117671,9.929897,1,-0.065854,7.19898,1,-0.300772,5.457298,1,-0.197458,6.634168,1,-0.254247
1979-07-31,10.290536,2,-0.061355,8.182231,2,0.120974,8.208954,2,-0.063564,5.534868,2,-0.058724,3.763353,2,0.085515,4.916423,2,0.112719


In [45]:
# For every region (first column level), keep only the 'anomalies' column (second level).
targets_anomalies = targets.loc[:, (slice(None), ["anomalies"])]

In [46]:
# For every region (first column level), keep only the 'cat_3' tercile-category column (second level).
target_terciles = targets.loc[:, (slice(None), ["cat_3"])]

In [47]:
# Drop the variable level so the columns are labelled by region code only.
targets_anomalies.columns = targets_anomalies.columns.droplevel(1)

In [48]:
# Drop the variable level so the columns are labelled by region code only.
target_terciles.columns = target_terciles.columns.droplevel(1)

### target choice here (anomalies or tercile class, and region_name) 

In [49]:
# Select the target series for the configured region: tercile category or anomaly.
if target_type == 'cat_3':
    y = target_terciles.loc[:, region_name]
elif target_type == 'anomalies':
    # The anomalies frame is called `targets_anomalies` (plural) where it is
    # built above; the previous `target_anomalies` raised a NameError.
    y = targets_anomalies.loc[:, region_name]
else:
    # Fail here instead of continuing with an undefined or stale `y`.
    raise ValueError(f"unsupported target_type {target_type!r}: expected 'cat_3' or 'anomalies'")

In [50]:
print(f"\n\n\nNow going agead with TARGET {region_name}, {target_type} ------------------------ \n\n")




Now going agead with TARGET ESI, cat_3 ------------------------ 




In [51]:
X_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3985,3986,3987,3988,3989,3990,3991,3992,3993,3994
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-04-30,0.104904,0.176188,0.003983,-0.258628,-0.380725,-0.416544,-0.23677,-0.050384,0.047283,0.070338,...,-0.864625,-0.861614,-0.81914,-0.769956,-0.766006,-0.76054,-0.747155,-0.630586,-0.749156,-0.969845
1993-05-31,0.296425,0.352735,0.085273,-0.187171,-0.288131,-0.273714,-0.015459,0.324265,0.341323,0.375038,...,-0.467805,-0.464814,-0.457526,-0.453158,-0.396818,-0.380876,-0.367281,-0.433148,-0.373899,-0.676584
1993-06-30,-0.006445,0.161534,0.055442,0.028961,0.068204,0.125794,0.313595,0.607362,0.720145,0.726299,...,-0.897543,-0.943162,-0.976824,-0.960717,-0.937779,-0.865079,-0.817661,-0.741128,-0.824112,-1.325585
1993-07-31,-0.326852,-0.296365,-0.332528,-0.361938,-0.433313,-0.400285,-0.34089,-0.113637,0.064995,0.077836,...,-0.682955,-0.700439,-0.600338,-0.51583,-0.556718,-0.582272,-0.563884,-0.646257,-0.665005,-1.004136
1993-08-31,0.455479,0.503849,0.538331,0.536914,0.579548,0.541192,0.612661,0.740587,0.733461,0.646303,...,-0.81184,-0.834109,-0.76285,-0.676893,-0.674586,-0.690679,-0.739978,-0.690477,-0.86786,-1.180453


In [52]:
region_name

'ESI'

In [53]:
# Attach the target as an extra column named after the region, matching rows on the date index
# (only dates present in both the features and the target are kept).
# Note: `X_df` is overwritten, so re-running this cell alone would merge the target a second time.
X_df = X_df.merge(y, left_index=True, right_index=True)

In [54]:
# Discard every row (time step) that contains at least one missing value.
X_df = X_df.dropna(axis=0)

In [55]:
X_df.columns

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
        3986,  3987,  3988,  3989,  3990,  3991,  3992,  3993,  3994, 'ESI'],
      dtype='object', length=3996)

### include the month of the forecast as a predictor variable 

In [56]:
# X_df.loc[:,'month'] = X_df.index.month

### Now select the training and the test data

#### last 3 years 

In [57]:
# Row position where the test period starts: the last 3 * 12 = 36 monthly rows (3 years) are held out.
index_test = len(X_df) - (3 * 12)

In [58]:
index_test

285

In [59]:
# Training set: every row before the hold-out position (all columns, label included).
train_data = X_df.iloc[:index_test,:]

In [60]:
# Test set: the held-out rows from `index_test` to the end (all columns, label included).
test_data = X_df.iloc[index_test:,:]

In [61]:
train_data.shape[0] + test_data.shape[0]

321

In [62]:
len(X_df)

321

In [63]:
# Wrap the training frame in AutoGluon's tabular Dataset type (the name `train_data` is reused).
train_data = task.Dataset(train_data)

In [64]:
train_data.columns

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
        3986,  3987,  3988,  3989,  3990,  3991,  3992,  3993,  3994, 'ESI'],
      dtype='object', length=3996)

In [65]:
# Output folder for this experiment, relative to the working directory; its name encodes the
# provider, GCM, predictor, region, target variable and target type.
opath = pathlib.Path(f'./autogluon_exp_opt_{provider}_{GCM}_{var_X}pred_{region_name}_reg_{target_var}_targetvar_{target_type}_target_type')

In [66]:
# Create the output folder together with any missing parents. `exist_ok=True`
# makes the call a no-op when the folder is already there, which replaces the
# separate exists() check and closes its check-then-create gap.
opath.mkdir(parents=True, exist_ok=True)

### options 

In [67]:
# hp_tune = True  # whether or not to do hyperparameter optimization

# nn_options = { # specifies non-default hyperparameter values for neural network models
#     'num_epochs': 10, # number of training epochs (controls training time of NN models)
#     'learning_rate': ag.space.Real(1e-4, 1e-2, default=5e-4, log=True), # learning rate used in training (real-valued hyperparameter searched on log-scale)
#     'activation': ag.space.Categorical('relu', 'softrelu', 'tanh'), # activation function used in NN (categorical hyperparameter, default = first entry)
#     'layers': ag.space.Categorical([100],[1000],[200,100],[300,200,100]),
#       # Each choice for categorical hyperparameter 'layers' corresponds to list of sizes for each NN layer to use
#     'dropout_prob': ag.space.Real(0.0, 0.5, default=0.1), # dropout probability (real-valued hyperparameter)
# }

# gbm_options = { # specifies non-default hyperparameter values for lightGBM gradient boosted trees
#     'num_boost_round': 100, # number of boosting rounds (controls training time of GBM models)
#     'num_leaves': ag.space.Int(lower=26, upper=66, default=36), # number of leaves in trees (integer hyperparameter)
# }

# hyperparameters = {'NN': nn_options, 'GBM': gbm_options}  # hyperparameters of each model type
# # If one of these keys is missing from hyperparameters dict, then no models of that type are trained.

# time_limits = 2*60  # train various models for ~2 min
# num_trials = 5  # try at most 3 different hyperparameter configurations for each type of model
# search_strategy = 'skopt'  # to tune hyperparameters using SKopt Bayesian optimization routine


In [68]:
train_data.columns

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
        3986,  3987,  3988,  3989,  3990,  3991,  3992,  3993,  3994, 'ESI'],
      dtype='object', length=3996)

In [69]:
# Fit AutoGluon on the training rows; the label is the column named after the region and
# the models are written to `opath`.
# NOTE(review): no random seed is set anywhere in the notebook, so repeated runs may not give identical models — confirm.
predictor = task.fit(train_data=train_data, label=region_name, auto_stack=True, output_directory=opath)

Beginning AutoGluon training ...
AutoGluon will save models to autogluon_exp_opt_CDS_ECMWF_t2mpred_ESI_reg_TMEAN_targetvar_cat_3_target_type/
Train Data Rows:    285
Train Data Columns: 3996
Preprocessing data ...
Here are the first 10 unique label values in your data:  [1 2 3]
AutoGluon infers your prediction problem is: multiclass  (because dtype of label-column == int, but few unique label-values observed)
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Feature Generator processed 285 data points with 3995 features
Original Features:
	float features: 3995
Generated Features:
	int features: 0
All Features:
	float features: 3995
	int features: 0
	Data preprocessing and feature engineering runtime = 4.99s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models usin

In [70]:
# predictor = task.fit(train_data=train_data, label=region_name,
#                      output_directory=opath, time_limits=time_limits, num_trials=num_trials,
#                      hyperparameter_tune=hp_tune, hyperparameters=hyperparameters,
#                      search_strategy=search_strategy)

### get the test data into a task.Dataset 

In [71]:
# Wrap the held-out rows in AutoGluon's tabular Dataset type (the name `test_data` is reused).
test_data = task.Dataset(test_data)

# Observed labels of the held-out rows, as a plain array.
y_test = test_data[region_name].values # values to predict

# Feature-only copy handed to the predictor; `test_data` itself keeps the label column.
test_data_nolab = test_data.drop(labels=[region_name],axis=1) # delete label column to prove we're not cheating

In [72]:
# Predicted class for each held-out row, computed from the label-free features.
y_pred = predictor.predict(test_data_nolab)

In [73]:
# Score the predictions against the observed labels; auxiliary_metrics=True also reports the
# extra scores and the per-class report shown in the output below.
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.5555555555555556
Evaluations on test data:
{
    "accuracy": 0.5555555555555556,
    "accuracy_score": 0.5555555555555556,
    "balanced_accuracy_score": 0.30303030303030304,
    "matthews_corrcoef": -0.12263425353455155
}
  'precision', 'predicted', average, warn_for)
Detailed (per-class) classification report:
{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 5
    },
    "2": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 9
    },
    "3": {
        "precision": 0.5882352941176471,
        "recall": 0.9090909090909091,
        "f1-score": 0.7142857142857143,
        "support": 22
    },
    "accuracy": 0.5555555555555556,
    "macro avg": {
        "precision": 0.19607843137254902,
        "recall": 0.30303030303030304,
        "f1-score": 0.2380952380952381,
        "support": 36
    },
    "weighted avg": {
        "precision": 0.359477124183

In [74]:
y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

### ACC (pure)

In [75]:
(test_data.loc[:,region_name].values == y_pred).sum() / len(y_pred)

0.5555555555555556

In [76]:
y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [77]:
test_data.loc[:,region_name].values

array([2, 1, 1, 2, 2, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 3, 3, 3, 2,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3])

In [78]:
from sklearn import metrics

In [79]:
# Confusion matrix with observed classes as rows and predicted classes as columns
# (first argument is the truth, second the prediction).
metrics.confusion_matrix(test_data.loc[:,region_name].values,y_pred)

array([[ 0,  0,  5],
       [ 0,  0,  9],
       [ 2,  0, 20]])

In [80]:
d = predictor.leaderboard(silent=True)

In [81]:
region_name

'ESI'

In [82]:
d

Unnamed: 0,model,score_val,fit_time,pred_time_val,stack_level
10,weighted_ensemble_k0_l1,0.617544,0.419375,0.000829,1
7,CatboostClassifier_STACKER_l0,0.617544,418.54248,3.565936,0
8,NeuralNetClassifier_STACKER_l0,0.6,20.030858,1.970192,0
6,LightGBMClassifier_STACKER_l0,0.6,42.355293,3.385814,0
9,LightGBMClassifierCustom_STACKER_l0,0.578947,242.667607,3.585095,0
3,ExtraTreesClassifierEntr_STACKER_l0,0.561404,5.89451,3.965996,0
2,ExtraTreesClassifierGini_STACKER_l0,0.557895,5.447706,3.802136,0
0,RandomForestClassifierGini_STACKER_l0,0.550877,5.988675,3.797569,0
1,RandomForestClassifierEntr_STACKER_l0,0.547368,7.008385,4.262236,0
5,KNeighborsClassifierDist_STACKER_l0,0.505263,6.77519,7.237221,0


In [83]:
predictor.save()

TabularPredictor saved. To load, use: TabularPredictor.load("autogluon_exp_opt_CDS_ECMWF_t2mpred_ESI_reg_TMEAN_targetvar_cat_3_target_type/")


### predict the probabilities and then calculate the accuracy the "SCO way", i.e. with a tolerance of 5% on the target category probability

In [84]:
probs = predictor.predict_proba(test_data_nolab)

In [85]:
# Label the three probability columns T1, T2 and T3.
# NOTE(review): this assumes predict_proba returns its columns in class order 1, 2, 3 — confirm against the predictor's class labels.
y_probas = pd.DataFrame(probs, columns=['T1','T2','T3'])

In [86]:
y_probas.loc[:,'y_pred'] = y_pred

In [87]:
y_probas.loc[:,'y_obs'] = y_test

In [88]:
y_probas.index = test_data.index

In [89]:
(test_data.loc[:,region_name].values == y_pred).sum() / len(y_pred)

0.5555555555555556

In [90]:
y_probas.head()

Unnamed: 0_level_0,T1,T2,T3,y_pred,y_obs
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-31,0.184003,0.348129,0.467868,3,2
2017-02-28,0.21692,0.359575,0.423504,3,1
2017-03-31,0.179058,0.286624,0.534318,3,1
2017-04-30,0.16527,0.316457,0.518273,3,2
2017-05-31,0.166734,0.292485,0.540781,3,2


In [91]:
# Convert the probabilities from fractions to percentages.
# The scaling is done in place, so re-running this cell alone would multiply by 100 again.
y_probas.loc[:,['T1','T2','T3']] *= 100.

In [92]:
y_probas.head()

Unnamed: 0_level_0,T1,T2,T3,y_pred,y_obs
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-31,18.40027,34.812914,46.786816,3,2
2017-02-28,21.692019,35.957545,42.350436,3,1
2017-03-31,17.905779,28.662431,53.43179,3,1
2017-04-30,16.527044,31.645679,51.827277,3,2
2017-05-31,16.67339,29.248513,54.078098,3,2


In [93]:
# Needed for the `evaluation` import in the next cell.
# NOTE(review): the same folder was already appended near the top of the notebook (there with a
# trailing slash), so this adds a second, redundant sys.path entry.
sys.path.append('../../../../ml4seas')

In [94]:
from evaluation import calc_accuracy_sco

In [95]:
datac = y_probas.loc[:,['T1','T2','T3','y_obs']]

In [96]:
tolerance = True

In [97]:
# Assemble the table passed to calc_accuracy_sco: column 0 holds the observed
# category, columns 1-3 the T1 / T2 / T3 probabilities (already in percent).
# `np.int` was only an alias of the builtin `int`; it is deprecated since
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
obs = datac.y_obs.values.astype(int)
probs = datac.loc[:,['T1','T2','T3']].values
# np.c_ stacks everything into a single float array, so column 0 is float again
# at this point; it is cast back to integers in the next cell.
df = pd.DataFrame(np.c_[obs, probs], index=datac.index)

In [98]:
# Cast the observed-category column (column 0) back to integers.
# Whole-column assignment is used because `df.loc[:, 0] = ...` writes in place
# on pandas >= 2.0 and would leave the column as float64. The builtin `int`
# replaces `np.int`, which was removed in NumPy 1.24.
df[0] = df[0].astype(int)

In [99]:
# Accuracy from the project's SCO-style scorer; `tolerance` is forwarded as set above.
# NOTE(review): per the section heading this applies a 5% tolerance on the target-category
# probability — confirm in evaluation.calc_accuracy_sco.
acc = calc_accuracy_sco(df, tolerance=tolerance)

In [100]:
print(acc)

0.5833333333333334


In [101]:
# Keep a copy of the experiment folder, tagged with its score, when the SCO accuracy exceeds 0.6.
if acc > 0.6:
    # destination name: the experiment folder plus the accuracy in percent (one decimal)
    opath_acc = str(opath) + f'_ACC{acc*100:4.1f}percent'
    # only copy when the tagged folder does not exist yet
    if not pathlib.Path(opath_acc).exists():
        copytree(opath, opath_acc)

In [102]:
region_name

'ESI'