In [1]:
# Parameters 

# region code; combined with `target_type` to build the name of the target column
region_name = 'NNI'
# variable name used to select the CSV files to read (see the file listing below: 'TMEAN' or 'RAIN')
var_name = 'TMEAN'
# target type; the target column read from the CSV is f'{region_name}_{target_type}'
target_type = 'cat3_categories'
# GCM to keep when preparing the data ('All' disables the filtering by GCM)
GCM = 'ECMWF'
# GCM = 'All'
# if True, the 'GCMs_std_and_targets_...' CSV files are read instead of the 'GCMs_and_targets_...' ones
standardized = False

In [2]:
%matplotlib inline

In [3]:
import os 
import sys 
import pathlib

In [4]:
HOME = pathlib.Path.home()

In [5]:
from matplotlib import pyplot as plt

In [6]:
import numpy as np 
import pandas as pd

In [7]:
from pycaret import datasets

In [8]:
from pycaret.classification import *

In [9]:
dpath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [10]:
list(dpath.glob("*.csv"))

[PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_training_set.csv')]

In [11]:
# Select the pair of CSV files matching the `standardized` flag first,
# then read both files with exactly the same options.
if standardized:
    train_csv = dpath / f'GCMs_std_and_targets_cat3_and_anomalies_{var_name}_training_set.csv'
    test_csv = dpath / f'GCMs_std_and_targets_cat3_and_anomalies_{var_name}_test_set.csv'
else:
    train_csv = dpath / f'GCMs_and_targets_cat3_and_anomalies_{var_name}_training_set.csv'
    test_csv = dpath / f'GCMs_and_targets_cat3_and_anomalies_{var_name}_test_set.csv'

# first column is the index, parsed as dates
train_data = pd.read_csv(train_csv, index_col=0, parse_dates=True)
test_data = pd.read_csv(test_csv, index_col=0, parse_dates=True)

### small function that prepares the data for ingestion in `PyCARET` 

In [12]:
# %%writefile ../../../../ml4seas/GCM/prepare_data_CSV_to_CARET.py
def prepare_data_CSV_to_CARET(data, gcm_index=-13, GCM='All', region_name=None, target_type=None, scaling=True, doPCA=True, n_components=0.9): 
    """
    Small function that prepares the data initially contained in the 
    processed CSV files for ingestion into PyCARET
    
    Arguments
    --------- 
    
    - data : pandas DataFrame read from one of the processed CSV files. 
            It must contain a 'GCM' column and the target column 
            f'{region_name}_{target_type}'
    - gcm_index : negative index (always) marking the end of the 
            predictor columns: the columns `data.iloc[:, 0:gcm_index]` 
            are used as features. Default is -13 (presumably the 'GCM' 
            column plus 12 target columns -- TODO confirm against the CSVs)
    - GCM: 'All' or specific GCM in ['CMCC', 'CanCM4i', 'CanSIPSv2', 
                                    'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
                                    'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO']
    - region_name : ['NNI','WNI','ENI','NSI','WSI','ESI']
    - target_type : 'cat3_categories' or 'anomalies'
    - scaling : if True, the features are standardized with a 
            `StandardScaler` fitted on `data`
    - doPCA : if True, the (optionally scaled) features are replaced by 
            their principal components, with a `PCA` fitted on `data`
    - n_components : passed as is to `sklearn.decomposition.PCA`
    
    Return
    ------
    
    - data : DataFrame with the features (columns 'PC1', 'PC2', ... if 
            doPCA is True, the original feature columns otherwise) 
            followed by the target column, filtered by GCM (if not 'All')
    - GCMs_name : one-column DataFrame ('GCM') aligned with `data`
    - scaler : the fitted `StandardScaler`, or None if scaling is False 
    - pca : the fitted `PCA`, or None if doPCA is False

    Raise
    -----

    - KeyError if the target column f'{region_name}_{target_type}' is 
      not present in `data`

    """
    
    import pandas as pd 

    # extract one GCM if not 'All'
    if GCM != 'All': 
        data = data.query(f"GCM == '{GCM}'") 
    
    # GCM name and associated index
    GCMs_name = data.loc[:,['GCM']]  
    
    # GCM (features)
    GCM_data = data.iloc[:,0:gcm_index]

    # target variable: fail early with an explicit message if it is missing 
    # (e.g. region_name or target_type left to None)
    target_name = f'{region_name}_{target_type}'
    if target_name not in data.columns: 
        raise KeyError(f"target column '{target_name}' not found in the data, check `region_name` and `target_type`")
    target = data.loc[:,[target_name]]
    
    # get the values for X 
    X = GCM_data.values
    
    # sklearn is only imported when it is actually needed
    if scaling:
        from sklearn.preprocessing import StandardScaler 
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    else: 
        scaler = None
    
    if doPCA: 
        from sklearn.decomposition import PCA
        pca = PCA(n_components=n_components)
        pca = pca.fit(X)
        X = pca.transform(X)
        columns = [f"PC{i}" for i in range(1, X.shape[1] + 1)]
    else: 
        pca = None
        columns = GCM_data.columns

    # now casts X into a DataFrame; X (and not the raw values) is used in 
    # both cases so that the scaling is not lost when doPCA is False
    df = pd.DataFrame(X, index=GCM_data.index, columns=columns)
        
    # add the target variable 
    data = pd.concat([df, target], axis=1)
    
    return data, GCMs_name, scaler, pca

In [13]:
# the scaler and the PCA are fitted here, on the training set; both fitted objects are kept for later use
train_data, GCMs_name_train, scaler, pca = prepare_data_CSV_to_CARET(train_data, GCM=GCM, region_name=region_name, target_type=target_type, scaling=True, doPCA=True)

In [14]:
train_data.shape

(288, 34)

In [15]:
train_data.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28',
       'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'NNI_cat3_categories'],
      dtype='object')

In [16]:
# scaling and PCA are switched off for the test set: the `scaler` and `pca` objects
# fitted on the training set are applied to it further down in the notebook
test_data, GCMs_name_test, _, _ = prepare_data_CSV_to_CARET(test_data, GCM=GCM, region_name=region_name, target_type=target_type, scaling=False, doPCA=False)

In [17]:
test_data.shape

(33, 4930)

In [18]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
pca

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

### set up the PYCARET experiment, use only the training set (cross validation will be used to evaluate the models)

### Note that the first time around, the argument `silent` should be left at its default (False) so that the user can verify the data type inferred for each column; once the data types have been checked and are correctly interpreted, `silent` can be set to True

In [20]:
silent = True

In [21]:
# normalize / transformation / pca are all disabled in PyCaret: `train_data` has already
# been standardized and reduced to principal components by `prepare_data_CSV_to_CARET`
exp_clf = setup(data = train_data, target = f'{region_name}_{target_type}', session_id=123,
                  normalize = False, 
                  transformation = False, 
                  pca=False, silent=silent)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(288, 34)"
4,Missing Values,False
5,Numeric Features,33
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### quick comparison of the models 

In [22]:
df_compare_models = compare_models()

In [23]:
df_compare_models.data

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Light Gradient Boosting Machine,0.6021,0.0,0.5736,0.5992,0.5871,0.3742
1,CatBoost Classifier,0.5919,0.0,0.5425,0.5725,0.5617,0.3368
2,Extra Trees Classifier,0.5771,0.0,0.5217,0.5602,0.5407,0.3066
3,Extreme Gradient Boosting,0.5771,0.0,0.5429,0.5659,0.5606,0.3292
4,K Neighbors Classifier,0.5764,0.0,0.5342,0.5643,0.5488,0.3182
5,Ridge Classifier,0.5717,0.0,0.5387,0.5538,0.5483,0.3255
6,Linear Discriminant Analysis,0.5717,0.0,0.5443,0.5575,0.5532,0.3305
7,SVM - Linear Kernel,0.5676,0.0,0.5432,0.5564,0.5522,0.3232
8,Gradient Boosting Classifier,0.5671,0.0,0.5377,0.5585,0.5517,0.3146
9,Logistic Regression,0.5669,0.0,0.5398,0.5437,0.5431,0.3212


In [24]:
df_compare_models.data.loc[[0],:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Light Gradient Boosting Machine,0.6021,0.0,0.5736,0.5992,0.5871,0.3742


In [25]:
lg = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.619,0.5607,0.3051
1,0.35,0.0,0.3074,0.3724,0.3426,-0.0442
2,0.5,0.0,0.4963,0.5464,0.517,0.2424
3,0.5,0.0,0.4259,0.4149,0.4535,0.1968
4,0.6,0.0,0.5519,0.55,0.5595,0.3574
5,0.75,0.0,0.7296,0.7679,0.7467,0.6154
6,0.8,0.0,0.7926,0.8062,0.7944,0.6923
7,0.5,0.0,0.4296,0.3898,0.4369,0.2157
8,0.7,0.0,0.7296,0.775,0.7097,0.5489
9,0.75,0.0,0.7185,0.75,0.75,0.6124


### first pathway: tune the best model 

In [26]:
tuned_best_model = tune_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.5619,0.5556,0.3201
1,0.6,0.0,0.5407,0.5936,0.5723,0.3443
2,0.5,0.0,0.4963,0.5438,0.5171,0.2308
3,0.45,0.0,0.3704,0.3625,0.4,0.1057
4,0.55,0.0,0.4852,0.5,0.5095,0.2771
5,0.6,0.0,0.5593,0.59,0.5927,0.3701
6,0.8,0.0,0.7741,0.8148,0.7994,0.6813
7,0.4,0.0,0.3333,0.3557,0.3765,0.0514
8,0.65,0.0,0.6444,0.7065,0.6642,0.4677
9,0.75,0.0,0.7296,0.775,0.7451,0.5968


### doesn't work ...

In [27]:
test_data.shape

(33, 4930)

In [28]:
test_data.head()

Unnamed: 0,"(-70.0, 70.0)","(-70.0, 72.5)","(-70.0, 75.0)","(-70.0, 77.5)","(-70.0, 80.0)","(-70.0, 82.5)","(-70.0, 85.0)","(-70.0, 87.5)","(-70.0, 90.0)","(-70.0, 92.5)",...,"(60.0, 280.0)","(60.0, 282.5)","(60.0, 285.0)","(60.0, 287.5)","(60.0, 290.0)","(60.0, 292.5)","(60.0, 295.0)","(60.0, 297.5)","(60.0, 300.0)",NNI_cat3_categories
2017-04-30,0.350412,0.609415,0.402756,0.494441,0.545367,0.577342,0.595652,0.551609,0.509443,0.442834,...,-0.187773,-0.346976,-0.421439,-0.445218,-0.466354,-0.518676,-0.528607,-0.416464,-0.623176,3.0
2017-05-31,0.167277,0.057477,-0.028681,0.093749,0.113651,0.133755,0.125145,0.153855,0.207044,0.253551,...,0.641738,0.679107,0.567584,0.559127,0.376234,0.25422,0.37983,0.270595,0.05505,3.0
2017-06-30,-0.037492,-0.200014,-0.077527,0.049841,0.124295,0.196644,0.237064,0.281379,0.293671,0.282525,...,-0.026262,-0.112167,-0.145194,-0.129408,-0.066313,-0.177321,-0.259453,-0.087841,-0.179672,3.0
2017-07-31,0.24102,0.126914,0.265512,0.330046,0.428405,0.540412,0.607662,0.652604,0.647708,0.65447,...,0.183818,0.04867,-0.025598,0.03396,-0.108763,-0.028818,0.147046,0.062604,0.05951,2.0
2017-08-31,0.347808,0.32761,0.384274,0.453615,0.468264,0.500675,0.520086,0.53589,0.554246,0.594686,...,-0.206735,-0.482101,-0.51986,-0.337142,-0.322366,-0.332399,-0.375129,-0.291196,-0.264078,3.0


In [29]:
test_data.mean()

(-70.0, 70.0)          0.578987
(-70.0, 72.5)          0.478256
(-70.0, 75.0)          0.464598
(-70.0, 77.5)          0.501343
(-70.0, 80.0)          0.470704
                         ...   
(60.0, 292.5)          0.140480
(60.0, 295.0)          0.269352
(60.0, 297.5)          0.144552
(60.0, 300.0)          0.083423
NNI_cat3_categories    2.909091
Length: 4930, dtype: float64

### transform the test data first (standardization using the scaler 'fitted' on the training data)

In [30]:
x = scaler.transform(test_data.drop(f"{region_name}_{target_type}", axis=1))

### now project the scaled GCM outputs of the test set onto the principal components fitted on the training data

In [34]:
x = pca.transform(x)

In [35]:
x.shape

(33, 33)

In [36]:
train_data.shape

(288, 34)

### make a dataframe using the indices of test data and columns of train_data 

In [37]:
test_data_df = pd.DataFrame(x, index=test_data.index, columns=train_data.columns[:-1])

In [39]:
test_data_df.shape

(33, 33)

### and now add the target column 

In [40]:
test_data.columns[-1]

'NNI_cat3_categories'

In [48]:
test_data_df.loc[:,f"{region_name}_{target_type}"] = test_data.iloc[:,-1]

### Now apply the `predict` method of the `lightgbm` model to the test data

In [51]:
y_hat = lg.predict(test_data_df.iloc[:,:-1])

In [54]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.7575757575757576

In [55]:
blend_all = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6667,0.0,0.6392,0.6905,0.6589,0.4674
1,0.55,0.0,0.5148,0.5114,0.525,0.2857
2,0.45,0.0,0.4296,0.5131,0.4724,0.1635
3,0.5,0.0,0.4074,0.4,0.4429,0.187
4,0.6,0.0,0.5519,0.55,0.5595,0.3574
5,0.75,0.0,0.7296,0.7679,0.7467,0.6154
6,0.65,0.0,0.663,0.6756,0.6479,0.4677
7,0.45,0.0,0.3815,0.428,0.4271,0.1304
8,0.65,0.0,0.6556,0.65,0.65,0.4574
9,0.6,0.0,0.5333,0.4393,0.5022,0.3416


In [56]:
y_hat = blend_all.predict(test_data_df.iloc[:,:-1])

In [57]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.8484848484848485

### third approach: blend the 3 best models (note that catboost cannot be used)

In [58]:
df_compare_models.data.loc[0:5,:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Light Gradient Boosting Machine,0.6021,0.0,0.5736,0.5992,0.5871,0.3742
1,CatBoost Classifier,0.5919,0.0,0.5425,0.5725,0.5617,0.3368
2,Extra Trees Classifier,0.5771,0.0,0.5217,0.5602,0.5407,0.3066
3,Extreme Gradient Boosting,0.5771,0.0,0.5429,0.5659,0.5606,0.3292
4,K Neighbors Classifier,0.5764,0.0,0.5342,0.5643,0.5488,0.3182
5,Ridge Classifier,0.5717,0.0,0.5387,0.5538,0.5483,0.3255


### first step: create models 

In [59]:
lg = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.619,0.5607,0.3051
1,0.35,0.0,0.3074,0.3724,0.3426,-0.0442
2,0.5,0.0,0.4963,0.5464,0.517,0.2424
3,0.5,0.0,0.4259,0.4149,0.4535,0.1968
4,0.6,0.0,0.5519,0.55,0.5595,0.3574
5,0.75,0.0,0.7296,0.7679,0.7467,0.6154
6,0.8,0.0,0.7926,0.8062,0.7944,0.6923
7,0.5,0.0,0.4296,0.3898,0.4369,0.2157
8,0.7,0.0,0.7296,0.775,0.7097,0.5489
9,0.75,0.0,0.7185,0.75,0.75,0.6124


In [60]:
et = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.5992,0.5668,0.3152
1,0.55,0.0,0.5037,0.625,0.5368,0.25
2,0.5,0.0,0.4667,0.5417,0.5107,0.2188
3,0.5,0.0,0.3889,0.37,0.4042,0.1561
4,0.5,0.0,0.4185,0.4353,0.4498,0.1803
5,0.75,0.0,0.7111,0.8042,0.7175,0.6
6,0.65,0.0,0.6259,0.6864,0.6525,0.44
7,0.45,0.0,0.3333,0.2382,0.3115,0.0476
8,0.7,0.0,0.663,0.6948,0.6903,0.5219
9,0.6,0.0,0.5519,0.6071,0.5672,0.3361


In [61]:
xg = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.5992,0.5668,0.3152
1,0.4,0.0,0.3444,0.3683,0.3742,0.0283
2,0.55,0.0,0.5519,0.6312,0.5722,0.305
3,0.45,0.0,0.3519,0.3369,0.3818,0.0947
4,0.45,0.0,0.4111,0.445,0.4453,0.1339
5,0.7,0.0,0.6444,0.6682,0.675,0.5238
6,0.7,0.0,0.7111,0.7,0.7,0.5349
7,0.45,0.0,0.363,0.35,0.3929,0.1165
8,0.65,0.0,0.6741,0.6917,0.6532,0.4776
9,0.85,0.0,0.8222,0.8682,0.845,0.7619


In [62]:
blend_specific = blend_models(estimator_list = [et,xg, lg])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5714,0.0,0.5545,0.5992,0.5668,0.3152
1,0.4,0.0,0.3444,0.4,0.3786,0.0244
2,0.45,0.0,0.4296,0.5146,0.4697,0.1506
3,0.45,0.0,0.3519,0.3369,0.3818,0.0947
4,0.55,0.0,0.5148,0.5114,0.525,0.2857
5,0.8,0.0,0.7667,0.7967,0.7881,0.6875
6,0.7,0.0,0.6815,0.7364,0.7037,0.5257
7,0.5,0.0,0.4296,0.3603,0.4182,0.1903
8,0.65,0.0,0.6741,0.6714,0.6527,0.4697
9,0.75,0.0,0.7296,0.7517,0.7408,0.6094


In [63]:
y_hat = blend_specific.predict(test_data_df.iloc[:,:-1])

In [64]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.8181818181818182