In [None]:
# Parameters
# (read by the CSV loading, data preparation and PyCaret setup cells below)

region_name = 'NNI'  # region prefix of the target column, e.g. 'NNI' -> 'NNI_cat3_categories'
var_name = 'TMEAN'  # variable tag in the CSV file names (files exist for 'TMEAN' and 'RAIN')
target_type = 'cat3_categories'  # suffix of the target column: f'{region_name}_{target_type}'
GCM = 'UKMO'  # keep only the rows of this GCM; 'All' keeps every GCM
# GCM = 'All'
standardized = False  # True reads the 'GCMs_std_*' CSV files instead of the 'GCMs_*' ones

In [2]:
%matplotlib inline

In [3]:
import os 
import sys 
import pathlib

In [4]:
HOME = pathlib.Path.home()

In [5]:
from matplotlib import pyplot as plt

In [6]:
import numpy as np 
import pandas as pd

In [7]:
from pycaret import datasets

In [8]:
from pycaret.classification import *

In [9]:
dpath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [10]:
list(dpath.glob("*.csv"))

[PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_training_set.csv')]

In [11]:
# The standardized and raw CSV files only differ by their prefix
# ('GCMs_std_...' vs 'GCMs_...'), so pick the prefix once instead of
# duplicating the two `read_csv` calls in each branch.
csv_prefix = 'GCMs_std' if standardized else 'GCMs'

# first column is the (date) index in both files
train_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_training_set.csv', index_col=0, parse_dates=True)
test_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_test_set.csv', index_col=0, parse_dates=True)

### small function that prepares the data for ingestion in `PyCARET` 

In [12]:
# %%writefile ../../../../ml4seas/GCM/prepare_data_CSV_to_CARET.py
def prepare_data_CSV_to_CARET(data, gcm_index=-13, GCM='All', region_name=None, target_type=None, scaling=True, doPCA=True, n_components=0.9):
    """
    Prepare the data read from the processed CSV files for ingestion into PyCaret.

    The features (GCM outputs) are optionally standardized and / or reduced
    with a PCA, then returned in a DataFrame together with ONE target column.

    Arguments
    ---------

    - data : pandas DataFrame read from one of the processed CSV files. It must
            contain a 'GCM' column and the target column
            f'{region_name}_{target_type}'.
    - gcm_index : negative index (always); the features are the columns
            `data.iloc[:, 0:gcm_index]`, i.e. everything before the last
            `abs(gcm_index)` columns. Default is -13 (the original note
            associated the default with the 'ext_regional' geographical
            domain -- TODO confirm for other domains).
    - GCM : 'All' or specific GCM in ['CMCC', 'CanCM4i', 'CanSIPSv2',
                                    'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
                                    'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO']
    - region_name : ['NNI','WNI','ENI','NSI','WSI','ESI']
    - target_type : 'cat3_categories' or 'anomalies'
    - scaling : if True, fit a `StandardScaler` on the features and standardize them
    - doPCA : if True, fit a `PCA` on the (optionally scaled) features and
            replace them by the principal components, named 'PC1', 'PC2', ...
    - n_components : passed as is to `sklearn.decomposition.PCA`
            (a float in (0, 1) is the fraction of variance to retain)

    Return
    ------

    - data : DataFrame with the (optionally scaled / PCA-reduced) features of the
            selected GCM (if not 'All'), followed by the target column
    - GCMs_name : single-column DataFrame ('GCM') aligned with the rows of `data`
    - scaler : the fitted `StandardScaler`, or None if `scaling` is False
    - pca : the fitted `PCA`, or None if `doPCA` is False

    Raises
    ------

    - KeyError : if the target column f'{region_name}_{target_type}' is not in `data`
    """

    import pandas as pd

    # extract one GCM if not 'All'
    if GCM != 'All':
        data = data.loc[data['GCM'] == GCM]

    # GCM name of each row (kept as a one-column DataFrame)
    GCMs_name = data.loc[:, ['GCM']]

    # GCM outputs (features)
    GCM_data = data.iloc[:, 0:gcm_index]

    # target variable, with an explicit message when the column is missing
    target_col = f'{region_name}_{target_type}'
    if target_col not in data.columns:
        raise KeyError(f"target column '{target_col}' not found in the data, check `region_name` and `target_type`")
    target = data.loc[:, [target_col]]

    # get the values for X
    X = GCM_data.values

    # scikit-learn is only imported when actually needed, so the plain
    # filtering path (scaling=False, doPCA=False) does not depend on it
    scaler = None
    if scaling:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    pca = None
    if doPCA:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=n_components).fit(X)
        X = pca.transform(X)
        columns = [f"PC{i}" for i in range(1, X.shape[1] + 1)]
    else:
        columns = GCM_data.columns

    # now casts X into a DataFrame; X (not the raw values) is used in every
    # case so that scaling=True, doPCA=False returns the *scaled* features
    df = pd.DataFrame(X, index=GCM_data.index, columns=columns)

    # add the target variable
    data = pd.concat([df, target], axis=1)

    return data, GCMs_name, scaler, pca

In [13]:
# Fit the scaler and the PCA on the training set only; both fitted objects are kept
# so that they can be applied to the test set later.
# NOTE: this rebinds `train_data` to the prepared frame (PC columns + target, no 'GCM'
# column), so re-running this cell without re-running the CSV loading cell first fails.
train_data, GCMs_name_train, scaler, pca = prepare_data_CSV_to_CARET(train_data, GCM=GCM, region_name=region_name, target_type=target_type, scaling=True, doPCA=True)

In [14]:
train_data.shape

(287, 35)

In [15]:
train_data.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28',
       'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'NNI_cat3_categories'],
      dtype='object')

In [16]:
# Only filter the test set here (no scaling, no PCA): the `scaler` and `pca` fitted on
# the training set are applied to it further down, so nothing is fitted on test data.
# NOTE: like the training cell, this rebinds `test_data` (the 'GCM' column is dropped),
# so it cannot be re-run without reloading the CSV first.
test_data, GCMs_name_test, _, _ = prepare_data_CSV_to_CARET(test_data, GCM=GCM, region_name=region_name, target_type=target_type, scaling=False, doPCA=False)

In [17]:
test_data.shape

(25, 4930)

In [18]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
pca

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

### set up the PYCARET experiment, use only the training set (cross validation will be used to evaluate the models)

### Note: on the first run, leave the `silent` argument at its default (`False`) so that the inferred data type of each column can be checked; once the data types are confirmed to be correctly interpreted, `silent` can be set to `True`

In [20]:
silent = True

In [21]:
# The features were already standardized and PCA-reduced when the training set
# was prepared, hence PyCaret's own normalization, transformation and PCA are
# all switched off here.
exp_clf = setup(
    data=train_data,
    target=f'{region_name}_{target_type}',
    session_id=123,
    silent=silent,
    normalize=False,
    transformation=False,
    pca=False,
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(287, 35)"
4,Missing Values,False
5,Numeric Features,34
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### quick comparison of the models 

In [22]:
df_compare_models = compare_models()

In [23]:
df_compare_models.data

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.6,0.0,0.5384,0.5777,0.5602,0.3457
1,Quadratic Discriminant Analysis,0.59,0.0,0.5326,0.5745,0.5632,0.3413
2,CatBoost Classifier,0.58,0.0,0.5274,0.5531,0.5519,0.321
3,Extreme Gradient Boosting,0.57,0.0,0.5327,0.5613,0.5531,0.3187
4,Light Gradient Boosting Machine,0.57,0.0,0.5283,0.5477,0.5414,0.3176
5,SVM - Linear Kernel,0.565,0.0,0.5272,0.5666,0.5452,0.3193
6,Gradient Boosting Classifier,0.565,0.0,0.5282,0.5478,0.5405,0.3019
7,K Neighbors Classifier,0.56,0.0,0.5141,0.5837,0.5448,0.2882
8,Random Forest Classifier,0.555,0.0,0.5019,0.5222,0.518,0.2729
9,Logistic Regression,0.54,0.0,0.5082,0.5542,0.5268,0.2769


In [24]:
df_compare_models.data.loc[[0],:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.6,0.0,0.5384,0.5777,0.5602,0.3457


In [25]:
lg = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.55,0.0,0.4963,0.4275,0.4667,0.2469
1,0.5,0.0,0.4481,0.4625,0.4667,0.1968
2,0.4,0.0,0.2963,0.2571,0.313,0.0
3,0.45,0.0,0.4407,0.4562,0.4511,0.1603
4,0.7,0.0,0.6741,0.7269,0.6828,0.5102
5,0.65,0.0,0.6259,0.6723,0.659,0.4636
6,0.5,0.0,0.4481,0.4958,0.4659,0.2
7,0.4,0.0,0.4037,0.45,0.4,0.1176
8,0.75,0.0,0.7111,0.7348,0.7202,0.6047
9,0.8,0.0,0.7381,0.7932,0.7885,0.6761


### first pathway: tune the best model 

In [26]:
tuned_best_model = tune_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.45,0.0,0.4222,0.3952,0.4121,0.102
1,0.4,0.0,0.3556,0.3083,0.3481,0.04
2,0.45,0.0,0.363,0.3233,0.3625,0.0756
3,0.35,0.0,0.337,0.3875,0.3657,0.0
4,0.75,0.0,0.7407,0.75,0.7262,0.5984
5,0.6,0.0,0.5593,0.6036,0.5996,0.3774
6,0.5,0.0,0.4481,0.5,0.46,0.1736
7,0.6,0.0,0.5704,0.6071,0.595,0.3846
8,0.7,0.0,0.6444,0.675,0.6724,0.5161
9,0.5,0.0,0.4841,0.4955,0.492,0.1903


### doesn't work ...

In [27]:
test_data.shape

(25, 4930)

In [28]:
test_data.head()

Unnamed: 0,"(-70.0, 70.0)","(-70.0, 72.5)","(-70.0, 75.0)","(-70.0, 77.5)","(-70.0, 80.0)","(-70.0, 82.5)","(-70.0, 85.0)","(-70.0, 87.5)","(-70.0, 90.0)","(-70.0, 92.5)",...,"(60.0, 280.0)","(60.0, 282.5)","(60.0, 285.0)","(60.0, 287.5)","(60.0, 290.0)","(60.0, 292.5)","(60.0, 295.0)","(60.0, 297.5)","(60.0, 300.0)",NNI_cat3_categories
2017-12-31,1.572832,1.645892,0.818441,1.262659,1.36272,1.456549,1.395253,1.354858,1.297055,1.2935,...,2.311017,2.192543,1.845751,2.02853,2.101238,1.282888,1.524736,0.966009,1.051154,3.0
2018-01-31,1.09722,1.008447,0.348687,0.817974,0.977535,1.044698,0.975897,0.923493,0.848303,0.796518,...,2.848023,2.4386,1.714919,1.902448,2.404874,1.886316,2.177145,1.626962,1.45573,3.0
2018-02-28,0.449145,0.54605,0.066672,0.50987,0.662122,0.756574,0.690453,0.651108,0.608726,0.592303,...,2.024834,1.747299,1.142935,1.369644,1.864149,1.549667,1.927517,1.47812,1.535946,3.0
2018-03-31,0.521764,0.869347,0.010139,0.596032,0.693144,0.760139,0.651636,0.569627,0.491639,0.461311,...,0.753816,0.545173,0.270525,0.209055,0.685618,0.602638,0.790758,0.633171,1.140952,3.0
2018-04-30,0.487646,1.387764,0.235522,0.886829,0.903219,1.022085,0.913095,0.834301,0.778178,0.800129,...,0.254878,0.02255,0.011356,-0.01565,0.307122,0.248923,0.254523,0.074657,0.611116,3.0


In [29]:
test_data.mean()

(-70.0, 70.0)          0.491036
(-70.0, 72.5)          0.525477
(-70.0, 75.0)          0.389227
(-70.0, 77.5)          0.471907
(-70.0, 80.0)          0.466594
                         ...   
(60.0, 292.5)          0.626914
(60.0, 295.0)          0.693336
(60.0, 297.5)          0.548658
(60.0, 300.0)          0.846967
NNI_cat3_categories    2.920000
Length: 4930, dtype: float64

### transform the test data first (standardization using the scaler 'fitted' on the training data)

In [30]:
x = scaler.transform(test_data.drop(f"{region_name}_{target_type}", axis=1))

### now project the scaled GCM outputs of the test set onto the principal components fitted on the training data

In [31]:
x = pca.transform(x)

In [32]:
x.shape

(25, 34)

In [33]:
train_data.shape

(287, 35)

### make a dataframe using the indices of test data and columns of train_data 

In [34]:
test_data_df = pd.DataFrame(x, index=test_data.index, columns=train_data.columns[:-1])

In [35]:
test_data_df.shape

(25, 34)

### and now add the target column 

In [36]:
test_data.columns[-1]

'NNI_cat3_categories'

In [37]:
test_data_df.loc[:,f"{region_name}_{target_type}"] = test_data.iloc[:,-1]

### Now apply the `predict` method of the `lightgbm` model to the test data

In [38]:
y_hat = lg.predict(test_data_df.iloc[:,:-1])

In [39]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.56

In [40]:
blend_all = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5,0.0,0.4778,0.4875,0.4838,0.1935
1,0.5,0.0,0.4481,0.4697,0.4726,0.2095
2,0.5,0.0,0.4185,0.4433,0.4599,0.1903
3,0.45,0.0,0.4407,0.435,0.4409,0.1506
4,0.65,0.0,0.637,0.635,0.6407,0.451
5,0.75,0.0,0.7593,0.775,0.7429,0.6212
6,0.65,0.0,0.5963,0.635,0.6401,0.4488
7,0.55,0.0,0.5222,0.545,0.5453,0.2913
8,0.65,0.0,0.5778,0.6282,0.6132,0.4262
9,0.65,0.0,0.5595,0.6215,0.6098,0.4167


In [41]:
y_hat = blend_all.predict(test_data_df.iloc[:,:-1])

In [42]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.72

### third approach: blend the 3 best models (note that catboost cannot be used)

In [43]:
df_compare_models.data.loc[0:5,:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.6,0.0,0.5384,0.5777,0.5602,0.3457
1,Quadratic Discriminant Analysis,0.59,0.0,0.5326,0.5745,0.5632,0.3413
2,CatBoost Classifier,0.58,0.0,0.5274,0.5531,0.5519,0.321
3,Extreme Gradient Boosting,0.57,0.0,0.5327,0.5613,0.5531,0.3187
4,Light Gradient Boosting Machine,0.57,0.0,0.5283,0.5477,0.5414,0.3176
5,SVM - Linear Kernel,0.565,0.0,0.5272,0.5666,0.5452,0.3193


### first step: create models 

In [44]:
lg = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.55,0.0,0.4963,0.4275,0.4667,0.2469
1,0.5,0.0,0.4481,0.4625,0.4667,0.1968
2,0.4,0.0,0.2963,0.2571,0.313,0.0
3,0.45,0.0,0.4407,0.4562,0.4511,0.1603
4,0.7,0.0,0.6741,0.7269,0.6828,0.5102
5,0.65,0.0,0.6259,0.6723,0.659,0.4636
6,0.5,0.0,0.4481,0.4958,0.4659,0.2
7,0.4,0.0,0.4037,0.45,0.4,0.1176
8,0.75,0.0,0.7111,0.7348,0.7202,0.6047
9,0.8,0.0,0.7381,0.7932,0.7885,0.6761


In [45]:
et = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6,0.0,0.5704,0.6,0.5867,0.3548
1,0.5,0.0,0.4481,0.4625,0.4667,0.1968
2,0.5,0.0,0.4,0.3365,0.3865,0.1489
3,0.45,0.0,0.4111,0.43,0.4388,0.1373
4,0.75,0.0,0.7111,0.7375,0.719,0.5984
5,0.8,0.0,0.7852,0.8167,0.8053,0.6947
6,0.55,0.0,0.4852,0.59,0.5095,0.2405
7,0.65,0.0,0.5852,0.635,0.6218,0.4444
8,0.6,0.0,0.5111,0.67,0.5408,0.322
9,0.6,0.0,0.4762,0.4993,0.5272,0.3191


In [46]:
xg = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.5,0.0,0.4593,0.475,0.4614,0.1701
1,0.45,0.0,0.4111,0.4205,0.43,0.127
2,0.55,0.0,0.4667,0.4198,0.449,0.234
3,0.35,0.0,0.337,0.3946,0.3679,0.0152
4,0.65,0.0,0.637,0.635,0.6407,0.451
5,0.6,0.0,0.5593,0.6036,0.5996,0.3774
6,0.6,0.0,0.5593,0.6625,0.6,0.3574
7,0.55,0.0,0.5519,0.5786,0.5562,0.3233
8,0.65,0.0,0.6074,0.635,0.6353,0.4531
9,0.8,0.0,0.7381,0.7883,0.7906,0.6787


In [47]:
blend_specific = blend_models(estimator_list = [et,xg, lg])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.55,0.0,0.4963,0.4275,0.4667,0.2469
1,0.5,0.0,0.4481,0.4625,0.4667,0.1968
2,0.55,0.0,0.4667,0.395,0.4486,0.2469
3,0.4,0.0,0.3741,0.4241,0.4109,0.0805
4,0.7,0.0,0.6741,0.6773,0.68,0.5238
5,0.65,0.0,0.6259,0.6723,0.659,0.4636
6,0.5,0.0,0.4481,0.4958,0.4659,0.2
7,0.6,0.0,0.5889,0.6595,0.6165,0.4007
8,0.7,0.0,0.6444,0.6682,0.675,0.5238
9,0.8,0.0,0.7381,0.7932,0.7885,0.6761


In [48]:
y_hat = blend_specific.predict(test_data_df.iloc[:,:-1])

In [49]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.72