In [1]:
# Parameters 

region_name = 'NNI'
var_name = 'TMEAN'
target_type = 'cat3_categories'
# GCM = 'UKMO'
GCM = 'All'
standardized = False

In [2]:
%matplotlib inline

In [3]:
import os 
import sys 
import pathlib

In [4]:
HOME = pathlib.Path.home()

In [5]:
from matplotlib import pyplot as plt

In [6]:
import numpy as np 
import pandas as pd

In [7]:
from pycaret import datasets

In [8]:
from pycaret.classification import *

In [9]:
# build the path to the local `ml4seas` code from the user's home directory
# (HOME, defined above) rather than hard-coding one user's absolute path
sys.path.append(str(HOME / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas'))

In [10]:
from evaluation import calc_accuracy_sco

In [11]:
dpath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [12]:
list(dpath.glob("*.csv"))

[PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_training_set.csv')]

In [13]:
# the standardized and non-standardized CSV files only differ by their
# file name prefix ('GCMs_std' versus 'GCMs'), so pick the prefix once
csv_prefix = 'GCMs_std' if standardized else 'GCMs'

# first column holds the dates: use it as a (parsed) datetime index
train_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_training_set.csv', index_col=0, parse_dates=True)
test_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_test_set.csv', index_col=0, parse_dates=True)

### small function that prepares the data for ingestion in `PyCARET` 

In [14]:
# %%writefile ../../../../ml4seas/GCM/prepare_data_CSV_to_CARET.py
def prepare_data_CSV_to_CARET(data, gcm_index=-13, GCM='All', region_name=None, target_type=None, scaling=True, doPCA=True, n_components=0.9):
    """
    Prepare the data contained in the processed CSV files for ingestion
    into PyCARET: select the predictors and one target column, then
    optionally standardize the predictors and reduce them with a PCA.

    Arguments
    ---------

    - data : pandas DataFrame read from the processed CSV files; it must
            contain a 'GCM' column and a '{region_name}_{target_type}' column
    - gcm_index : negative index (always) marking the end of the predictor
            columns: the predictors are `data.iloc[:, 0:gcm_index]`.
            Default is -13
    - GCM : 'All' or specific GCM in ['CMCC', 'CanCM4i', 'CanSIPSv2',
                                    'DWD', 'ECMWF', 'GEM_NEMO', 'JMA',
                                    'METEO_FRANCE', 'NASA_GEOSS2S', 'NCEP_CFSv2', 'UKMO']
    - region_name : one of ['NNI','WNI','ENI','NSI','WSI','ESI']
    - target_type : suffix of the target column, e.g. 'cat3_categories'
            ('anomalies' is presumably also available -- check the CSV columns)
    - scaling : if True, standardize the predictors with a StandardScaler
            fitted on `data`
    - doPCA : if True, replace the predictors by their principal components
            (columns 'PC1', 'PC2', ...), using a PCA fitted on `data`
    - n_components : passed as is to `sklearn.decomposition.PCA`

    Return
    ------

    - data : DataFrame holding the (optionally scaled / PCA-reduced)
            predictors followed by the target column, filtered by GCM
            (if not 'All')
    - GCMs_name : one-column DataFrame ('GCM') giving the GCM of each row
    - scaler : the fitted StandardScaler, or None if `scaling` is False
    - pca : the fitted PCA, or None if `doPCA` is False

    Raises
    ------

    - KeyError if the '{region_name}_{target_type}' column is not in `data`

    """

    import pandas as pd

    target_col = f'{region_name}_{target_type}'

    if target_col not in data.columns:
        raise KeyError(f"target column '{target_col}' not found in the data")

    # extract one GCM if not 'All' (boolean mask: safe whatever the GCM name)
    if GCM != 'All':
        data = data.loc[data['GCM'] == GCM]

    # GCM name and associated index
    GCMs_name = data.loc[:, ['GCM']]

    # GCM (features)
    GCM_data = data.iloc[:, 0:gcm_index]

    # target variable
    target = data.loc[:, [target_col]]

    # get the values for X
    X = GCM_data.values

    # scikit-learn is only imported when actually needed
    if scaling:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    else:
        scaler = None

    if doPCA:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=n_components).fit(X)
        X = pca.transform(X)
        columns = [f"PC{i}" for i in range(1, X.shape[1] + 1)]
    else:
        pca = None
        columns = GCM_data.columns

    # cast X (and not the raw values) into a DataFrame, so that the
    # standardization is kept when scaling=True and doPCA=False
    df = pd.DataFrame(X, index=GCM_data.index, columns=columns)

    # add the target variable
    data = pd.concat([df, target], axis=1)

    return data, GCMs_name, scaler, pca

In [15]:
# standardize and PCA-reduce the training set; the fitted `scaler` and `pca`
# objects are kept so that they can be applied to the test set afterwards
train_data, GCMs_name_train, scaler, pca = prepare_data_CSV_to_CARET(
    train_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=True,
    doPCA=True,
)

In [16]:
train_data.shape

(3982, 61)

In [17]:
train_data.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28',
       'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37',
       'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46',
       'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55',
       'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'NNI_cat3_categories'],
      dtype='object')

In [18]:
# no scaling / PCA here: the test set is transformed further down with the
# scaler and the PCA that were fitted on the training set
test_data, GCMs_name_test, _, _ = prepare_data_CSV_to_CARET(
    test_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=False,
    doPCA=False,
)

In [19]:
test_data.shape

(303, 4930)

In [20]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
pca

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

### set up the PYCARET experiment, use only the training set (cross validation will be used to evaluate the models)

### Note that the first time around, the argument `silent` should be left at its default (False) so that the user can verify the data type inferred for each column; once the data types are confirmed to be correctly interpreted, `silent` can be set to True

In [22]:
silent = True

In [23]:
# `normalize` and `pca` are switched off because the training set was already
# standardized and PCA-reduced by `prepare_data_CSV_to_CARET`
exp_clf = setup(
    data=train_data,
    target=f'{region_name}_{target_type}',
    session_id=123,
    normalize=False,
    transformation=False,
    pca=False,
    silent=silent,
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(3982, 61)"
4,Missing Values,False
5,Numeric Features,60
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### quick comparison of the models: if `turbo` is set to False, all models (including the ones that are expensive to train) are considered 

In [24]:
df_compare_models = compare_models(turbo=False)

In [25]:
df_compare_models.data

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,K Neighbors Classifier,0.7811,0.0,0.7732,0.782,0.7805,0.6668
1,MLP Classifier,0.7754,0.0,0.7687,0.7757,0.7749,0.6592
2,CatBoost Classifier,0.7485,0.0,0.738,0.7463,0.7452,0.6167
3,Light Gradient Boosting Machine,0.7374,0.0,0.7274,0.7353,0.7346,0.6003
4,Extra Trees Classifier,0.7305,0.0,0.7147,0.7306,0.724,0.5859
5,Quadratic Discriminant Analysis,0.7079,0.0,0.7094,0.7248,0.7116,0.5616
6,Gradient Boosting Classifier,0.6717,0.0,0.6573,0.6667,0.6663,0.4986
7,Extreme Gradient Boosting,0.6663,0.0,0.6532,0.6613,0.6605,0.4907
8,Random Forest Classifier,0.6329,0.0,0.6116,0.6291,0.6233,0.4342
9,Ridge Classifier,0.6021,0.0,0.5847,0.5915,0.591,0.3898


In [26]:
df_compare_models.data.loc[[0],:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,K Neighbors Classifier,0.7811,0.0,0.7732,0.782,0.7805,0.6668


In [27]:
methods = pd.read_csv('./methods.csv', index_col=0)

In [28]:
best_model = methods.loc[df_compare_models.data.loc[[0],'Model'],'Abbr'].values[0]

In [29]:
print(f"the best model is {best_model}")

the best model is knn


In [30]:
best_model_m = create_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7384,0.0,0.7263,0.7374,0.7373,0.6017
1,0.767,0.0,0.7602,0.7714,0.7675,0.6449
2,0.7778,0.0,0.7725,0.7782,0.7777,0.6621
3,0.7993,0.0,0.7926,0.7998,0.7992,0.6948
4,0.8065,0.0,0.8007,0.8051,0.8048,0.7051
5,0.8244,0.0,0.8154,0.8235,0.8232,0.7324
6,0.767,0.0,0.7595,0.7723,0.7682,0.6464
7,0.7806,0.0,0.7675,0.7765,0.7772,0.6656
8,0.7518,0.0,0.7404,0.7559,0.7507,0.6207
9,0.7986,0.0,0.7968,0.7999,0.7991,0.6944


### first pathway: tune the best model 

In [31]:
tuned_best_model = tune_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7384,0.0,0.7268,0.7389,0.7374,0.6012
1,0.7742,0.0,0.7724,0.7763,0.775,0.6572
2,0.7993,0.0,0.792,0.7985,0.7982,0.6942
3,0.8029,0.0,0.7952,0.8025,0.8019,0.6997
4,0.7921,0.0,0.7805,0.7908,0.7899,0.6826
5,0.7634,0.0,0.7532,0.7627,0.7615,0.6388
6,0.767,0.0,0.7641,0.7727,0.7692,0.6476
7,0.8094,0.0,0.8019,0.8108,0.8094,0.7102
8,0.7914,0.0,0.7803,0.7903,0.7866,0.681
9,0.7734,0.0,0.7728,0.772,0.7725,0.6561


In [32]:
test_data.shape

(303, 4930)

In [33]:
test_data.head()

Unnamed: 0,"(-70.0, 70.0)","(-70.0, 72.5)","(-70.0, 75.0)","(-70.0, 77.5)","(-70.0, 80.0)","(-70.0, 82.5)","(-70.0, 85.0)","(-70.0, 87.5)","(-70.0, 90.0)","(-70.0, 92.5)",...,"(60.0, 280.0)","(60.0, 282.5)","(60.0, 285.0)","(60.0, 287.5)","(60.0, 290.0)","(60.0, 292.5)","(60.0, 295.0)","(60.0, 297.5)","(60.0, 300.0)",NNI_cat3_categories
2017-04-30,0.350412,0.609415,0.402756,0.494441,0.545367,0.577342,0.595652,0.551609,0.509443,0.442834,...,-0.187773,-0.346976,-0.421439,-0.445218,-0.466354,-0.518676,-0.528607,-0.416464,-0.623176,3.0
2017-05-31,0.167277,0.057477,-0.028681,0.093749,0.113651,0.133755,0.125145,0.153855,0.207044,0.253551,...,0.641738,0.679107,0.567584,0.559127,0.376234,0.25422,0.37983,0.270595,0.05505,3.0
2017-06-30,-0.037492,-0.200014,-0.077527,0.049841,0.124295,0.196644,0.237064,0.281379,0.293671,0.282525,...,-0.026262,-0.112167,-0.145194,-0.129408,-0.066313,-0.177321,-0.259453,-0.087841,-0.179672,3.0
2017-07-31,0.24102,0.126914,0.265512,0.330046,0.428405,0.540412,0.607662,0.652604,0.647708,0.65447,...,0.183818,0.04867,-0.025598,0.03396,-0.108763,-0.028818,0.147046,0.062604,0.05951,2.0
2017-08-31,0.347808,0.32761,0.384274,0.453615,0.468264,0.500675,0.520086,0.53589,0.554246,0.594686,...,-0.206735,-0.482101,-0.51986,-0.337142,-0.322366,-0.332399,-0.375129,-0.291196,-0.264078,3.0


In [34]:
test_data.mean()

(-70.0, 70.0)          0.168105
(-70.0, 72.5)          0.090345
(-70.0, 75.0)          0.301747
(-70.0, 77.5)          0.382694
(-70.0, 80.0)          0.204185
                         ...   
(60.0, 292.5)         -0.022449
(60.0, 295.0)         -0.132095
(60.0, 297.5)         -0.130461
(60.0, 300.0)          0.028341
NNI_cat3_categories    2.914191
Length: 4930, dtype: float64

### transform the test data first (standardization using the scaler 'fitted' on the training data)

In [35]:
x = scaler.transform(test_data.drop(f"{region_name}_{target_type}", axis=1))

### now project the scaled GCM outputs of the test set onto the principal components obtained (PCA 'fitted') on the training data 

In [36]:
x = pca.transform(x)

In [37]:
x.shape

(303, 60)

In [38]:
train_data.shape

(3982, 61)

### make a dataframe using the indices of test data and columns of train_data 

In [39]:
test_data_df = pd.DataFrame(x, index=test_data.index, columns=train_data.columns[:-1])

In [40]:
test_data_df.shape

(303, 60)

### and now add the target column 

In [41]:
test_data.columns[-1]

'NNI_cat3_categories'

In [42]:
test_data_df.loc[:,f"{region_name}_{target_type}"] = test_data.iloc[:,-1]

### Now apply the `predict` method of the "best model" to the test data 

In [43]:
y_hat = best_model_m.predict(test_data_df.iloc[:,:-1])

In [44]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.5874587458745875

### Now blend all the models 

In [45]:
blend_all = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6559,0.0,0.6423,0.6547,0.6542,0.4752
1,0.7455,0.0,0.7375,0.7481,0.7455,0.6128
2,0.6738,0.0,0.6621,0.6686,0.6693,0.5048
3,0.6953,0.0,0.6895,0.6934,0.6936,0.5382
4,0.7061,0.0,0.6962,0.7,0.7014,0.5531
5,0.681,0.0,0.6672,0.6772,0.6787,0.5144
6,0.6918,0.0,0.6844,0.6946,0.6928,0.534
7,0.6727,0.0,0.6603,0.6666,0.6685,0.5019
8,0.6835,0.0,0.6717,0.6823,0.678,0.5194
9,0.7122,0.0,0.7062,0.712,0.7102,0.5648


In [46]:
y_hat = blend_all.predict(test_data_df.iloc[:,:-1])

In [47]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

0.6072607260726073

### third approach: blend the N best models (N is set by `n_models` below; note that catboost cannot be used)

In [48]:
df_compare_models.data.loc[0:5,:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,K Neighbors Classifier,0.7811,0.0,0.7732,0.782,0.7805,0.6668
1,MLP Classifier,0.7754,0.0,0.7687,0.7757,0.7749,0.6592
2,CatBoost Classifier,0.7485,0.0,0.738,0.7463,0.7452,0.6167
3,Light Gradient Boosting Machine,0.7374,0.0,0.7274,0.7353,0.7346,0.6003
4,Extra Trees Classifier,0.7305,0.0,0.7147,0.7306,0.724,0.5859
5,Quadratic Discriminant Analysis,0.7079,0.0,0.7094,0.7248,0.7116,0.5616


In [76]:
df_compare_models.data.shape

(18, 7)

### build the list of best N models 

In [102]:
n_models = 5

In [103]:
list_models = []
blend_models_names = []

In [104]:
# walk down the `compare_models` ranking and build models until `n_models`
# have been collected; catboost is skipped (it cannot be used in the blend)
ranking = df_compare_models.data
i = 0
# the second condition stops the loop at the end of the ranking table instead
# of raising a KeyError when fewer than `n_models` usable models are available
while len(list_models) < n_models and i < len(ranking):
    mod_name = methods.loc[ranking.loc[i, 'Model'],'Abbr']
    print(f"model ranked {i+1} is {mod_name} ====================\n")
    if mod_name != 'catboost':
        print(f"building model: {mod_name}")
        mod = create_model(mod_name)
        list_models.append(mod)
        blend_models_names.append(mod_name)
    i += 1

if len(list_models) < n_models:
    print(f"only {len(list_models)} models could be built ({n_models} requested)")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6918,0.0,0.6925,0.7048,0.6948,0.5373
1,0.7061,0.0,0.7102,0.7197,0.7093,0.5579
2,0.7061,0.0,0.7096,0.7316,0.7111,0.5603
3,0.724,0.0,0.7238,0.7396,0.7277,0.5852
4,0.7204,0.0,0.7191,0.73,0.7228,0.5796
5,0.7025,0.0,0.7032,0.7216,0.7073,0.5539
6,0.7097,0.0,0.7144,0.7396,0.7143,0.5656
7,0.6691,0.0,0.6704,0.6882,0.674,0.504
8,0.7338,0.0,0.7312,0.749,0.7381,0.599
9,0.7158,0.0,0.7191,0.7241,0.7163,0.5735


### soft voting enables the use of the `predict_proba` method for the blended ensemble 

#### names of the models in the blend

In [105]:
print(f"models in the blend: {', '.join(blend_models_names)}")

models in the blend: knn, mlp, lightgbm, et, qda


In [None]:
blend_specific = blend_models(estimator_list = list_models, method='soft')

IntProgress(value=0, description='Processing: ', max=14)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7634,0,0.7546,0.766,0.7644,0.641
1,0.8029,0,0.8002,0.808,0.8045,0.7014


In [None]:
y_hat = blend_specific.predict(test_data_df.iloc[:,:-1])

In [None]:
y_hat_probs = blend_specific.predict_proba(test_data_df.iloc[:,:-1])

In [None]:
(test_data.iloc[:,-1].values == y_hat).sum() / len(y_hat)

### Now calculate the majority vote 

### and calculate the "SCO" version of the accuracy over the test period; this requires the probabilities returned by the `predict_proba` method

In [None]:
df_verif = test_data.iloc[:,[-1]]

In [None]:
df_verif.columns = [0]

In [None]:
y_hat_probs_df = pd.DataFrame(y_hat_probs, index=df_verif.index, columns=[1,2,3])

In [None]:
y_hat_probs_df = y_hat_probs_df * 100.

In [None]:
df_verif = pd.concat([df_verif, y_hat_probs_df], axis=1)

In [None]:
# cast the observed category (column 0) to integer; `np.int` was only an alias
# of the builtin `int` and no longer exists in recent NumPy versions.
# `astype` with a column mapping returns a new frame, so the dtype really
# changes (an in-place `.loc[:, 0] = ...` assignment may keep the float dtype)
df_verif = df_verif.astype({0: int})

In [None]:
sco_acc = calc_accuracy_sco(df_verif, tolerance=True)

In [None]:
sco_acc

In [None]:
len(df_verif.index.unique())

In [None]:
y_hat.shape

In [None]:
df_verif.shape

In [None]:
df_verif.loc[:,4] = y_hat

In [None]:
df_verif = df_verif.dropna()

In [None]:
df_verif.index.unique().shape

In [None]:
# majority vote: for each date, take the most frequent predicted category
# (column 4) across the GCMs available for that date
maj_ = []
for date in df_verif.index.unique():
    # selecting with a list ([date]) always returns a Series, even when only
    # one GCM is available for that date (a scalar has no `.mode()` method)
    predictions = df_verif.loc[[date], 4]
    print(f"{date:%Y-%m} number of available GCMs: {len(predictions)}")
    modes = predictions.mode()
    print(modes)
    # `mode()` returns the tied values sorted: in case of a tie the
    # smallest category is retained
    maj_.append(modes[0])

In [None]:
maj = np.array(maj_)

In [None]:
len(maj)

In [None]:
df = df_verif.loc[~df_verif.index.duplicated(keep='first')]

In [None]:
(df.loc[:,0].values == maj).sum() / len(df)