In [1]:
# Parameters 

region_name = 'NNI'
var_name = 'TMEAN'
target_type = 'cat3_categories'
GCM = 'NCEP_CFSv2'
# GCM = 'All'
standardized = False

### first, check which Python executable is running this notebook

In [2]:
import sys 
print(sys.executable)

/home/nicolasf/anaconda3/envs/ML/bin/python


In [3]:
%matplotlib inline

In [4]:
import pathlib

In [5]:
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import numpy as np 
import pandas as pd

In [8]:
from pycaret import datasets

In [9]:
from pycaret.classification import *

In [10]:
# Make the project-local `ml4seas` modules (evaluation, GCM) importable.
# The path is built from HOME, as is done for `dpath`, instead of hard-coding
# one user's home directory.
sys.path.append(str(HOME / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas'))

In [11]:
from evaluation import calc_accuracy_sco
from GCM import prepare_data_CSV_to_CARET

In [12]:
dpath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [13]:
list(dpath.glob("*.csv"))

[PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_training_set.csv')]

In [14]:
# The two CSV families differ only by their filename prefix: 'GCMs_std' is read
# when `standardized` is True, 'GCMs' otherwise. Selecting the prefix once
# avoids duplicating the read_csv calls in both branches.
csv_prefix = 'GCMs_std' if standardized else 'GCMs'

# First column is used as the index and parsed as dates.
train_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_training_set.csv', index_col=0, parse_dates=True)
test_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_test_set.csv', index_col=0, parse_dates=True)

### get the training data 

In [15]:
# Prepare the training frame for PyCaret with scaling and PCA enabled; the call
# result is unpacked into the processed frame, `GCMs_name_train`, and the
# `scaler` / `pca` objects that are reused later on the test data.
# NOTE: `train_data` is rebound here, so re-running this cell without first
# re-running the CSV-loading cell passes the already-processed frame back in.
train_data, GCMs_name_train, scaler, pca = prepare_data_CSV_to_CARET(train_data, GCM=GCM, region_name=region_name, target_type=target_type, scaling=True, doPCA=True)

In [16]:
train_data.shape

(420, 35)

In [17]:
train_data.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
       'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19',
       'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28',
       'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'NNI_cat3_categories'],
      dtype='object')

### get the test data  

In [18]:
# Prepare the test frame with scaling and PCA switched off; the last two
# return values are discarded.
test_data, GCMs_name_test, _, _ = prepare_data_CSV_to_CARET(
    test_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=False,
    doPCA=False,
)

In [19]:
test_data.shape

(33, 4930)

### the scaler and PCA objects fitted on the training data are available, so the test data can be transformed with them (this avoids information leakage)

In [20]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
pca

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

### set up the PYCARET experiment, use only the training set (cross validation will be used to evaluate the models)

### Note: on the first run, the `silent` argument is left at its default (False) so that the user can check the data type inferred for each column; once the data types are confirmed to be correctly interpreted, `silent` can be set to True

In [22]:
silent = False

In [23]:
# Initialise the PyCaret classification experiment on the training frame only.
# Scaling and PCA were already requested from prepare_data_CSV_to_CARET, so
# PyCaret's own normalize / transformation / pca options are switched off.
exp_clf = setup(
    data=train_data,
    target=f'{region_name}_{target_type}',
    session_id=123,
    normalize=False,
    transformation=False,
    pca=False,
    silent=silent,
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(420, 35)"
4,Missing Values,False
5,Numeric Features,34
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### quick comparison of the models; if `turbo` is set to False, all models (including those that are expensive to train) are considered

In [24]:
df_compare_models = compare_models(turbo=False)

In [25]:
df_compare_models.data

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,MLP Classifier,0.6141,0.0,0.6031,0.6086,0.6027,0.4167
1,Extra Trees Classifier,0.594,0.0,0.5801,0.5641,0.5697,0.3823
2,CatBoost Classifier,0.5533,0.0,0.5403,0.5403,0.5386,0.324
3,Quadratic Discriminant Analysis,0.5501,0.0,0.5423,0.5474,0.5405,0.3229
4,K Neighbors Classifier,0.5353,0.0,0.5201,0.5171,0.514,0.2921
5,Light Gradient Boosting Machine,0.5153,0.0,0.505,0.5112,0.5066,0.2672
6,Linear Discriminant Analysis,0.5086,0.0,0.4977,0.4895,0.4934,0.2579
7,Ridge Classifier,0.5085,0.0,0.4967,0.4912,0.4896,0.2565
8,Random Forest Classifier,0.5025,0.0,0.488,0.4906,0.4822,0.2402
9,Logistic Regression,0.5018,0.0,0.4893,0.4859,0.486,0.2474


In [26]:
df_compare_models.data.loc[[0],:]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,MLP Classifier,0.6141,0.0,0.6031,0.6086,0.6027,0.4167


In [27]:
methods = pd.read_csv('./methods.csv', index_col=0)

In [28]:
# Full name of the top-ranked model (row label 0 of the comparison table),
# kept as a one-element Series so it can be used as a label list on `methods`.
top_ranked_name = df_compare_models.data.loc[[0], 'Model']

# Translate the full name into the abbreviation stored in the 'Abbr' column.
best_model = methods.loc[top_ranked_name, 'Abbr'].values[0]

In [29]:
print(f"the best model is {best_model}")

the best model is mlp


In [30]:
best_model_m = create_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7333,0.0,0.7283,0.7333,0.7333,0.5987
1,0.5333,0.0,0.5269,0.5437,0.5348,0.2953
2,0.6333,0.0,0.6239,0.6378,0.6327,0.4482
3,0.5517,0.0,0.5253,0.5252,0.5215,0.312
4,0.4483,0.0,0.4478,0.449,0.4448,0.1744
5,0.8276,0.0,0.8148,0.8641,0.8047,0.7383
6,0.5172,0.0,0.4882,0.4571,0.4785,0.2645
7,0.6207,0.0,0.6061,0.6124,0.6121,0.4242
8,0.5517,0.0,0.5522,0.5415,0.5435,0.3268
9,0.7241,0.0,0.7172,0.7216,0.7216,0.5842


### first pathway: tune the best model 

In [33]:
tuned_best_model = tune_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7667,0.0,0.766,0.7601,0.7543,0.6482
1,0.5333,0.0,0.5343,0.5262,0.5224,0.3012
2,0.6667,0.0,0.6579,0.6625,0.6634,0.4966
3,0.5517,0.0,0.5253,0.5115,0.5206,0.3145
4,0.5172,0.0,0.5084,0.5124,0.5121,0.275
5,0.7586,0.0,0.7407,0.7635,0.7398,0.6322
6,0.4138,0.0,0.3973,0.398,0.4053,0.1133
7,0.7586,0.0,0.7475,0.7709,0.7517,0.6322
8,0.6207,0.0,0.6195,0.6207,0.6207,0.4283
9,0.6552,0.0,0.6566,0.6674,0.6575,0.484


In [32]:
# tuned_best_model = tune_model(best_model, ensemble=True, method='Bagging')

### finalize the tuned best model

In [None]:
finalized_tune_best_model = finalize_model(tuned_best_model)

### Now prediction on the test data 

In [None]:
test_data.shape

In [None]:
test_data.head()

### transform the test data first (standardization using the scaler 'fitted' on the training data)

In [None]:
x = scaler.transform(test_data.iloc[:,:-1]) 

### now perform PCA on scaled GCM outputs in the test set 

In [None]:
# Project the scaled test features with the `pca` object obtained from the
# training data. NOTE: `x` is rebound, so re-running this cell on its own
# passes the already-projected array back into pca.transform.
x = pca.transform(x)

In [None]:
x.shape

In [None]:
train_data.shape

### make a dataframe using the indices of test data and columns of train_data 

In [None]:
test_data_df = pd.DataFrame(x, index=test_data.index, columns=train_data.columns[:-1])

In [None]:
test_data_df.shape

### and now add the target column 

In [None]:
test_data.columns[-1]

In [None]:
test_data_df.loc[:,f"{region_name}_{target_type}"] = test_data.loc[:,f"{region_name}_{target_type}"]

### Now score the best tuned learner on the test data 

In [None]:
finalized_tune_best_model.score(test_data_df.iloc[:,:-1], test_data.iloc[:,-1])

### Now blend all the models 

In [None]:
blend_all = blend_models(method='soft', turbo=False)

In [None]:
finalized_blend_all = finalize_model(blend_all)

In [None]:
finalized_blend_all.score(test_data_df.iloc[:,:-1], test_data.iloc[:,-1])

### third approach: blend the N best models (note that catboost cannot be used in the blend)

In [None]:
df_compare_models.data.loc[0:5,:]

In [None]:
df_compare_models.data.shape

### build the list of best N models 

In [None]:
n_models = 3

In [None]:
list_models = []
blend_models_names = []

In [None]:
tune = False 

In [None]:
# Walk down the compare_models() ranking and build the first `n_models` models
# that are not catboost. The loop also stops once `i` is no longer a row label
# of the ranking table, so it cannot run past the end (KeyError) when fewer
# than `n_models` usable models are available.
i = 0
while len(list_models) < n_models and i in df_compare_models.data.index:
    # map the full model name of rank i to its abbreviation
    mod_name = methods.loc[df_compare_models.data.loc[i, 'Model'], 'Abbr']
    print(f"model ranked {i+1} is {mod_name} ====================\n")
    if mod_name != 'catboost':
        print(f"building model: {mod_name}")
        if tune:
            mod = tune_model(mod_name, ensemble=True, method='Bagging')
        else:
            mod = create_model(mod_name)
        list_models.append(mod)
        blend_models_names.append(mod_name)
    else:
        print("skipping catboost ... ")
    i += 1

# Make a short list explicit instead of silently blending fewer models.
if len(list_models) < n_models:
    print(f"only {len(list_models)} of the {n_models} requested models could be built")

### soft voting enables the use of the `predict_proba` method for the blended ensemble

#### names of the models in the blend

In [None]:
print(f"models in the blend: {', '.join(blend_models_names)}")

In [None]:
blend_specific = blend_models(estimator_list = list_models, method='soft', turbo=False)

### finalize the blend specific model 

In [None]:
finalized_blend_specific = finalize_model(blend_specific)

In [None]:
finalized_blend_specific.score(test_data_df.iloc[:,:-1], test_data.iloc[:,-1])

### select the best model overall (here the finalized tuned best model) and predict the class probabilities for the test set

In [None]:
y_hat_probs = finalized_tune_best_model.predict_proba(test_data_df.iloc[:,:-1])

In [None]:
df_verif = test_data.iloc[:,[-1]]

In [None]:
df_verif.columns = [0]

In [None]:
y_hat_probs_df = pd.DataFrame(y_hat_probs, index=df_verif.index, columns=[1,2,3])

In [None]:
# Rescale the predicted probabilities by 100 (fractions -> percentages).
# NOTE: this rebinds the same name, so re-running the cell multiplies by 100
# again; re-run the cell that builds `y_hat_probs_df` first.
y_hat_probs_df = y_hat_probs_df * 100.

In [None]:
df_verif = pd.concat([df_verif, y_hat_probs_df], axis=1)

In [None]:
df_verif.head()

In [None]:
# Cast the observed-category column (column 0) to integer.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
# The column is assigned as a whole (rather than via `.loc[:, 0] = ...`) so the
# new dtype replaces the old one instead of being cast back on recent pandas.
df_verif[0] = df_verif[0].astype(int)

In [None]:
sco_acc = calc_accuracy_sco(df_verif, tolerance=True)

In [None]:
sco_acc

### if we have trained over all the GCMs, the instances are repeated in time (one per GCM for each date); we deal with that by computing the mode (~= majority vote) of the predictions for each date

In [None]:
if GCM == 'All':
    # One majority-vote value per unique date, in order of first appearance.
    maj_ = []
    for date in df_verif.index.unique():
        # Selecting with a list always returns a DataFrame, even when a date has
        # a single row; a bare label would return a Series (or a scalar for one
        # column), giving a wrong count and breaking `.mode()`.
        rows = df_verif.loc[[date], :]
        print(f"{date:%Y-%m} number of available GCMs: {len(rows)}")
        # NOTE(review): column 4 is not created by any cell visible in this
        # notebook (df_verif is built with columns 0-3); presumably it is added
        # by calc_accuracy_sco -- confirm before relying on this branch.
        maj_.append(rows.loc[:, 4].mode()[0])

    maj = np.array(maj_)

    # Keep the first row of each date so the observed category (column 0)
    # lines up one-to-one with `maj`.
    df = df_verif.loc[~df_verif.index.duplicated(keep='first')]

    # Fraction of dates where the majority vote equals the observed category.
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so the value is stored and printed explicitly.
    maj_acc = (df.loc[:,0].values == maj).sum() / len(df)
    print(f"accuracy of the majority vote: {maj_acc:.4f}")

In [None]:
import ppscore as pps

In [None]:
train_data.columns

In [None]:
# Predictive Power Score of each feature column (all columns but the last)
# with respect to the target column. The target name is derived from the
# notebook parameters, as in the setup() call, instead of being hard-coded to
# one region / target type.
target_col = f"{region_name}_{target_type}"
ppscore = []
for pc in train_data.columns[:-1]: 
    ppscore.append(pps.score(train_data, pc, target_col)['ppscore']) 