In [1]:
# Parameters 
# Experiment configuration: the values below are read by the later cells of this notebook.

# region name; combined with `target_type` to form the target column name
# f"{region_name}_{target_type}" (here 'WSI_cat3_categories')
region_name = 'WSI'
# variable name; selects which '..._{var_name}_training_set.csv' / '..._{var_name}_test_set.csv' files are read
var_name = 'TMEAN'
# second part of the target column name (see `region_name`)
target_type = 'cat3_categories'
# GCM passed to `prepare_data_CSV_to_CARET`; presumably 'All' selects every available GCM -- TODO confirm
GCM = 'ECMWF'
# GCM = 'All'
# if True the 'GCMs_std_...' CSV files are read, otherwise the 'GCMs_...' ones
standardized = False
# whether or not to shuffle the training data, especially recommended if GCM == 'All'
# (the same flag also triggers the shuffling of the transformed test set further down)
shuffle_train = True 

### check first what the executable is 

In [2]:
import sys 
print(sys.executable)

/home/nicolasf/anaconda3/envs/pycaret/bin/python


In [3]:
%matplotlib inline

In [4]:
import pathlib

In [5]:
HOME = pathlib.Path.home()

In [6]:
from matplotlib import pyplot as plt

In [7]:
import numpy as np 
import pandas as pd

In [8]:
import pycaret

In [9]:
pycaret.__version__

'2.0.0'

In [10]:
from pycaret import datasets

In [11]:
from pycaret.classification import *

### list of models

In [12]:
df_models = models()

In [13]:
df_models

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors.KNeighborsClassifier,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model.SGDClassifier,True
rbfsvm,SVM - Radial Kernel,sklearn.svm.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process.GPC,False
mlp,MLP Classifier,sklearn.neural_network.MLPClassifier,False
ridge,Ridge Classifier,sklearn.linear_model.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble.RandomForestClassifier,True


### add the local utilities 

In [14]:
# make the local `ml4seas` utilities importable; the path is built from HOME (like `dpath`
# further down) instead of a hard-coded absolute path, so it follows the current user's home
sys.path.append(str(HOME / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas'))

In [15]:
from evaluation import calc_accuracy_sco
from GCM import prepare_data_CSV_to_CARET

xesmf is not installed, using method `interp_like` for interpolation


In [16]:
dpath = HOME / 'research' / 'Smart_Ideas' / 'outputs' / 'CSVs'

In [17]:
list(dpath.glob("*.csv"))

[PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_std_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_training_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_TMEAN_test_set.csv'),
 PosixPath('/home/nicolasf/research/Smart_Ideas/outputs/CSVs/GCMs_and_targets_cat3_and_anomalies_RAIN_test_set.csv')]

In [18]:
# the standardized and non-standardized CSV files only differ by their 'GCMs_std' / 'GCMs'
# prefix, so pick the prefix once and read the training and test sets with identical options
csv_prefix = 'GCMs_std' if standardized else 'GCMs'

train_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_training_set.csv', index_col=0, parse_dates=True)
test_data = pd.read_csv(dpath / f'{csv_prefix}_and_targets_cat3_and_anomalies_{var_name}_test_set.csv', index_col=0, parse_dates=True)

### get the training data 

### first run and return just the scaler and the pca fitted on the training data 

In [19]:
# first pass with scaling and PCA switched on: only the scaler and PCA objects
# (3rd and 4th return values) are kept, the first two return values are discarded
_, _, scaler, pca = prepare_data_CSV_to_CARET(
    train_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=True,
    doPCA=True,
)

### then return the raw data 

In [20]:
# second pass with scaling and PCA switched off: keep the data and the GCM names,
# discard the last two return values (NOTE: this rebinds `train_data`)
train_data, GCMs_name_train, _, _ = prepare_data_CSV_to_CARET(
    train_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=False,
    doPCA=False,
)

### check the number of missing months if any 

In [21]:
# only checked for a single GCM: compare the number of rows with the number of
# month-ends between the first and the last index values
if GCM != 'All':
    expected_months = pd.date_range(start=train_data.index[0], end=train_data.index[-1], freq='M')
    record_is_complete = len(expected_months) == len(train_data)
    print("Length of record consistent" if record_is_complete else "Length of record not consistent, check the data")

Length of record consistent


### whether or not to shuffle the training data, especially recommended if GCM == 'All'

In [22]:
shuffle_train

True

In [23]:
if shuffle_train: 
    train_data = train_data.sample(frac=1., random_state=42, axis=0)

### get the test data, no scaling or PCA 

In [24]:
# same call as for the training data, scaling and PCA switched off; only the data and
# the GCM names are kept (NOTE: this rebinds `test_data`)
test_data, GCMs_name_test, _, _ = prepare_data_CSV_to_CARET(
    test_data,
    GCM=GCM,
    region_name=region_name,
    target_type=target_type,
    scaling=False,
    doPCA=False,
)

In [25]:
test_data.shape

(33, 4930)

### the scalers and pca objects trained on the training data are available, for transformation of the test data (avoid information leakage)

In [26]:
scaler

StandardScaler()

In [27]:
pca

PCA(n_components=0.9)

### set up the test data set 

In [28]:
test_data.shape

(33, 4930)

In [29]:
test_data.head()

Unnamed: 0,"(-70.0, 70.0)","(-70.0, 72.5)","(-70.0, 75.0)","(-70.0, 77.5)","(-70.0, 80.0)","(-70.0, 82.5)","(-70.0, 85.0)","(-70.0, 87.5)","(-70.0, 90.0)","(-70.0, 92.5)",...,"(60.0, 280.0)","(60.0, 282.5)","(60.0, 285.0)","(60.0, 287.5)","(60.0, 290.0)","(60.0, 292.5)","(60.0, 295.0)","(60.0, 297.5)","(60.0, 300.0)",WSI_cat3_categories
2017-04-30,0.350412,0.609415,0.402756,0.494441,0.545367,0.577342,0.595652,0.551609,0.509443,0.442834,...,-0.187773,-0.346976,-0.421439,-0.445218,-0.466354,-0.518676,-0.528607,-0.416464,-0.623176,2.0
2017-05-31,0.167277,0.057477,-0.028681,0.093749,0.113651,0.133755,0.125145,0.153855,0.207044,0.253551,...,0.641738,0.679107,0.567584,0.559127,0.376234,0.25422,0.37983,0.270595,0.05505,2.0
2017-06-30,-0.037492,-0.200014,-0.077527,0.049841,0.124295,0.196644,0.237064,0.281379,0.293671,0.282525,...,-0.026262,-0.112167,-0.145194,-0.129408,-0.066313,-0.177321,-0.259453,-0.087841,-0.179672,2.0
2017-07-31,0.24102,0.126914,0.265512,0.330046,0.428405,0.540412,0.607662,0.652604,0.647708,0.65447,...,0.183818,0.04867,-0.025598,0.03396,-0.108763,-0.028818,0.147046,0.062604,0.05951,2.0
2017-08-31,0.347808,0.32761,0.384274,0.453615,0.468264,0.500675,0.520086,0.53589,0.554246,0.594686,...,-0.206735,-0.482101,-0.51986,-0.337142,-0.322366,-0.332399,-0.375129,-0.291196,-0.264078,3.0


### transform the test data first (standardization using the scaler 'fitted' on the training data)

In [30]:
x = scaler.transform(test_data.iloc[:,:-1]) 

### now project the scaled GCM outputs of the test set onto the principal components fitted on the training data

In [31]:
x = pca.transform(x)

In [32]:
x.shape

(33, 33)

In [33]:
train_data.shape

(286, 4930)

In [34]:
len(pca.components_)

33

### make a dataframe using the index of the test data and one `Component_<i>` column per principal component

In [35]:
# one column per principal component, labelled Component_1 ... Component_N,
# rows indexed like the test data
n_components = len(pca.components_)
component_labels = [f'Component_{i}' for i in range(1, n_components + 1)]
test_data_df = pd.DataFrame(x, index=test_data.index, columns=component_labels)

In [36]:
test_data_df.shape

(33, 33)

### and now add the target column 

In [37]:
test_data.columns[-1]

'WSI_cat3_categories'

In [38]:
test_data_df.loc[:,f"{region_name}_{target_type}"] = test_data.loc[:,f"{region_name}_{target_type}"]

In [39]:
if shuffle_train: 
    test_data_df = test_data_df.sample(frac=1., random_state=42, axis=0)

### set up the PYCARET experiment, use only the training set (cross validation will be used to evaluate the models)

### Note that the first time around, the argument `silent` should be left to its default (False) so that the user can verify the data type inferred for each column; once the data types have been checked and are correctly interpreted, `silent` can be set to True

In [40]:
# silent = False
silent = True 

In [41]:
log_name = f"{GCM}_pred_{region_name}_{var_name}_{target_type}_target_std_{str(standardized)}_suffle_train_{str(shuffle_train)}"

In [42]:
print(f"experiment on MLFlow in {log_name}")

experiment on MLFlow in ECMWF_pred_WSI_TMEAN_cat3_categories_target_std_False_suffle_train_True


In [43]:
# PyCaret experiment on the training data only; normalization and a linear PCA are
# requested, with as many components as the PCA object obtained earlier in the notebook
exp_clf = setup(
    data=train_data,
    target=f'{region_name}_{target_type}',
    session_id=123,
    log_experiment=True,
    experiment_name=log_name,
    normalize=True,
    transformation=False,
    pca=True,
    pca_method='linear',
    pca_components=len(pca.components_),
    silent=silent,
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(286, 4930)"
4,Missing Values,False
5,Numeric Features,4929
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### quick comparison of the models, if `turbo` is set to False, then all models (including the ones expensive to train) are considered 

### select the best 5 

In [44]:
top5 = compare_models(turbo=False, n_select=5) 

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.58,0.0,0.5536,0.5877,0.5618,0.3516,0.3652,4.0877
1,Extra Trees Classifier,0.565,0.0,0.5338,0.576,0.5421,0.3217,0.3343,0.2153
2,Light Gradient Boosting Machine,0.56,0.0,0.5381,0.5679,0.549,0.3264,0.3372,0.0852
3,Extreme Gradient Boosting,0.555,0.0,0.543,0.5731,0.549,0.3235,0.3327,0.1243
4,MLP Classifier,0.555,0.0,0.5407,0.5714,0.5486,0.3231,0.3313,1.1797
5,K Neighbors Classifier,0.545,0.0,0.5195,0.5923,0.5303,0.2924,0.3049,0.0069
6,Quadratic Discriminant Analysis,0.545,0.0,0.5185,0.558,0.5246,0.2922,0.3029,0.0078
7,Gradient Boosting Classifier,0.53,0.0,0.516,0.5388,0.5215,0.2821,0.2881,0.43
8,Logistic Regression,0.505,0.0,0.4917,0.5135,0.4967,0.2451,0.2538,0.0386
9,Linear Discriminant Analysis,0.5,0.0,0.4828,0.5028,0.4897,0.2357,0.2438,0.0095


In [45]:
table_top5 = pull()

In [46]:
len(top5)

5

In [None]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.65,0.0,0.625,0.6311,0.6385,0.4656,0.4675
1,0.5,0.0,0.4861,0.5,0.4949,0.2308,0.2335
2,0.5,0.0,0.4595,0.4733,0.4792,0.2188,0.2224
3,0.45,0.0,0.4369,0.4882,0.4603,0.1822,0.187
4,0.4,0.0,0.4012,0.4881,0.4146,0.1111,0.1163
5,0.65,0.0,0.6536,0.6217,0.6202,0.4656,0.4786
6,0.35,0.0,0.3464,0.3083,0.3064,0.0152,0.0165
7,0.5,0.0,0.4595,0.4917,0.4939,0.2337,0.2346
8,0.6,0.0,0.6119,0.58,0.5833,0.3846,0.3907
9,0.55,0.0,0.5262,0.5417,0.5435,0.305,0.3062


IntProgress(value=0, description='Processing: ', max=16)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


In [None]:
table_tuned_top5 = pull()

### score the models on the test data 

In [None]:
def score(model): 
    """
    Score a model on the PCA-transformed test set.

    Features AND labels are both taken from `test_data_df`: that frame is shuffled
    when `shuffle_train` is True while `test_data` is not, and `model.score` pairs
    rows and labels by position, so reading the labels from `test_data` would
    evaluate the model against misaligned labels.

    Parameters
    ----------
    model : object exposing a `score(X, y)` method

    Returns
    -------
    the value returned by `model.score(X, y)`, or `np.nan` if the evaluation fails
    """
    try: 
        features = test_data_df.iloc[:, :-1]
        # last column of `test_data_df` is the target column added after the PCA transform
        labels = test_data_df.iloc[:, -1]
        return model.score(features, labels)
    except Exception as err: 
        # best effort: report the failure instead of hiding it, and keep going
        print(f"could not score {type(model).__name__} on the test set: {err!r}")
        return np.nan

In [None]:
# `score` already returns NaN when a model cannot be evaluated, so no additional
# error handling is needed around the call
scores = [score(model) for model in tuned_top5]

In [None]:
scores

### finalize (i.e. retrain on the whole training set)

In [None]:
tuned_top5_finalized = list(map(finalize_model, tuned_top5))

In [None]:
list(map(score, tuned_top5_finalized))

### blend all models in the library

Blending models is a method of ensembling which uses consensus among estimators to generate final predictions. The idea behind blending is to combine different machine learning algorithms and use a majority vote, or the average predicted probabilities in the case of classification, to predict the final outcome. Blending models in PyCaret is as simple as calling `blend_models`. This function can be used to blend specific trained models, passed using the `estimator_list` parameter of `blend_models`; if no list is passed, it will use all the models in the model library. In the case of classification, the `method` parameter can be set to 'soft' or 'hard', where 'soft' uses predicted probabilities for voting and 'hard' uses predicted labels. This function returns a table with the k-fold cross-validated scores of common evaluation metrics along with the trained model object. The evaluation metrics used are: Accuracy, AUC, Recall, Precision, F1, Kappa and MCC.

In [None]:
blender = blend_models(method='soft')

In [None]:
table_blender = pull()

### score the blender on the test data 

In [None]:
score(blender)

------

### Now re-tune the best model 

In [None]:
best_model = tune_model(top5[0], choose_better=True)

In [None]:
table_best_model = pull()

### score over the test set 

In [None]:
score(best_model)

### retrain the model over the whole training set 

In [None]:
retrained_best_model = finalize_model(best_model)

In [None]:
score(retrained_best_model)

### Now save the notebook in HTML and rename to reflect the parameters of the experiment 

In [None]:
notebook_name='pycaret_v2_wPCA.ipynb'

In [None]:
!jupyter nbconvert --to html {notebook_name}

In [None]:
savepath = pathlib.Path('./saved_notebooks/')

In [None]:
# create the output directory (and any missing parent); nothing happens if it already exists
savepath.mkdir(parents=True, exist_ok=True)

In [None]:
output_name = savepath.joinpath(f"pycaret_classification_{log_name}.html")

In [None]:
# the HTML export is expected next to the notebook, with the same stem: derive its path
# from `notebook_name` instead of repeating the file name, then move it to `output_name`
pathlib.Path(notebook_name).with_suffix('.html').rename(output_name)