In [1]:
# Parameters 

# application and variable names; both end up in the MLflow experiment name
application = "Apple_and_Pears"
varname = "TMEAN"
# name of the target column passed to the PyCaret setup
target_type = "cat3"
# identifier of the GCM; alternative value kept below for reference
GCM = "ECMWF"
# GCM = 'All'
# only recorded in the experiment name in the cells visible here
standardized = False
# whether or not to shuffle the training data and the test data, especially recommended if GCM == 'All'
# NOTE(review): shuffle_test is not referenced by any cell visible in this notebook
shuffle_train = True
shuffle_test = True

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
%matplotlib inline

In [5]:
import sys 
import pathlib

In [6]:
import matplotlib.pyplot as plt 

In [7]:
import numpy as np 
import pandas as pd 
import xarray as xr

In [8]:
np.random.seed(42)

In [9]:
import pycaret

In [12]:
pycaret.__file__

'/home/nicolasf/anaconda3/envs/pycaret/lib/python3.8/site-packages/pycaret/__init__.py'

In [None]:
from pycaret.classification import *

In [None]:
df_models = models()

In [None]:
df_models

In [None]:
HOME = pathlib.Path.home()

In [None]:
# Make the local ml4seas code importable. The location is resolved from the current
# user's home directory instead of a hard-coded /home/<user> prefix, and it is only
# appended once so that re-running the cell does not pile up duplicate entries.
ml4seas_dir = str(pathlib.Path.home() / 'research' / 'Smart_Ideas' / 'code' / 'ml4seas')
if ml4seas_dir not in sys.path:
    sys.path.append(ml4seas_dir)

### read the concatenated target and GCM data 

In [None]:
# Build the file name from the parameters cell so that the data being read always matches
# the experiment name logged further down (with the current parameters this resolves to
# ./data/CSV_Apple_and_Pears_TMEAN_cat3_ECMWF.csv).
# The first column is used as the index and parsed as dates.
data = pd.read_csv(f'./data/CSV_{application}_{varname}_{target_type}_{GCM}.csv', index_col=0, parse_dates=True)

In [None]:
data.head()

In [None]:
# training period: every row from the start of the record up to and including December 2010
train_data = data.loc[:'2010-12', :]

In [None]:
train_data.head()

In [None]:
train_data.tail()

In [None]:
# Hold-out period: only the rows dated strictly after the last training row.
# The previous slice started at '2010-04' while the training set runs to '2010-12',
# so April-December 2010 was present in both sets (train/test leakage).
test_data = data.loc[data.index > train_data.index.max(), :]

In [None]:
test_data.head()

### scale and PCA on the training data 

In [None]:
from sklearn.preprocessing import StandardScaler as scaler 
from sklearn.decomposition import PCA

In [None]:
# Instantiate from the real class name: `scaler = scaler()` rebinds the imported class alias
# to an instance, so running the cell a second time fails ("object is not callable").
# The instance keeps the name `scaler` because the following cells call scaler.fit_transform / scaler.transform.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
X = scaler.fit_transform(train_data.iloc[:,:-1])

In [None]:
# Build the estimator from an aliased class import: `PCA = PCA(...)` replaced the class with
# an instance, so running the cell a second time failed ("object is not callable").
# The instance keeps the name `PCA` because later cells call PCA.fit_transform / PCA.transform.
# n_components=0.9 is passed as a fraction (see the scikit-learn PCA documentation for how
# a float in (0, 1) selects the number of components).
from sklearn.decomposition import PCA as PCAEstimator
PCA = PCAEstimator(n_components=0.9, random_state=42)

In [None]:
# Fit the PCA on the scaled training predictors (all columns but the last, which is the target).
# The input is recomputed from train_data with the already-fitted scaler instead of reading `X`
# back: with `X = PCA.fit_transform(X)` a second run of the cell would fit the PCA on the
# principal components produced by the first run.
X = PCA.fit_transform(scaler.transform(train_data.iloc[:,:-1]))

In [None]:
X.shape

In [None]:
# wrap the training PCs in a DataFrame on the original index, columns labelled PC1 ... PCn
df_train = pd.DataFrame(
    X,
    index=train_data.index,
    columns=[f"PC{k + 1}" for k in range(X.shape[1])],
)

In [None]:
# append the last column of train_data (the target) to the right of the PCs
train_target = train_data.iloc[:, -1]
df_train = pd.concat([df_train, train_target], axis="columns")

In [None]:
df_train.columns

### then shuffle 

In [None]:
if shuffle_train:
    # sampling the full fraction of rows with a fixed seed gives a reproducible row permutation
    df_train = df_train.sample(frac=1.0, axis=0, random_state=42)

In [None]:
X_test = scaler.transform(test_data.iloc[:,:-1])

In [None]:
# Project the test predictors with the scaler and PCA fitted on the training data.
# The input is recomputed from test_data instead of reading `X_test` back: with
# `X_test = PCA.transform(X_test)` a second run of the cell would feed the PCs into the PCA again.
X_test = PCA.transform(scaler.transform(test_data.iloc[:,:-1]))

In [None]:
X_test.shape

In [None]:
# wrap the test PCs in a DataFrame on the original index, columns labelled PC1 ... PCn
df_test = pd.DataFrame(
    X_test,
    index=test_data.index,
    columns=[f"PC{k + 1}" for k in range(X_test.shape[1])],
)

In [None]:
# append the last column of test_data (the target) to the right of the PCs
test_target = test_data.iloc[:, -1]
df_test = pd.concat([df_test, test_target], axis="columns")

In [None]:
df_test.head()

In [None]:
df_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler as scaler_pc

In [None]:
# Instantiate from the real class name: `scaler_pc = scaler_pc()` rebinds the imported class
# alias to an instance, so running the cell a second time fails ("object is not callable").
# The instance keeps the name `scaler_pc` because the following cells use it to rescale the PCs.
from sklearn.preprocessing import StandardScaler
scaler_pc = StandardScaler()

In [None]:
PCs_train = scaler_pc.fit_transform(df_train.iloc[:,:-1])

In [None]:
PCs_train

In [None]:
df_train.iloc[:,:-1] = PCs_train

In [None]:
PCs_test = scaler_pc.transform(df_test.iloc[:,:-1])

In [None]:
df_test.iloc[:,:-1] = PCs_test

In [None]:
silent = True

In [None]:
# Name of the experiment, assembled from the parameters cell; it is passed to setup() as
# `experiment_name` and reused for the exported HTML file name at the end of the notebook.
# NOTE(review): "suffle_train" looks like a typo for "shuffle_train"; left untouched here
# because changing it would rename the experiment and the output file.
log_name = f"{GCM}_1981_2010_pred_{application}_{varname}_{target_type}_target_std_{str(standardized)}_suffle_train_{str(shuffle_train)}"

In [None]:
print(f"experiment on MLFlow in {log_name}")

In [None]:
# Initialise the PyCaret classification experiment on the training PCs.
# normalize / transformation are switched off: the PCs were already standardised
# with scaler_pc in the cells above.
exp_clf = setup(
    data=df_train,
    target=target_type,
    session_id=123,
    log_experiment=True,
    experiment_name=log_name,
    normalize=False,
    transformation=False,
    silent=silent,
)

In [None]:
top5 = compare_models(turbo=True, n_select=5) 

In [None]:
table_top5 = pull()

In [None]:
table_top5

The boosting and other tree-ensemble models (CatBoost Classifier, Extreme Gradient Boosting, Extra Trees Classifier, Light Gradient Boosting Machine, Gradient Boosting Classifier) 
all seem to perform quite well.

### Blend all models 

In [None]:
blender = blend_models(method='soft')

### select the CatBoost model (see https://catboost.ai/) 

In [None]:
catboost = create_model('catboost')

### tune the catboost model

In [None]:
tuned_catboost = tune_model(catboost)

In [None]:
def score(model, data=None): 
    """
    Return `model.score(...)` evaluated on a hold-out DataFrame, or NaN if scoring fails.

    Parameters
    ----------
    model : object exposing a `score(X, y)` method
    data : pandas.DataFrame, optional
        Frame whose last column is the target and whose other columns are the
        predictors. Defaults to the notebook-level `df_test`.

    Returns
    -------
    Whatever `model.score` returns, or `np.nan` when scoring raised an exception.
    """
    import warnings

    try: 
        # resolved inside the try so that a missing df_test still yields NaN, as before
        frame = df_test if data is None else data
        return model.score(frame.iloc[:,:-1], frame.iloc[:,-1])
    except Exception as err: 
        # still best-effort, but no longer silent, and KeyboardInterrupt / SystemExit
        # are no longer swallowed as they were by the bare `except:`
        warnings.warn(f"could not score {type(model).__name__}: {err!r}")
        return np.nan

In [None]:
score(catboost)

In [None]:
catboost.predict(df_test.iloc[:,:-1])

In [None]:
df_test.iloc[:,-1]

### using catboost directly from the library 

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent'
)

In [None]:
df_train.iloc[:,-1].plot()

In [13]:
df_test.iloc[:,-1].plot()

NameError: name 'df_test' is not defined

### Now save the notebook in HTML and rename to reflect the parameters of the experiment 

In [None]:
notebook_name='Apple_and_Pears.ipynb'

In [None]:
!jupyter nbconvert --to html {notebook_name}

In [None]:
savepath = pathlib.Path('./saved_notebooks/')

In [None]:
# create the output directory (and any missing parents); a no-op when it already exists
savepath.mkdir(parents=True, exist_ok=True)

In [None]:
output_name = savepath.joinpath(f"pycaret_classification_{log_name}.html")

In [None]:
# nbconvert was run on `notebook_name`, so the HTML to move is derived from that same name
# (same stem, .html suffix) rather than from a second hard-coded file name, which did not
# match the notebook converted above.
pathlib.Path(notebook_name).with_suffix('.html').rename(output_name)