# MULTIVARIATE TIME SERIES MODELING
# Historical series of agriculture and pasture land area for cities in Pará, Brazil
---
# 0. Importing Modules

In [1]:
import os
import json
import pandas as pd
import numpy as np; np.random.seed(999)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings; warnings.filterwarnings('ignore')
import tools # Custom modules
from time_series_modeling import (
    model, model_selection,
    series_model, sequence_scorer
)

---
# 1. Processing data

In [2]:
# 1.1 Loading processed data
data = pd.read_csv('data/clean/data.csv') # clean provided data
series = tools.preprocess.load_series(path='data/clean/series/') # dict of pandas series
X = pd.read_csv('data/clean/series.csv', index_col=0) # dataframe of series as columns
# 1.2 Defining target variable name
target = 'area'
# 1.3 Extracting target values as pandas series indexed by integer encoded years
# Excluding series with fewer than n_min=3 values
lab_series = tools.preprocess.get_lab_series(series, data, target, index_col='year', n_min=3)
keys = list(lab_series.keys())
# 1.4 Extracting target values again, this time together with the 'index' column,
# so that each observation in 'lab_ind_series' can be referenced individually
# Excluding series with fewer than n_min=3 values
# NOTE(review): this comment used to say n_min=2 while the call passes n_min=3;
# it now follows the code - confirm which threshold is intended.
lab_ind_series = tools.preprocess.get_lab_indexed_series(series, data, cols=['index', target], index_col='year', n_min=3)
ind_keys = list(lab_ind_series.keys())
# 1.5 Product and product type category values
# NOTE(review): 'prodtypes' is not passed to any call in this cell - confirm whether
# get_ctgrs_indexes (1.7) is expected to receive it.
prodtypes = ['permanent', 'temporary', 'pasture']
prods = [
    'Rice', 'Beans', 'Cassava', 'Corn', 'Soy', 'Sorghum', 'Cocoa',
    'Palm oil', 'Açaí', 'Livestock', 'Others-temporary', 'Others-permanent'
]
# 1.6 Extracting indexes of each product
prod_indexes = tools.category_index.get_ctgr_combs_indexes(data, prods)
# 1.7 Extracting indexes of each product type
prodtype_indexes = tools.category_index.get_ctgrs_indexes(data)

Done! Loaded 1107 files.


---
# 2. Choosing models

In [3]:
# List of selected regressors
regressors_names = [
    'RandomForestRegressor', 'BaggingRegressor', 'SVR', 'PoissonRegressor',
    'KernelRidge', 'LinearSVR', 'NuSVR', 'GaussianProcessRegressor',
    'TweedieRegressor', 'PassiveAggressiveRegressor', 'MLPRegressor', 'DummyRegressor'
]
# List of time series specialized models
specialized_models = ['AutoReg', 'ARIMA', 'SARIMAX']

# Random sample of 50 time series keys (drawn without replacement) for modeling
sample_series_keys = list(np.random.choice(keys, 50, replace=False))

# Define analysis params dict
analysis_params = {
    'models_names': regressors_names + specialized_models,
    'keys': sample_series_keys,
    'test_size': 2,
    'min_train_size': 1,
}
# Save (uncomment the two lines below) and load the dict as a json file.
# NOTE: the load below REPLACES the dict defined above with the file's content,
# so edits to the dict only take effect after it has been saved again.
# with open('data/analysis_params.json', 'w', encoding='utf-8') as params_file:
#     json.dump(analysis_params, params_file)
# The context manager closes the file handle as soon as the params are read.
with open('data/analysis_params.json', 'r', encoding='utf-8') as params_file:
    analysis_params = json.load(params_file)

# 3. Predicting and scoring with each model for each time series

### 3.1 Instantiating model class object

In [4]:
# Instantiate the model class over the labeled series, configured by the analysis params
ts_model = model(series=lab_series, **analysis_params)

### 3.2 Grid scoring models for each time series

In [None]:
# Score the configured models for each sampled series key.
# NOTE(review): presumably results are written to 'scores/scores.csv' and partial
# results under 'scores/partial/' (see section 3.3) - confirm against
# time_series_modeling.model.score_keys_models.
keys_models_lc = ts_model.score_keys_models(
    path='scores/',
    filename='scores.csv',
    path_partial='partial/',
    verbose=1,
)

Keys scored: 7/50 - scores.csv


### 3.3 Concatenating partial scores (Optional)

In [None]:
# scrs = model_selection.concat_partial_scrs('scores/partial/')
# scrs.to_csv('scores/scores.csv', index=True)

# 4. Selecting models based on scores and additional parameters

In [None]:
selector = model_selection()

# Read the model selection parameters; the context manager closes the file handle
# as soon as the json content is loaded.
with open('data/model_selection_params.json', 'r', encoding='utf-8') as params_file:
    model_selection_params = json.load(params_file)

# Select models from the saved scores file (scrs=None, so no in-memory scores are
# passed) with 'models/default_analysis/' given as the save path.
keys_models_maps = selector.model_selection_optimization(
    model_selection_params,
    scrs_path='scores/scores.csv', scrs=None,
    save_path='models/default_analysis/', verbose=0,
)

# 5. Predicting multiple sequences

In [None]:
# Instantiate series_model on the index-annotated series (lab_ind_series)
Model = series_model(
    lab_ind_series, keys=None, target=target,
    # model_path='models/default_analysis/criteria-wape.json'
    # Uncomment the line above and pass the 'keys' parameter to predict with a single model
)

### 5.2 Predicting sequences for multiple models

In [None]:
# 'path' is the folder of models (key-model maps) to predict with;
# 'save_path' is the folder where the predictions are saved
models_preds = Model.maps_predictions(
    x_min=2, x_max=48, test_size=2,
    min_train_size=1, max_train_size=50,
    min_test_size=0, dropna=False,
    path='models/default_analysis/', filter_by='.',
    save_path='predictions/default_analysis/'  #  'min_test_size' = 0 and 'dropna' = False (as set above)
)                                                    #  are the settings for making future predictions

# 6. Scoring models' prediction sequences

In [None]:
# Observed target values, with missing entries dropped
Ytrue = data[target].dropna()

# Scorer configured with 'wape' as the criteria and the product type indexes from step 1.7
# NOTE(review): presumably 'indexes' groups the scores by product type - confirm
# against time_series_modeling.sequence_scorer.
seq_scorer = sequence_scorer(
    Ytrue, target, criteria='wape', avg=False,
    metrics=['mae', 'estd', 'max_error', 'mse', 'wape', 'r2', 'mpe'],
    indexes=prodtype_indexes
)

In [None]:
# Score the prediction sequences stored under 'predictions/default_analysis/'
models_scrs = seq_scorer.score_models_prediction_sequences(preds_path='predictions/default_analysis/', filter_by='.')

# 7. Visualizing and comparing model performance

In [None]:
# Plot the sequence scores computed for each model
seq_scorer.plot_models_sequence_scrs(
    models_scrs, figsize=(12, 7),
    # Optional y-axis limits (presumably one (min, max) pair per plot - confirm):
#     ylim=[(0.04, 0.5), (0.115, 0.25), (-0.005, 0.06), (0.002, 0.08)],
    legend=False, leg_loc=(1,0), leg_i=1
)

# 8. Visualizing final model predictions

### 8.1 Reloading selected model predictions

In [None]:
# Locations of the selected model's predictions and of its key-model map
pred_path = 'predictions/default_analysis/default/'
model_path = 'models/default_analysis/default.json'

# Load the saved predictions from the csv folder
yhat_i = sequence_scorer().load_csv_folder(pred_path)
# Load the model map; the context manager closes the file handle once it is read
with open(model_path, encoding='utf-8') as model_file:
    default_model = json.load(model_file)

### 8.2 Random series real versus predicted values (one and two steps predictions)

In [None]:
# Plot real versus predicted values for 12 series in a 3-column grid and
# pass 'plots/' + 'predictions.jpg' as the save location
sequence_scorer().plot_random_pred(
    X,
    default_model,
    yhat_i,
    exclude=[],
    n_series=12,
    n_cols=3,
    figsize=[5, 3],
    X_params={'marker': 'o', 'ms': 6, 'lw': 4},
    X0_params={'marker': 'x', 'ms': 7, 'lw': 3},
    X1_params={'marker': 'x', 'ms': 7, 'lw': 3},
    save_path='plots/',
    filename='predictions.jpg',
)