# Importing needed packages and Main Parameters

In [16]:
# Standard:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import os
import sys
import pkg_resources
import json
import pickle
import psutil

# From forecast_utils
# The directory `levels` steps above the current working directory is appended to
# sys.path before the import below (presumably that is where forecast_utils lives).
levels = 2
sys.path.append(
    os.path.abspath(os.path.join(os.curdir, *([os.pardir] * levels)))
)
import forecast_utils as utils

# Prophet Package:
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.make_holidays import make_holidays_df
from prophet.utilities import regressor_coefficients
from prophet.plot import plot_cross_validation_metric
from prophet.serialize import model_from_dict, model_to_dict, model_from_json, model_to_json, SIMPLE_ATTRIBUTES

# Hyperopt:
from hyperopt import fmin, tpe, hp, anneal, Trials, SparkTrials, STATUS_OK, space_eval

# MLflow:
import mlflow
import mlflow.prophet
from mlflow.models.signature import infer_signature
# Local file-based MLflow tracking URI, built from the root path exposed by forecast_utils.
mlflow_path = f"file://{os.path.join(utils.root_path, 'mlruns')}"


# Definitions:
# Only the first letter of `granularity` is passed to the forecast_utils helpers.
granularity = "Monthly"
category = "Non-domestic"

# Do not show Warnings
import warnings
warnings.filterwarnings("ignore")

# Suppress log messages from cmdstanpy and prophet (only errors and above are shown)
import logging
for noisy_logger_name in ('cmdstanpy', 'prophet'):
    logging.getLogger(noisy_logger_name).setLevel(logging.ERROR)

# Show up to 100 columns when displaying DataFrames.
pd.set_option("display.max_columns", 100)

### Pre-processing

In [17]:
# ## Data Pre-processing:
# - Demand.
# - Regressors: GDP, COVID Abnormalities, Yearly Seasonality, Monthly Weekdays
# - Extra regressors that could be added:
#   - Hours of daylight a month.
#   - Number of weekends for Domestic Data.
#   - etc.
df = utils.make_complete_input_df(category=category, granularity=granularity[0])

### Creating future DataFrame for forecasting with the utilised Regressors, Seasonality, etc.
future_df = utils.make_forecast_df(category=category, granularity=granularity[0])


## Modelling
### Dummy Model Initialisation for easier Hyper-parameters' optimisation spaces
# Every column other than 'ds', 'sector' and 'y' is registered as a multiplicative regressor.
model = utils.default_prophet_model(granularity=granularity[0], category=category)
for regressor_column in df.columns.drop(['ds', 'sector', 'y']):
    model.add_regressor(regressor_column, mode='multiplicative')

### HyperOpt Search Distributions, Search Spaces, Search Algorithm, etc.
# HyperOpt: local aliases for the tuning definitions exposed by forecast_utils.
monthly_base_tuning = utils.monthly_base_tuning
monthly_base_hyperopt = utils.monthly_base_hyperopt
monthly_non_dom_regressors = utils.monthly_non_dom_regressors

# Creating the HyperOpt Space for the tuning [We could use log instead of uniform?]
space, space_dict = utils.create_hyperopt_space(model)

# HyperOpt search algorithm, evaluation budget, trial store and random seed.
algo = tpe.suggest
max_evals = 1
trials = Trials()
random_state = 42
# The generator handed to fmin is seeded with the same value that is logged to MLflow.
rstate = np.random.default_rng(random_state)


### HyperOpt's Prophet Optimization Function

In [18]:
### Function for HyperOpt to run.
# - The function fits a model with the hyper-parameters sampled by HyperOpt and scores it with the cross-validation metric (MAPE here).

def hyperopt_tuning(hyperopt):
    """Objective function evaluated by HyperOpt for one sampled point.

    Builds a fresh default Prophet model, registers the regressor columns of the
    module-level ``df``, overrides the hyper-parameters with the sampled values,
    fits the model on ``df`` and scores it through ``utils.scoring_outputs``.

    Args:
        hyperopt: Mapping of sampled values. It is indexed by every name in
            ``monthly_base_tuning`` and by every extra-regressor name of the model.

    Returns:
        dict: HyperOpt result holding the ``loss`` to minimise, ``STATUS_OK``, and
        metadata about the trial (cross-validation details, the serialised model,
        run settings and a timestamp).
    """
    # Rebinds the module-level `model`: after a search it refers to the model of
    # the most recently evaluated trial.
    global model

    # Fresh model with the variables we are using: holidays, regressors, seasonalities.
    model = utils.default_prophet_model(granularity=granularity[0], category=category)
    for column in df.columns.drop(['ds', 'sector', 'y']):
        model.add_regressor(column, mode='multiplicative')

    # Base hyper-parameters: each name listed in monthly_base_tuning is set on the model.
    for base_name in monthly_base_tuning:
        setattr(model, base_name, hyperopt[base_name])

    # Extra hyper-parameters: the prior scale of every regressor registered on the model.
    for regressor_name, regressor_settings in model.extra_regressors.items():
        regressor_settings['prior_scale'] = hyperopt[regressor_name]

    # Fit with the sampled values; the search algorithm itself is chosen by the caller of fmin.
    model.fit(df)

    # Scoring outputs for this iteration of the optimisation.
    _cv_frame, metrics_frame, score = utils.scoring_outputs(model)

    # The first column and the last two columns of the metrics frame are left out of `metrics`.
    metric_names = metrics_frame.columns[1:-2]
    metric_values = metrics_frame.values[0][1:-2]

    return dict(
        loss=score,
        status=STATUS_OK,
        cutoff_points=metrics_frame['cutoff_points'][0],
        horizon_days=metrics_frame['horizon'][0].days,
        metrics=dict(zip(metric_names, metric_values)),
        prophet_dict=model_to_dict(model),
        category=category,
        granularity=granularity,
        hopt_algorithm=algo.__module__,
        max_iters=max_evals,
        random_state=random_state,
        training_datetime=datetime.datetime.today(),
    )

### Running Optimization and tracking with mlflow
- In this case the loss score is very high because of the artificial seasonalities that were added to account for COVID.
- With a cut-off point at "2021", the model cannot fit the extra seasonalities that are applied to it.
- Still, the model obtains the posterior distributions by fitting the whole dataset.

In [19]:
# Experiments, Runs, Model values
mlflow.set_tracking_uri(mlflow_path)
exp_name = f"GB_{granularity}"
experiment = mlflow.set_experiment(exp_name)
experiment_id = experiment.experiment_id

# Run and model names share the "<category>_<granularity>" suffix (lower case, '-' -> '_').
# The run name is prefixed with the current timestamp, truncated to the second.
name_suffix = category.lower().replace('-', '_') + '_' + granularity.lower()
run_name = datetime.datetime.today().isoformat(sep='_', timespec='seconds') + '_' + name_suffix
model_name = 'model' + '_' + name_suffix

# Datasets
# Short category tag used in the dataset names: lower case, "estic" removed, '-' -> '_'.
category_short_string = category.lower().replace("estic","").replace("-","_")
dataset_1 = mlflow.data.from_pandas(
    df,
    source=utils.demand_path,
    name=f"{category_short_string}_{granularity.lower()}_train_dataset",
    targets="y",
)
dataset_2 = mlflow.data.from_pandas(
    future_df,
    source=utils.demand_path,
    name=f"{category_short_string}_{granularity.lower()}_test_dataset",
)
# dataset_3 only exists when the current `model` carries a holidays table.
if model.holidays is not None:
    dataset_3 = mlflow.data.from_pandas(model.holidays, name="holidays")

mlflow.enable_system_metrics_logging()

# Running the HyperOpt Function with mlflow as a context manager.
# Every file produced for logging is written to a throw-away scratch directory, so
# files that already live in the working directory are never logged or deleted by
# mistake, and the scratch files are removed even when the run fails half-way.
import tempfile

with mlflow.start_run(run_name=run_name, experiment_id=experiment_id, log_system_metrics=True):

    # In case we are wondering the paths for mlflow
    tracking_uri = mlflow.get_tracking_uri()  # for the log_metric, log_param
    artifact_uri = mlflow.get_artifact_uri()  # for the log_artifact
    run_id = mlflow.active_run().info.run_id  # the run_id

    # Hyperparameter Optimisation: Getting the "best" prior distributions
    hyperopt_optimization = fmin(fn=hyperopt_tuning,
                                 space=space,
                                 algo=algo,
                                 max_evals=max_evals,
                                 trials=trials,
                                 rstate=rstate)

    # Retrieving the best model from the optimization by hyperopt + forecast.
    best_result = trials.best_trial['result']
    best_model = model_from_dict(best_result['prophet_dict'])
    best_model.granularity = granularity[0]
    best_model.forecast = best_model.predict(future_df)
    best_model.full_forecast = utils.full_forecast_df(best_model, future=future_df, forecast=best_model.forecast)
    best_model.reduced_forecast = utils.reduced_forecast_df(best_model, best_model.full_forecast)

    has_holidays = best_model.holidays is not None
    has_seasonalities = len(best_model.seasonalities) > 0

    ### MLflow:

    ## MLflow: Basic definition: (In/Out)
    # Signature (in/out schema):
    signature = infer_signature(df, best_model.reduced_forecast)

    with tempfile.TemporaryDirectory() as scratch_dir:

        def scratch_path(filename):
            """Return the location of `filename` inside the scratch directory."""
            return os.path.join(scratch_dir, filename)

        def dump_pickle(obj, filename):
            """Pickle `obj` into the scratch directory under `filename`."""
            with open(scratch_path(filename), 'wb') as file:
                pickle.dump(obj, file)

        ## MLflow: Getting variables to save, archive:
        # Input DataFrames:
        df.to_parquet(scratch_path('input_df.parquet'))
        future_df.to_parquet(scratch_path('future_df.parquet'))

        # Input holidays if any:
        if has_holidays:
            best_model.holidays.to_parquet(scratch_path('input_holidays.parquet'))

        # Output DataFrames:
        best_model.full_forecast.to_parquet(scratch_path('forecast_full_df.parquet'))
        best_model.reduced_forecast.to_parquet(scratch_path('forecast_reduced_df.parquet'))

        # Getting the serialized best_model:
        with open(scratch_path('prophet_model.json'), 'w') as json_file:
            json_file.write(model_to_json(best_model))

        # Parameters of the best_model:
        dump_pickle(best_model.params, 'prophet_model_parameters.pkl')

        # Coefficients and values of the extra_regressors:
        regressor_coefficients(best_model).to_parquet(scratch_path('regressors_coefficients.parquet'))

        # Best model's extra_regressors:
        dump_pickle(best_model.extra_regressors, 'prophet_model_extra_regressors.pkl')

        # Best model's fourier series seasonalities:
        if has_seasonalities:
            dump_pickle(best_model.seasonalities, 'prophet_model_fourier_seasonalities.pkl')

        # Best model's pre-processed (by Prophet) inputs:
        preprocess = best_model.preprocess(df)
        dump_pickle(preprocess, 'prophet_model_preprocessed_params.pkl')

        # Initial hyperparams:
        dump_pickle(best_model.calculate_initial_params(num_total_regressors=preprocess.K),
                    'prophet_model_initial_hyperparams.pkl')

        # The main inputs into the Prophet object (three attributes are deliberately left out):
        skipped_attributes = ('component_modes', 'country_holidays', 'logistic_floor')
        params_prophet = {
            attribute: getattr(best_model, attribute)
            for attribute in SIMPLE_ATTRIBUTES
            if attribute not in skipped_attributes
        }

        # Saving HyperOpt outputs (to_parquet returns None, so nothing is kept in a variable):
        utils.hyper_params_df(trials).to_parquet(scratch_path("hyperopt_run.parquet"))

        # Saving graphs: file name -> plotting helper that is called with the best model.
        figure_builders = {
            "forecast_changepoints.png": utils.plot_forecast_changepoints,
            "base_components.png": utils.plot_base_components,
            "errors_timeseries.png": utils.plot_noise_ts,
            "errors_histogram.png": utils.plot_error_hist,
            "variables_in_time.png": utils.plot_inputs_in_time,
            "regressors_in_time.png": utils.plot_regressors,
            "regressors_impact_on_trend.png": utils.plot_regressors_linearity,
        }
        for figure_name, build_figure in figure_builders.items():
            build_figure(best_model).savefig(scratch_path(figure_name))
            plt.close()

        ## MLflow: Logging variables (Parameters, Metrics, Artifacts):
        # Inputs (datasets):
        mlflow.log_input(dataset_1, context="training")
        mlflow.log_input(dataset_2, context="testing")
        if has_holidays:
            # NOTE(review): dataset_3 is only defined when the pre-search `model` had holidays.
            mlflow.log_input(dataset_3, context="holidays")

        # Parameters used when running the model:
        mlflow.log_param("horizon_days", best_result['horizon_days'])
        mlflow.log_param("cutoff_points", best_result['cutoff_points'])
        mlflow.log_param("hopt_algorithm", algo.__module__)
        mlflow.log_param("hopt_space", space_dict)

        for key, value in params_prophet.items():
            mlflow.log_param(key, value)

        if has_seasonalities:
            mlflow.log_param("seasonalities", best_model.seasonalities)
        mlflow.log_param("regressors", best_model.extra_regressors)

        # Logging metrics: these are numeric variables depicting scores of the model.
        for metric, value in best_result['metrics'].items():
            mlflow.log_metric(f"cross_validation_{metric}", value)
        mlflow.log_metric("cross_validation_mape", best_result['loss'])
        mlflow.log_metric("hopt_max_iters", len(trials.trials))
        mlflow.log_metric("hopt_random_state", random_state)

        # Logging artifacts: these are files like json, tables, pictures, etc.
        # Each entry is (file in the scratch directory, destination folder in the run).
        artifacts = [
            ("input_df.parquet", "datasets"),
            ("future_df.parquet", "datasets"),
        ]
        if has_holidays:
            artifacts.append(("input_holidays.parquet", "datasets"))
        artifacts += [
            ("prophet_model.json", "main_model_outputs"),
            ("prophet_model_parameters.pkl", "main_model_outputs"),
            ("forecast_full_df.parquet", "forecast_outputs"),
            ("forecast_reduced_df.parquet", "forecast_outputs"),
            ("prophet_model_extra_regressors.pkl", "added_model_outputs"),
            ("regressors_coefficients.parquet", "added_model_outputs"),
        ]
        if has_seasonalities:
            artifacts.append(("prophet_model_fourier_seasonalities.pkl", "added_model_outputs"))
        artifacts += [
            ("prophet_model_preprocessed_params.pkl", "interim_model_outputs"),
            ("prophet_model_initial_hyperparams.pkl", "interim_model_outputs"),
            ("hyperopt_run.parquet", "hyperopt"),
        ]
        # Only the figures created above are logged, not every PNG lying around.
        artifacts += [(figure_name, "graphs") for figure_name in figure_builders]

        for filename, destination in artifacts:
            mlflow.log_artifact(local_path=scratch_path(filename), artifact_path=destination)

        ## MLflow: Getting the requirements.txt for logging the model
        # Get all installed package names and versions from pkg_resources.working_set
        installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
        # Keep only the loaded modules whose import name matches an installed package key.
        # The module names are snapshotted first so the scan is not affected by late imports.
        imported_modules = {
            module_name: installed_packages[module_name]
            for module_name in list(sys.modules)
            if module_name in installed_packages
        }
        # Convert to DataFrame for easy reading or export
        modules_df = pd.DataFrame(list(imported_modules.items()), columns=["Package", "Version"])
        modules_df['formatted'] = modules_df["Package"] + "==" + modules_df["Version"]
        requirements_file = scratch_path("requirements.txt")
        modules_df['formatted'].to_csv(requirements_file, index=False, header=False)

        mlflow.prophet.log_model(best_model,
                                 artifact_path=model_name,
                                 signature=signature,
                                 input_example=df,
                                 pip_requirements=requirements_file)

    # The scratch directory and everything in it is removed when the inner `with` exits.



2024/10/11 20:32:14 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.65s/trial, best loss: 0.055307066322500084]


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/11 20:32:18 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/11 20:32:18 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [20]:
# Persist both forecast tables of the best model under <outputs_folder>/monthly.
monthly_outputs = os.path.join(utils.outputs_folder, 'monthly')
reduced_forecast_file = os.path.join(monthly_outputs, f"{category}_{granularity}_reduced_forecast.parquet")
full_forecast_file = os.path.join(monthly_outputs, f"{category}_{granularity}_full_forecast.parquet")
best_model.reduced_forecast.to_parquet(reduced_forecast_file)
best_model.full_forecast.to_parquet(full_forecast_file)