# Exercise 7: Auto Time Series

In this section, we __use the Auto TS capability__ to explore multiple pipelines and select the best one. The selected pipeline is then used to compute predictions.

The HANA ML Python Client documentation can be browsed at [Python Machine Learning Client for SAP HANA](https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_3_QRC/en-US/hana_ml.html)

### Import the required Python packages

In [None]:
# Run the shared setup notebook inside this kernel. Later cells use `conn`,
# which is not defined in this notebook — presumably it is created there (verify).
%run ./02-setup.ipynb

In [None]:
import uuid

from hana_ml.algorithms.pal.auto_ml import AutomaticTimeSeries
from hana_ml.dataframe import ConnectionContext
from hana_ml.model_storage import ModelStorage
from hana_ml.visualizers.automl_progress import PipelineProgressStatusMonitor
from hana_ml.visualizers.automl_report import BestPipelineReport
from hana_ml.visualizers.unified_report import UnifiedReport
import matplotlib.pyplot as plt

### Check the setup and connect to the database

## Prepare data for processing

We want to reuse the same set of data from the previous session

In [None]:
# Source table with the fuel price history.
hdf_fuleprices = conn.table('FUEL_PRICES')

# To predict the last 7 days, the data is split at 2022-09-23: the timestamp is
# defined once and reused for the training and the ground-truth filter.
split_ts = '2022-09-23 00:00:00.000'

# training data: everything strictly before the split timestamp
hdf_train_rnk = hdf_fuleprices.filter(f"\"date\" < '{split_ts}'")

# ground truth: everything from the split timestamp onwards
hdf_test_groundtruth_rnk = hdf_fuleprices.filter(f"\"date\" >= '{split_ts}'")

# test dataset: same rows as the ground truth, but the target column 'e5'
# is replaced by the constant 0 and cast to DOUBLE
hdf_test_rnk = (
    hdf_test_groundtruth_rnk
    .drop(['e5'])
    .add_constant('e5', 0)
    .cast('e5', 'DOUBLE')
)

# optional preview: hdf_test_groundtruth_rnk.sort('date').head(3).collect()
print('Number of forecast training rows', hdf_train_rnk.count())
print('Number of forecast testing rows', hdf_test_rnk.count())

# Persist the three datasets as tables; force=True is passed to every save call.
hdf_test_rnk.save('TEST_RNK', force=True)
hdf_test_groundtruth_rnk.save('TEST_GROUNDTRUTH_RNK', force=True)
hdf_train_rnk.save('TRAIN_RNK', force=True)

In [None]:
# Re-bind the three dataframes to the tables TEST_RNK, TEST_GROUNDTRUTH_RNK
# and TRAIN_RNK (looked up in this order).
hdf_test_rnk, hdf_test_groundtruth_rnk, hdf_train_rnk = (
    conn.table(table_name)
    for table_name in ('TEST_RNK', 'TEST_GROUNDTRUTH_RNK', 'TRAIN_RNK')
)

For this exercise, we use the data of a single fuel station. You can run multiple tests simply by changing the station ID.

In [None]:
# UUID of the fuel station analysed in this exercise; the following cell
# filters train, test and ground-truth data on this value.
station='018e8f3e-ae2f-40bc-89c1-bc3fe20eb462'

Filter train, test and ground truth data for the selected station

In [None]:

# Predicate selecting the rows of the chosen station; shared by all three filters.
station_predicate = f"\"station_uuid\" = '{station}'"

# training data of the station, ordered ascending by date, without the station column
hdf_train_1s = (
    hdf_train_rnk
    .filter(station_predicate)
    .sort('date', desc=False)
    .drop('station_uuid')
)

# ground truth of the station, prepared the same way
hdf_test_groundtruth_1s = (
    hdf_test_groundtruth_rnk
    .filter(station_predicate)
    .sort('date', desc=False)
    .drop('station_uuid')
)

# test data of the station; in addition to the station column, the target 'e5' is removed
hdf_test_1s = (
    hdf_test_rnk
    .filter(station_predicate)
    .sort('date', desc=False)
    .drop('station_uuid')
    .drop('e5')
)

## Instantiate AutoML `auto_ts` object

Parametrization of `auto_ts` instance: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2024_1_QRC/en-US/pal/topics/genetic_optimization_automl.html#control-parameters

In [None]:
# Unique id for this run; it is handed to AutomaticTimeSeries as progress_indicator_id.
progress_id = f"automl_{uuid.uuid1()}"

# AutomaticTimeSeries instance with the settings used in this exercise.
# NOTE(review): early_stop=0 presumably disables early stopping — confirm in the
# control-parameter documentation.
auto_ts = AutomaticTimeSeries(
    generations=10,
    population_size=30,
    offspring_size=30,
    early_stop=0,
    progress_indicator_id=progress_id,
)

In [None]:
# enable_workload_class
# Assign the workload class "PAL_AUTOML_WORKLOAD" to this AutoML object.
# NOTE(review): assumes this workload class already exists in the database — confirm in the setup.
auto_ts.enable_workload_class(workload_class_name="PAL_AUTOML_WORKLOAD")

Display the current configuration

In [None]:
# Show the configuration currently held by auto_ts (before any operator is removed).
auto_ts.display_config_dict()

### Get an overview of pipeline operators and parameters

In [None]:
from hana_ml.algorithms.pal import auto_ml

In [None]:
# Fetch the pipeline operator overview via the current connection and collect it
# to the client for display.
auto_ml.get_pipeline_info(conn).collect()

In [None]:
from IPython.display import HTML

# Restrict the operator overview to the 'AMTSA' entry and render it as an HTML table.
amtsa_info = auto_ml.get_pipeline_info(conn).filter("NAME='AMTSA'").collect()
HTML(amtsa_info.to_html())

In [None]:
# Turn the SCHEMA string of the 'AMTSA' operator (first row) into a Python object for inspection.
# NOTE(review): eval() executes whatever text the database returns. If SCHEMA is plain
# JSON or a Python literal, json.loads / ast.literal_eval would be the safer choice —
# confirm the format before switching.
eval(auto_ml.get_pipeline_info(conn).filter("NAME='AMTSA'").collect()["SCHEMA"][0])

### Experiment with [pipeline setting](https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2023_3_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.auto_ml.AutomaticClassification.html#hana_ml.algorithms.pal.auto_ml.AutomaticClassification.delete_config_dict)

In [None]:
# Disabled examples of removing a whole operator category from the configuration.
# NOTE(review): `auto_c` is not defined in this notebook; the lines appear to stem from
# a classification example — use `auto_ts` and confirm the category names before enabling.
# auto_c.delete_config_dict(category="Resampler")
# auto_c.delete_config_dict(category="Transformer")

In [None]:
# Remove operators from the configuration so that the search is limited to the
# remaining ones. 'AMTSA' is intentionally left in (its entry is commented out).
# NOTE(review): 'MLR_TimeSeries' used to be listed twice; it is removed once here.
# Check whether a different operator was meant by the second entry.
for operator_name in (
    # 'AMTSA',
    'BSTS',
    'ARIMA',
    'AutoExpSm',
    'BrownExpSm',
    'HGBT_TimeSeries',
    'MLR_TimeSeries',
):
    auto_ts.delete_config_dict(operator_name=operator_name)

In [None]:
# Show the configuration again to verify which operators are left after the removals.
auto_ts.display_config_dict()

## Best pipeline determination

### Start Progress Status Monitor

In [None]:
# Create the progress monitor for the AutoML object `auto_ts`.
# NOTE(review): the monitor is meant to get a connection context different from the
# one used by AutoML, but this cell passes the shared `conn` — confirm that the
# installed hana_ml version supports reusing the connection here.
progress_status_monitor = PipelineProgressStatusMonitor(
    automatic_obj=auto_ts,
    connection_context=conn,
)

In [None]:
# Start the progress monitor before the training is launched.
progress_status_monitor.start()

# training: fit on the single-station series with "date" as key column and
# "e5" as the series to forecast. Errors raised by fit() propagate unchanged,
# so no try/except wrapper that merely re-raises is needed.
auto_ts.fit(data=hdf_train_1s, key="date", endog="e5")

### Get a report for the Best Pipeline

In [None]:
# Build the best-pipeline report for the fitted auto_ts and show it as an iframe in the notebook.
BestPipelineReport(auto_ts).generate_notebook_iframe()

In [None]:
# Display the best pipeline held by the fitted auto_ts.
auto_ts.get_best_pipeline()

In [None]:
# First five rows of the first dataframe in auto_ts.model_.
auto_ts.model_[0].head(5).collect()

In [None]:
# Full content of the second dataframe in auto_ts.model_.
auto_ts.model_[1].collect()

## Save the model in SAP HANA db

In [None]:
# Model storage bound to the current connection.
ms = ModelStorage(conn)
# Name and version are set on the model object before saving; the same two
# attributes are used later in this notebook to look the model up again.
auto_ts.name = 'AutoTS Model'
auto_ts.version = 1
# NOTE(review): if_exists='replace' presumably overwrites an already stored model
# with the same name and version — confirm against the ModelStorage documentation.
ms.save_model(model=auto_ts, if_exists='replace')

In [None]:
# List the models currently known to the model storage.
ms.list_models()

In [None]:
import json

# Look up the stored model by name and version and parse the content of its JSON
# column (first row). json.loads is used instead of eval(): it understands the JSON
# literals null/false/true natively, so no helper globals are required, and text
# coming from the database is never executed as Python code.
json.loads(ms.list_models(name=auto_ts.name, version=auto_ts.version).at[0, 'JSON'])

In [None]:
# Print the collected content of auto_ts.best_pipeline_.
print(auto_ts.best_pipeline_.collect())

## Call prediction

In [None]:

# Predict for the test period of the selected station, using "date" as key column.
# hdf_test_1s is the test data with 'station_uuid' and the target 'e5' removed.
fc_result = auto_ts.predict(data=hdf_test_1s, key="date")


Look at the executed statement on the DB

In [None]:
# Print the statement reported by auto_ts for the predict call.
print(auto_ts.get_predict_execute_statement())

Look at the forecast result data

In [None]:
# Collect the complete forecast result to the client and display it.
display(fc_result.collect())

In [None]:
# Bring the forecast result into the shape used for the comparison with the
# actual values, one step per line:
forecast_1s = fc_result.sort('ID')                                   # order by the result key
forecast_1s = forecast_1s.rename_columns({'ID': 'date'})             # key column -> "date"
forecast_1s = forecast_1s.rename_columns({'SCORES': 'PREDICTED'})    # forecast value -> "PREDICTED"
forecast_1s = forecast_1s.cast({"date": "TIMESTAMP"})                # "date" as TIMESTAMP
forecast_1s = forecast_1s.cast({"PREDICTED": "DOUBLE"})              # "PREDICTED" as DOUBLE

# Preview the first ten forecast rows.
display(forecast_1s.head(10).collect())

In [None]:
# Ground truth of the selected station with the target column 'e5' renamed to 'ACTUAL'.
actual_1s=hdf_test_groundtruth_1s.rename_columns({'e5': 'ACTUAL'})

In [None]:
# Preview the first ten rows of the actual values.
display(actual_1s.head(10).collect())

### Forecast-Lineplot for the forecasted week

In [None]:

from hana_ml.visualizers.visualizer_base import forecast_line_plot

# Both dataframes are indexed by "date" before they are handed to the plot.
actual_by_date = actual_1s.set_index("date")
forecast_by_date = forecast_1s.set_index("date")

# Line plot of actual vs. predicted values with plotly disabled; the actual
# series is drawn with zorder 1 and an alpha of 0.5.
ax = forecast_line_plot(
    actual_data=actual_by_date,
    pred_data=forecast_by_date,
    actual_option={'zorder': 1, 'alpha': 0.5},
    enable_plotly=False,
    max_xticklabels=10,
)

# Title and y-axis label, then render the figure.
ax.set_title('Fuel Price Actual and Forecast', pad=20)
plt.ylabel('Gas Price e5 [€]')
plt.show()


In order to __evaluate the forecast accuracy__, we need to compare the predicted forecast values (from the predict results) with the actual ground-truth e5 values of the test period, which we join together using __dataframe join methods__.

In [None]:

# Join actual and predicted values on their "date" column into a new dataframe
# with the columns DATE, ACTUAL and PREDICTED.
hdf_comparison = actual_1s.alias('A').join(
    forecast_1s.alias('F'),
    'A."date" = F."date"',
    select=[('A."date"', 'DATE'), 'ACTUAL', 'PREDICTED'],
)
# Sort once by DATE; applying the same sort a second time adds nothing.
hdf_comparison = hdf_comparison.sort('DATE')

# Preview the first ten rows of the comparison.
display(hdf_comparison.head(10).collect())

In [None]:
# Calculate the forecast accuracy measures
from hana_ml.algorithms.pal.tsa.accuracy_measure import accuracy_measure

# Metrics requested from accuracy_measure.
metrics = ['mse', 'rmse', 'mpe', 'et', 'mad', 'mase', 'wmape', 'smape', 'mape']

# Only the ACTUAL and PREDICTED columns of the comparison are passed in.
comparison_values = hdf_comparison.select(['ACTUAL', 'PREDICTED'])
amres = accuracy_measure(data=comparison_values, evaluation_metric=metrics)

amres.collect()
   