In [1]:
import numpy as np
import tensorflow as tf

from scada_forecast.preprocess import read_humidity, read_temperature, read_scada
from scada_forecast.preprocess import get_lag_features, merge_dataframes, add_calendar_features, prepare_data

from scada_forecast.model import train_model, inference

# Set TensorFlow's v1-compat logger threshold to ERROR so that lower-severity
# log messages are not emitted into the notebook output.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
# Samples per hour at a 5-minute sampling interval.
hour_steps = 60 // 5

# Feature-group switches forwarded to the preprocessing helpers in later cells.
use_temp = False
use_humidity = True
use_lunar = True
use_holiday = False

## Train

In [3]:
# Read the SCADA load workbook and build lag features from it (clip_df=False).
df_scada = get_lag_features(
    read_scada('data/scada/Dữ liệu SCADA Phụ tải 26.08.2020.xlsx'),
    hour_steps,
    clip_df=False,
)

# Weather inputs are optional: a disabled source is passed to the merge as None.
df_humidity = read_humidity('data/scada/DoAm.xlsx') if use_humidity else None
df_temperature = read_temperature('data/NhietDoQuaKhu.xlsx') if use_temp else None

# Combine load, temperature and humidity into the single training frame.
df = merge_dataframes(df_scada, df_temperature, df_humidity)

In [4]:
# Run the project's calendar-feature step on the merged frame, forwarding the
# lunar/holiday switches from the configuration cell.
# NOTE(review): `df` is rebound to the result, so re-running this cell feeds the
# already-processed frame back in — confirm the helper is safe to apply twice.
df = add_calendar_features(df, use_lunar=use_lunar, use_holiday=use_holiday)

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=173664.0), HTML(value='')))




In [5]:
# Final preprocessing for training. Besides the processed frame, the helper
# returns the feature-group mapping, the categorical / numeric column lists and
# the target column name, all of which are consumed by later cells.
(df, features_dict, categorical_cols, numeric_cols, target_col) = prepare_data(
    df,
    use_temp=use_temp,
    use_humidity=use_humidity,
    use_lunar=use_lunar,
    use_holiday=use_holiday,
)

In [6]:
# Map every model input column (categorical first, then numeric) to its dtype
# in the prepared training frame.
dtype_dict = df.dtypes.loc[categorical_cols + numeric_cols].to_dict()

In [7]:
# Full set of values each categorical feature may take, keyed by column name.
# Passed to both train_model and inference.
# NOTE(review): presumably used as the categorical vocabulary — confirm in
# scada_forecast.model.
categorical_unique_values_dict = {
    'Month': list(range(1, 12 + 1)),
    # Day-of-year runs 1..366: leap years (2020 is part of this data set) have
    # a 366th day, which the previous upper bound of 365 left out.
    'DayOfYear': list(range(1, 366 + 1)),
    'DayOfWeek': list(range(7)),
    'Hour': list(range(24)),
    'LunarMonth': list(range(1, 12 + 1)),
    'LunarDayOfMonth': list(range(1, 30 + 1)),
    'HolidayCat': list(range(12))
}

In [8]:
# Validation and test splits each cover 100 days' worth of samples.
n_val = n_test = 100 * 24 * hour_steps

# Model input window width, set equal to one hour of samples.
input_width = hour_steps

# Feature columns handed to the model: calendar, lag, temperature and humidity
# groups, concatenated in that order.
chosen_features = (
    features_dict['calendar_features']
    + features_dict['lag_features']
    + features_dict['temperature_features']
    + features_dict['humidity_features']
)

# Checkpoint location shared by the training and inference calls.
ckpt_path = 'tmp/ckpt'

In [9]:
# Train on the prepared frame using the split sizes, feature selection and
# checkpoint path configured above. The return value is left as the cell's
# last expression so the notebook displays it.
train_model(
    df,
    input_width,
    n_val,
    n_test,
    hour_steps,
    dtype_dict,
    categorical_unique_values_dict,
    chosen_features,
    categorical_cols,
    numeric_cols,
    target_col,
    ckpt_path,
)

Epoch 1/10


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Accuracy: 65.70%, MAPE: 1.96%
Test Accuracy: 79.98%, MAPE: 1.31%


(0.01961184134283156,)

## Inference

In [10]:
# Forecast horizon forwarded to get_lag_features when building the inference frame.
# NOTE(review): the unit appears to be 5-minute steps (the recorded forecast
# has 4 rows spaced 5 minutes apart) — confirm.
forecast_horizon = 4

In [11]:
# Re-read the SCADA workbook and build lag features for inference, this time
# with clip_df=True and the input width / forecast horizon supplied.
df_scada2 = get_lag_features(
    read_scada('data/scada/Dữ liệu SCADA Phụ tải 26.08.2020.xlsx'),
    hour_steps,
    clip_df=True,
    input_width=hour_steps,
    forecast_horizon=forecast_horizon,
)

In [12]:
# Merge the inference SCADA frame with the weather frames loaded earlier, then
# run the calendar-feature step on the result.
df2 = add_calendar_features(
    merge_dataframes(df_scada2, df_temperature, df_humidity),
    use_lunar=use_lunar,
    use_holiday=use_holiday,
)

# Same preprocessing entry point as for training, called with train_flag=False.
# NOTE(review): this rebinds features_dict, categorical_cols, numeric_cols and
# target_col from the training call — confirm the inference-mode values match
# the ones dtype_dict and chosen_features were built from.
(df2, features_dict, categorical_cols, numeric_cols, target_col) = prepare_data(
    df2,
    train_flag=False,
    use_temp=use_temp,
    use_humidity=use_humidity,
    use_lunar=use_lunar,
    use_holiday=use_holiday,
)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [13]:
# Produce the forecast from the inference frame, passing the checkpoint path
# and the same dtype / vocabulary / feature configuration given to training.
forecast_df = inference(
    df2,
    input_width,
    ckpt_path,
    dtype_dict,
    categorical_unique_values_dict,
    chosen_features,
    categorical_cols,
    numeric_cols,
)

# Display the resulting forecast table.
forecast_df

Unnamed: 0_level_0,Forecast
Date,Unnamed: 1_level_1
2020-08-26 00:40:00,29139.447266
2020-08-26 00:45:00,29038.46875
2020-08-26 00:50:00,29044.976562
2020-08-26 00:55:00,28926.195312


In [14]:
# Show the last index entry of the training frame.
# NOTE(review): presumably displayed to compare against the first forecast timestamp — confirm.
df.index[-1]

Timestamp('2020-08-25 23:55:00', freq='5T')