In [5]:
### Step 1: Read inflow data

# Read data
import pandas as pd
pd. set_option('display.max_rows', None) 
inflow_data = "data/cbf_data.csv"
inflow_data = pd.read_csv(inflow_data, parse_dates=True, index_col=0)

# Preprocessing
inflow_data = inflow_data.fillna(method='ffill')
inflow_data = inflow_data.fillna(method='bfill')
inflow_data.info()

# Add some noise
import numpy as np
noise = np.random.normal(0, 0.01, inflow_data.shape) 
inflow_data = inflow_data + noise

# Normalize to 0-1 range
inflow_data=(inflow_data-inflow_data.min())/(inflow_data.max()-inflow_data.min())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 61368 entries, 2014-12-31 23:00:00+00:00 to 2021-12-31 22:00:00+00:00
Columns: 222 entries, BE>NL to HR>SI
dtypes: float64(222)
memory usage: 104.4 MB


In [None]:
### Step 1a: Make data stationary
from statsmodels.tsa.stattools import adfuller

def adfuller_test(series, signif=0.05):
    """Perform ADFuller to test for Stationarity of given series and print report"""
    r = adfuller(series, autolag='AIC')
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    if p_value <= signif:
        return True
    else:
        return False

# Difference until data is stationary
stationary = False
while not stationary:
    stationary = True
    for name, column in inflow_data.iteritems():
        if not adfuller_test(column, 0.05):
            print("data not stationary, differencing...")
            stationary = False
            inflow_data = inflow_data.diff().dropna()
            break


In [10]:
### Step 1b: Split into training and testing data
cutoff = pd.Timestamp("2021-01-01 00:00:00+00:00")
train_df = inflow_data.loc[:"2021-01-01 00:00:00+00:00", :]
test_df = inflow_data.loc["2021-01-01 00:00:00+00:00":, :]
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52610 entries, 2014-12-31 23:00:00+00:00 to 2021-01-01 00:00:00+00:00
Columns: 222 entries, BE>NL to HR>SI
dtypes: float64(222)
memory usage: 89.5 MB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8759 entries, 2021-01-01 00:00:00+00:00 to 2021-12-31 22:00:00+00:00
Columns: 222 entries, BE>NL to HR>SI
dtypes: float64(222)
memory usage: 14.9 MB


In [11]:
# Step 2: Create Vector Autoregressive Model and determine best lag order
from statsmodels.tsa.vector_ar.var_model import VAR
model = VAR(train_df, freq='H')
x = model.select_order(maxlags=8)
x.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-1198.,-1198.,0.000*,-1198.
1.0,-1574.,-1566.,0.000,-1572.
2.0,-1588.,-1571.*,0.000,-1583.*
3.0,-1590.,-1565.,0.000,-1582.
4.0,-1590.,-1557.,0.000,-1580.
5.0,-1591.,-1549.,0.000,-1578.
6.0,-1591.,-1541.,0.000,-1575.
7.0,-1591.,-1533.,0.000,-1573.
8.0,-1592.*,-1525.,0.000,-1571.


In [15]:
# Step 3: Train model
lag_order = 2
trained_model = model.fit(lag_order) # Put desired lag order here

In [16]:
# Step 4: Evaluate model
print("Params shape: "+ str(trained_model.params.shape))
print("Training data data shape: " + str(train_df.shape))
print("Final prediction error: ", trained_model.fpe)

Params shape: (445, 222)
Training data data shape: (52610, 222)
Final prediction error:  0.0


In [51]:
# TODO Step 4b: Do Forecast with trained model
from tqdm import tqdm

forecast_list = []
index_list = []
for index, _ in tqdm(test_df.iterrows(), total=test_df.shape[0]):
	input_df = inflow_data.loc[:index].iloc[:-1].iloc[-lag_order:]
	input_array = input_df.values
	fc = trained_model.forecast(input_array, steps=1)[0]
	forecast_list.append(fc)
	index_list.append(index)

prediction_df = pd.DataFrame(data=forecast_list, index=index_list, columns=test_df.columns)
prediction_df.info()
test_df.info()

100%|██████████| 8759/8759 [00:02<00:00, 3024.43it/s]


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8759 entries, 2021-01-01 00:00:00+00:00 to 2021-12-31 22:00:00+00:00
Columns: 222 entries, BE>NL to HR>SI
dtypes: float64(222)
memory usage: 14.9 MB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8759 entries, 2021-01-01 00:00:00+00:00 to 2021-12-31 22:00:00+00:00
Columns: 222 entries, BE>NL to HR>SI
dtypes: float64(222)
memory usage: 14.9 MB


In [None]:
### Step 4c: Plot predictions and compute errors