In [76]:
import glob
import numpy as np
import pandas as pd
from math import sqrt
from random import random
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.api import VAR

In [41]:
valves = ["20-LV-1031_Z_X_Value", "20-LV-1031_Z_Y_Value", "20-LV-1034_Z_X_Value",
              "20-LV-1034_Z_Y_Value", "20-PV-1037_Z_X_Value", "20-PV-1037_Z_Y_Value"]
dfs = {file.split("_")[-4]: pd.read_pickle(file) for file in \
           glob.glob("S:\SRH\BDBA_Sem_2\Case_study_1\data\*.pkl")}
    
dfs_sorted = dict(sorted(dfs.items()))
df_single = pd.concat(dfs_sorted, axis=0)
valve_df = df_single.filter(valves)
# valve_df = valve_df.droplevel(0, axis=0)

In [38]:
def preprocessed_df(df, val_pct):
    """ Creates train, validation and test set after applying normalisation of all feature cols
    Args:
    df: dataframe object
    val_pct: percentage size of validation plus test size (float)
    """
    
    val_data_size = round(df.shape[0] * 0.3)
    test_data_size = round(val_data_size * 0.1)
    
    train_data = df[:-val_data_size]
    val_data = df[-val_data_size:-test_data_size]
    test_data = df[-test_data_size:]
    
    return train_data, val_data, test_data

In [39]:
df_train, df_val, df_test = preprocessed_df(valve_df, 0.3)

In [68]:
model = VAR(endog=df_train)
model.select_order(15)
results = model.fit(maxlags=15, ic='aic')
lag_order = results.k_ar
# make prediction on validation
prediction = results.forecast(df_val.values[-lag_order:], steps=len(df_val))
# results.plot_forecast(10)



In [69]:
#make final predictions on test data
yhat = results.forecast(df_test.values[-lag_order:], steps=len(df_test))
print(yhat)

[[ 2.0358389  48.85068555  0.80322803 42.53197714 21.7976603  31.83912199]
 [ 2.03839547 49.38786287  0.80575659 42.34297744 21.82579703 31.69456884]
 [ 2.03833063 49.78368215  0.80820124 42.2168713  21.85399521 31.67140004]
 ...
 [ 1.84470835 61.0733276   0.78654044 61.68847803 17.96393622 55.33006348]
 [ 1.8446856  61.07645423  0.78654137 61.69335724 17.96286838 55.33680178]
 [ 1.84466298 61.07956866  0.78654229 61.69821744 17.96180473 55.34351376]]


In [73]:
#converting predictions to dataframe
pred = pd.DataFrame(index=range(0,len(yhat)),columns=[valves])
for j in range(0,6):
    for i in range(0, len(yhat)):
        pred.iloc[i][j] = yhat[i][j]

In [78]:
#check rmse
for i in valves:
    print('rmse value for', i, 'is : ', sqrt(mean_squared_error(pred[i], df_test[i])))

rmse value for 20-LV-1031_Z_X_Value is :  0.24805180567815505
rmse value for 20-LV-1031_Z_Y_Value is :  8.18105484739458
rmse value for 20-LV-1034_Z_X_Value is :  0.08317445232257809
rmse value for 20-LV-1034_Z_Y_Value is :  11.675940411958102
rmse value for 20-PV-1037_Z_X_Value is :  2.7260269775195645
rmse value for 20-PV-1037_Z_Y_Value is :  14.884324699459327
