## Vector Autoregression (VAR)

In [86]:
# Import all required libraries
import pickle
import numpy as np

from statsmodels.tsa.vector_ar.var_model import VAR
from sklearn.metrics import mean_squared_error

In [87]:
# Import feature and target columns
df = pickle.load(open("./common/df.p", "rb"))
n_obs_var = pickle.load(open("./common/n_obs.p", "rb"))
n_splits = pickle.load(open("./common/n_splits.p", "rb"))

### Perform Cross Validation

In [88]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def smape(y_true, y_pred):
    return np.mean(2.0 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))) * 100

In [89]:
window_size = len(df) // n_splits

# Initialize a dictionary to store the MSEs, MAPEs, SMAPEs for each variable
mse_dict = {var: [] for var in df.columns}
mape_dict = {var: [] for var in df.columns}
smape_dict = {var: [] for var in df.columns}

for i in range(n_splits):
    if i < n_splits - 1:
        train, test = df[i*window_size:(i+1)*window_size], df[(i+1)*window_size:(i+2)*window_size]
    else:  # If it's the last split, use the rest of the data for the test set
        train, test = df[i*window_size:(i+1)*window_size], df[(i+1)*window_size:]
    
    # Fit the VAR model
    model = VAR(train)
    model_fit = model.fit(maxlags=5, ic='aic')  # Choose order of VAR with Akaike's Information Criterion
    
    # Make predictions on the test set
    n_obs = len(test)  # Dynamically adjust the number of observations to forecast
    predictions = model_fit.forecast(model_fit.model.endog, steps=n_obs)
    
    # Calculate mean squared error for each variable and store it in the dictionary
    for j, var in enumerate(train.columns):
        mse = mean_squared_error(test.iloc[:, j], predictions[:, j])
        mape = mean_absolute_percentage_error(test.iloc[:, j].values, predictions[:, j])
        smape_value = smape(test.iloc[:, j].values, predictions[:, j])
        mse_dict[var].append(mse)
        mape_dict[var].append(mape)
        smape_dict[var].append(smape_value)

# Compute and print the average MSE for each variable
for var in mse_dict:
    avg_mse = sum(mse_dict[var]) / len(mse_dict[var])
    print(f'Average MSE for {var}: {avg_mse}')

print("-----------------------------------------")

# Compute and print the average MAPE for each variable
for var in mape_dict:
    avg_mape = sum(mape_dict[var]) / len(mape_dict[var])
    print(f'Average MAPE for {var}: {avg_mape}')

print("-----------------------------------------")

# Compute and print the average SMAPE for each variable
for var in smape_dict:
    avg_smape = sum(smape_dict[var]) / len(smape_dict[var])
    print(f'Average SMAPE for {var}: {avg_smape}')

Average MSE for cpu_usage: 0.011390920257782638
Average MSE for memory_usage: 1.44648013910263e-05
Average MSE for bandwidth_inbound: 39795744.40062205
Average MSE for bandwidth_outbound: 25708117.381725766
Average MSE for tps: 0.6792662484422436
Average MSE for tps_error: 0.18945668547028954
Average MSE for response_time: 514198.4374786808
-----------------------------------------
Average MAPE for cpu_usage: 126.821342258525
Average MAPE for memory_usage: 0.6659487996310652
Average MAPE for bandwidth_inbound: 1076.484432610586
Average MAPE for bandwidth_outbound: 1032.9534354092707
Average MAPE for tps: 55.348696479226135
Average MAPE for tps_error: inf
Average MAPE for response_time: 49.42706820523503
-----------------------------------------
Average SMAPE for cpu_usage: 49.324512652706325
Average SMAPE for memory_usage: 0.6671295296187708
Average SMAPE for bandwidth_inbound: 79.81226331477602
Average SMAPE for bandwidth_outbound: 73.20974637631039
Average SMAPE for tps: 38.336445235

  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


### Train the model

In [90]:
# Split your data
train, test = df[:-n_obs_var], df[-n_obs_var:]

len df:  2
len train:  2112
len test:  10


In [91]:
# Now fit a VAR model on the training dataset
model = VAR(train)
results = model.fit(maxlags=15, ic='aic')

In [92]:
# Print a summary of the results
print(results.summary())

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 03, Jun, 2023
Time:                     11:41:28
--------------------------------------------------------------------
No. of Equations:         7.00000    BIC:                    7.06579
Nobs:                     2098.00    HQIC:                   5.88337
Log likelihood:          -25600.3    FPE:                    181.347
AIC:                      5.19992    Det(Omega_mle):         131.323
--------------------------------------------------------------------
Results for equation cpu_usage
                            coefficient       std. error           t-stat            prob
-----------------------------------------------------------------------------------------
const                          0.419921         0.213094            1.971           0.049
L1.cpu_usage                   0.266378         0.029673            8.977           0.000
L1.memory_usage     

### Visualize the model

### Export the model