In [None]:
### Step 1: Read flow data
import numpy as np
def aggregate_flows(flow_data):
	"""Aggregate pairwise flows into per-country inflow and outflow totals.

	Every column of ``flow_data`` must be named ``"<from>><to>"``, i.e. contain
	exactly one ``>`` separator. For each row, the value in column ``A>B`` is
	added to the outflow total of ``A`` and to the inflow total of ``B``.

	Parameters
	----------
	flow_data : pandas.DataFrame
		One row per time step, one column per directed pair. Each index label
		must be parseable by ``pd.Timestamp(str(label))``.

	Returns
	-------
	(inflow_df, outflow_df) : tuple of pandas.DataFrame
		Two frames with one row per input row (indexed by the parsed
		timestamps) and one column per destination / source name, in order of
		first appearance among the input columns.

	Raises
	------
	ValueError
		If a column name does not contain exactly one ``>``.
	"""
	# Parse the column names once instead of once per row.
	routes = []
	for name in flow_data.columns:
		parts = str(name).split('>')
		if len(parts) != 2:
			raise ValueError(f"column name {name!r} is not of the form '<from>><to>'")
		routes.append((parts[0], parts[1]))

	inflow_list = []
	outflow_list = []
	timestamp_list = []
	# Use the row yielded by iterrows() directly. Looking it up again with
	# .loc[timestamp] returns a whole DataFrame when a timestamp occurs more
	# than once and fails for indexes that are not datetime-typed.
	for index, row in flow_data.iterrows():
		inflows = {}
		outflows = {}
		for (country_from, country_to), value in zip(routes, row):
			inflows[country_to] = inflows.get(country_to, 0) + value
			outflows[country_from] = outflows.get(country_from, 0) + value
		inflow_list.append(inflows)
		outflow_list.append(outflows)
		timestamp_list.append(pd.Timestamp(str(index)))

	inflow_df = pd.DataFrame(data=inflow_list, index=timestamp_list)
	outflow_df = pd.DataFrame(data=outflow_list, index=timestamp_list)

	return inflow_df, outflow_df

# Read data
import pandas as pd
# Lift the row limit for DataFrame display, then load the preprocessed flows,
# using the first CSV column as the index and asking pandas to parse it as dates.
pd.set_option('display.max_rows', None)
flow_data = pd.read_csv("data/preprocessed_data.csv", index_col=0, parse_dates=True)

# Add a tiny amount of uniform noise (0 to 1e-4) to every value.
# TODO: is this necessary, or is there another way? Something fails without it, reason unknown.
# The generator is seeded so that Restart & Run All reproduces the same noisy data.
NOISE_SEED = 42
rng = np.random.default_rng(NOISE_SEED)
noise = rng.uniform(0.0, 0.0001, flow_data.shape)
flow_data = flow_data + noise

# Build the per-country inflow and outflow tables and print a summary of each.
# TODO: are Russia, Ukraine, Belarus, Turkey, Albania and Macedonia inflow-only?
inflow_df, outflow_df = aggregate_flows(flow_data)
for _aggregated in (inflow_df, outflow_df):
	_aggregated.info()

# Min-max scale each table to the 0-1 range, using one global minimum and
# maximum per table (taken over all of its columns).
# NOTE(review): the extremes are computed on the full data set, i.e. before the
# train/test split in Step 1b - confirm that this is intended.
inflow_min, inflow_max = inflow_df.min().min(), inflow_df.max().max()
inflow_df = (inflow_df - inflow_min) / (inflow_max - inflow_min)

outflow_min, outflow_max = outflow_df.min().min(), outflow_df.max().max()
outflow_df = (outflow_df - outflow_min) / (outflow_max - outflow_min)

In [None]:
### Step 1b: Split into training and testing data
# Chronological split: rows strictly before the cutoff are training data, rows
# at or after the cutoff are test data.
# Boolean masks are used instead of the label slices .loc[:cutoff] / .loc[cutoff:]
# because label slices include both endpoints, which would put a row stamped
# exactly at the cutoff into BOTH the training and the test set.
cutoff = pd.Timestamp("2021-01-01 00:00:00+00:00")
inflow_train = inflow_df.loc[inflow_df.index < cutoff, :]
inflow_test = inflow_df.loc[inflow_df.index >= cutoff, :]
outflow_train = outflow_df.loc[outflow_df.index < cutoff, :]
outflow_test = outflow_df.loc[outflow_df.index >= cutoff, :]

In [None]:
# Step 2: Create Vector Autoregressive Model and determine best lag order
from statsmodels.tsa.vector_ar.var_model import VAR
# Build one VAR model per flow direction on the training data (frequency 'H')
# and evaluate candidate lag orders up to MAX_LAGS for each.
MAX_LAGS = 8

inflow_model = VAR(inflow_train, freq='H')
inflow_res = inflow_model.select_order(maxlags=MAX_LAGS)

outflow_model = VAR(outflow_train, freq='H')
outflow_res = outflow_model.select_order(maxlags=MAX_LAGS)

# Last expression of the cell: displays the lag-order selection table for inflows.
inflow_res.summary()

In [None]:
# Step 3: Train model
# Fit both VAR models with the same hard-coded lag order.
lag_order = 3
trained_inflow = inflow_model.fit(maxlags=lag_order)
trained_outflow = outflow_model.fit(maxlags=lag_order)

In [None]:
# Step 4: Evaluate model
# Print the final prediction error (FPE) of each fitted model.
for _label, _fitted in (("Inflow FPE: ", trained_inflow), ("Outflow FPE: ", trained_outflow)):
	print(_label, _fitted.fpe)

In [None]:
# TODO Step 4b: Do Forecast with trained model
from tqdm import tqdm

def forecast_one_step_ahead(fitted_model, full_df, test_df, lag_order):
	"""Produce a one-step-ahead forecast for every row of ``test_df``.

	For each index label in ``test_df``, the ``lag_order`` rows of ``full_df``
	that immediately precede that label are passed to
	``fitted_model.forecast(..., steps=1)``.

	Parameters
	----------
	fitted_model : object with a ``forecast(array, steps)`` method
		The fitted VAR results used for prediction.
	full_df : pandas.DataFrame
		Complete series (history and test period) from which the lagged
		input rows are taken.
	test_df : pandas.DataFrame
		Rows to predict; its index and columns label the result.
	lag_order : int
		Number of preceding rows fed to each forecast.

	Returns
	-------
	pandas.DataFrame
		One forecast row per row of ``test_df``, with the same index labels
		and columns.
	"""
	forecasts = []
	index_list = []
	for index in tqdm(test_df.index, total=test_df.shape[0]):
		# Everything up to and including `index`, minus the last row (the row
		# being predicted), truncated to the `lag_order` most recent rows.
		history = full_df.loc[:index].iloc[:-1].iloc[-lag_order:]
		forecasts.append(fitted_model.forecast(history.values, steps=1)[0])
		index_list.append(index)
	return pd.DataFrame(data=forecasts, index=index_list, columns=test_df.columns)


# Inflow model predictions
inflow_preds = forecast_one_step_ahead(trained_inflow, inflow_df, inflow_test, lag_order)
inflow_preds.info()

# Outflow model predictions
outflow_preds = forecast_one_step_ahead(trained_outflow, outflow_df, outflow_test, lag_order)
outflow_preds.info()

In [None]:
### Step 4c: Plot predictions and compute errors
import matplotlib.pyplot as plt
from tqdm import tqdm

def plot_forecast_grid(actual_df, predicted_df, ncols=4):
	"""Plot actual vs. predicted values, one subplot per column of ``actual_df``.

	Parameters
	----------
	actual_df : pandas.DataFrame
		Observed values; one subplot is drawn per column.
	predicted_df : pandas.DataFrame
		Predicted values; must contain the same column labels.
	ncols : int, default 4
		Number of subplots per row.

	Returns
	-------
	(fig, axes)
		The matplotlib figure and the 2-D array of axes.
	"""
	columns = list(actual_df.columns)
	# Ceiling division so that a trailing, partially filled row still gets axes.
	# Floor division would silently drop the last len(columns) % ncols series
	# and yield zero rows for fewer than `ncols` columns.
	nrows = max(1, -(-len(columns) // ncols))
	# squeeze=False keeps `axes` two-dimensional even when there is only one row.
	fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=150, figsize=(20, 100), squeeze=False)
	flat_axes = axes.flatten()
	for col, ax in tqdm(zip(columns, flat_axes), total=len(columns)):
		actual_df[col].plot(ax=ax)
		predicted_df[col].plot(ax=ax)
		ax.legend(["true", "prediction"])
		ax.set_title(str(col) + ": Forecast vs Actuals")
	# Hide the unused axes at the end of the grid.
	for ax in flat_axes[len(columns):]:
		ax.set_visible(False)
	return fig, axes


# Inflows
fig_in, axes_in = plot_forecast_grid(inflow_test, inflow_preds)

# Outflows
fig_out, axes_out = plot_forecast_grid(outflow_test, outflow_preds)