In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

# fix for 'package not found' when installing in Anaconda environment
if 'google.colab' not in str(get_ipython()):
    import pip
    pip.main(['install', 'xgboost'])

if 'google.colab' in str(get_ipython()):
  !rm util.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/util.py

from xgboost import XGBRegressor
from util import select_relevant_rows, select_attributes, read_movement_data, read_multiple_csv,download_updated_mobility_data, download_updated_mobility_data, time_series_cross_validation, train_and_predict, select_time_slot

In [None]:
# Locations of the regional COVID-19 CSV (pcm-dpc/COVID-19 repository).
# The remote copy is the one that gets loaded; the local path is not read in
# this cell (presumably a local checkout of the same file - verify before use).
remote_region_path = r'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
local_region_path = r'../COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv'

# Download and parse the per-region table.
regions_frame = pd.read_csv(filepath_or_buffer=remote_region_path)

In [None]:
# Restrict the national table to a single region.
region_focus = 'Emilia-Romagna'
attribute_focus = 'denominazione_regione'

region_focus_data = select_relevant_rows(regions_frame, attribute_focus, region_focus)

# Columns carried forward into the model, plus the timestamp column 'data'.
columns_of_interest = [
    'data',
    'ricoverati_con_sintomi',
    'terapia_intensiva',
    'totale_ospedalizzati',
    'variazione_totale_positivi',
    'nuovi_positivi',
    'deceduti',
    'tamponi',
    'ingressi_terapia_intensiva',
]

# Wrap the selection in a fresh DataFrame before assigning columns on it.
frame_interesting_columns = pd.DataFrame(
    select_attributes(region_focus_data, columns_of_interest)
)

# Parse the timestamps, then reduce them to 'YYYY-MM-DD' strings.
frame_interesting_columns['data'] = (
    pd.to_datetime(frame_interesting_columns['data']).dt.strftime(r'%Y-%m-%d')
)

# Expose the timestamp column under the name 'date'.
frame_interesting_columns = frame_interesting_columns.rename(columns={'data': 'date'})

In [None]:
# Index the frame by date and replace missing values with 0.
if 'date' in frame_interesting_columns.columns:
    # Guard keeps the cell re-runnable: after the first run 'date' is already
    # the index and no longer exists as a column.
    frame_interesting_columns = frame_interesting_columns.set_index('date')

# set_index() returns a new frame instead of modifying the caller, so the
# converted index is assigned directly to make sure it actually takes effect.
frame_interesting_columns.index = pd.DatetimeIndex(frame_interesting_columns.index)

frame_interesting_columns = frame_interesting_columns.fillna(0)

In [None]:
from statsmodels.tsa.vector_ar.var_model import VAR

# Cross-validation settings: number of folds and rows forecast per fold.
n_splits=10
test_size=5

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

df = frame_interesting_columns

# For every split: fit a VAR on the training rows, forecast as many steps as
# the test fold has rows, and report the per-column mean absolute error.
for train_index, test_index in tscv.split(df):
    train, test = df.iloc[train_index], df.iloc[test_index]

    model = VAR(endog=train, freq='D')
    fit = model.fit()
    lag_order = fit.k_ar

    # The forecast is seeded with the last `lag_order` training observations.
    prediction = fit.forecast(train.values[-lag_order:], steps=len(test))

    # Give the forecast the test fold's index so both frames line up row by row.
    pred_df = pd.DataFrame(prediction, index=test.index, columns=df.columns)

    print(f'train interval: {train.index[0]} - {train.index[-1]}')
    print(f'test interval: {test.index[0]} - {test.index[-1]}')
    for column in test.columns:
        # sklearn's signature is (y_true, y_pred): observed values go first.
        print('mae value for', column, 'is : ', mean_absolute_error(test[column], pred_df[column]))