In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import tensorflow as tf
import IPython
import IPython.display
from windows import WindowGenerator
from models import MultiStepLastBaseline
from statsmodels.tsa.vector_ar.var_model import VAR
import sklearn
from sklearn.model_selection import TimeSeriesSplit
from windows import WindowGenerator
from models import MultiStepLastBaseline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.vector_ar.var_model import VAR


# fix for 'package not found' when installing in Anaconda environment
if 'google.colab' not in str(get_ipython()):
    import pip
    pip.main(['install', 'xgboost'])

if 'google.colab' in str(get_ipython()):
  !rm util.py
  !rm window.py
  !rm models.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/util.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/windows.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/models.py


from xgboost import XGBRegressor
from util import select_relevant_rows, select_attributes, read_movement_data,download_updated_mobility_data, download_updated_mobility_data, series_to_supervised


# Data Acquisition and Data Preparation

In [None]:
# Two locations for the per-region COVID-19 CSV: the upstream raw URL and a
# path inside a local checkout. Only the remote one is read in this cell.
remote_region_path = r'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
local_region_path = r'../COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv'

regions_frame = pd.read_csv(filepath_or_buffer=remote_region_path)

In [None]:
# Column to match on and the value to look for in it.
attribute_focus = 'denominazione_regione'
region_focus = 'Emilia-Romagna'

# presumably keeps only the rows whose `attribute_focus` equals `region_focus`
# -- confirm against util.select_relevant_rows
region_focus_data = select_relevant_rows(regions_frame, attribute_focus, region_focus)

In [None]:
# Columns handed to the project helper `select_attributes`: the 'data' column
# plus the eight numeric series used further down.
columns_of_interest = [
    'data',
    'ricoverati_con_sintomi',
    'terapia_intensiva',
    'totale_ospedalizzati',
    'variazione_totale_positivi',
    'nuovi_positivi',
    'deceduti',
    'tamponi',
    'ingressi_terapia_intensiva',
]

frame_interesting_columns = select_attributes(region_focus_data, columns_of_interest)

In [None]:
# Preview the last rows of the selected columns.
frame_interesting_columns.tail()

In [None]:
# Re-wrap the selection as a DataFrame, rewrite the 'data' column as plain
# 'YYYY-MM-DD' strings, then replace missing values with 0.
frame_interesting_columns = pd.DataFrame(frame_interesting_columns)
parsed_dates = pd.to_datetime(frame_interesting_columns['data'])
frame_interesting_columns['data'] = parsed_dates.dt.strftime(r'%Y-%m-%d')
frame_interesting_columns = frame_interesting_columns.fillna(value=0)


In [None]:
# Remote sources for the mobility reports.
mobility_data_url = r'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
mobility_data_zip_url = r'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'

# Local targets, one directory above the notebook.
file_path = r'../Global_Mobility_Report.csv'
zip_path = r'../Region_Mobility_Report_CSVs.zip'
region_mobility_path = r'../Region_Mobility_Report_CSVs'

# Project helper; argument order is kept exactly as the original call.
download_updated_mobility_data(
    mobility_data_url, file_path, region_mobility_path, mobility_data_zip_url, zip_path
)

# Read the 'IT_Region_Mobility_Report' data for one region from the local directory.
mobility_df = read_movement_data(
    region_mobility_path, 'IT_Region_Mobility_Report', region='Emilia-Romagna'
)

In [None]:
# Keep the date plus the six percent-change-from-baseline series and
# replace missing readings with 0.
mobility_columns = [
    'date',
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
]
mobility_df = mobility_df[mobility_columns].fillna(0)

In [None]:
# Index both frames by 'date' and join them on that shared index name.
# Both source frames are deliberately rebound under their original names:
# later cells rely on them being date-indexed.
frame_interesting_columns = (
    frame_interesting_columns
    .rename(columns={'data': 'date'})
    .set_index('date')
)
mobility_df = mobility_df.set_index('date')

merged = pd.merge(
    frame_interesting_columns,
    mobility_df,
    on='date'
    ).fillna(0)

# `set_index` returns a new frame. The original call discarded that result,
# so `merged` kept its string index; bind it so the DatetimeIndex sticks.
merged = merged.set_index(pd.DatetimeIndex(merged.index))
merged

In [None]:
# One panel per column of `frame_interesting_columns`, on a 4x2 grid.
fig, axes = pyplot.subplots(nrows=4, ncols=2, figsize=(10, 8))

# zip() stops at the shorter sequence: a frame with fewer columns than panels
# leaves blank axes instead of raising IndexError. Values are plotted against
# their row position, as before; the argument-less `ax.plot()` call, which
# drew nothing, has been dropped.
for ax, column in zip(axes.flatten(), frame_interesting_columns.columns):
    ax.plot(frame_interesting_columns[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
# One panel per column of `mobility_df`, on a 3x2 grid.
fig, axes = pyplot.subplots(nrows=3, ncols=2, figsize=(10, 8))

# zip() stops at the shorter sequence: a frame with fewer columns than panels
# leaves blank axes instead of raising IndexError. Values are plotted against
# their row position, as before; the argument-less `ax.plot()` call, which
# drew nothing, has been dropped.
for ax, column in zip(axes.flatten(), mobility_df.columns):
    ax.plot(mobility_df[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
# Ordered (unshuffled) split of `merged`: first 70% of rows for training,
# the next 20% for validation, the final 10% for testing.
df = merged
n = len(merged)
train_end = int(n*0.7)
val_end = int(n*0.9)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

# Number of columns in the merged frame, shown as the cell output.
num_features = df.shape[1]
num_features

# Univariate XGBoost

In [None]:
# Single series used for the univariate model.
column_univariate = merged['nuovi_positivi']

# Hand the series to the project helper as a one-column frame with window=7.
reframed = series_to_supervised(column_univariate.to_frame(), window=7)

In [None]:
# Display the frame returned by series_to_supervised for the univariate series.
reframed

In [None]:
def train_test_split(data,n_test):
    """Split a 2-D array into leading training rows and trailing test rows.

    Args:
        data: 2-D array (indexed as ``data[rows, cols]``).
        n_test: number of trailing rows to hold out for testing.

    Returns:
        ``(train, test)`` where ``test`` is the last ``n_test`` rows and
        ``train`` is everything before them.

    Raises:
        ValueError: if ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative')
    # Use an explicit split position rather than negative slicing:
    # `data[:-0]` is empty and `data[-0:]` is everything, so n_test == 0 used
    # to return an empty training set and put every row in the test set.
    # Clamping at 0 keeps the old result when n_test exceeds the row count.
    split = max(len(data) - n_test, 0)
    return data[:split, :], data[split:, :]

def walk_forward_validation(data,n_test):
    """Evaluate one-step forecasts over the last ``n_test`` rows of ``data``.

    For each held-out row a forecast is requested from ``xgboost_forecast``
    using all rows seen so far; the true row is then appended to the history
    before moving to the next step.

    Args:
        data: 2-D array whose last column is the target and whose other
            columns are the input features.
        n_test: number of trailing rows to hold out.

    Returns:
        ``(error, expected, predictions)``: the mean absolute error, the
        target column of the held-out rows, and the list of forecasts.
    """
    predictions = list()
    train, test = train_test_split(data,n_test)
    history = list(train)
    for i in range(len(test)):
        testX, testy = test[i, :-1], test[i, -1]
        prediction = xgboost_forecast(history, testX)
        predictions.append(prediction)
        # Reveal the true row only after it has been forecast.
        history.append(test[i])
        print('expected=%.1f, predicted=%.1f' % (testy, prediction))
    error = mean_absolute_error(test[:, -1], predictions)
    # Return the target column (index -1), the same one scored above.
    # The original returned column 1, which is an input feature.
    return error, test[:, -1], predictions

def xgboost_forecast(train, testX):
    """Fit an XGBoost regressor on ``train`` and predict one value for ``testX``.

    Args:
        train: sequence of rows (or 2-D array); in each row the last value is
            the target and the preceding values are the input features.
        testX: 1-D feature vector for the step to predict.

    Returns:
        The single predicted value for ``testX``.
    """
    train = np.ascontiguousarray(train)
    testX = np.ascontiguousarray(testX)
    trainX, trainy = train[:, :-1], train[:, -1]
    # A fresh model is fitted on every call.
    model = XGBRegressor(objective='reg:squarederror',n_estimators=1000)
    model.fit(trainX, trainy)
    # Present the single sample as a one-row 2-D array.
    prediction = model.predict(testX.reshape(1, -1))
    # The unreachable `return 0` that followed this line has been removed.
    return prediction[0]

In [None]:
# Walk-forward evaluation with the last 7 rows held out; unpacks the error,
# the values returned for the held-out rows, and the predictions.
mae, y, yhat = walk_forward_validation(reframed.values,7)

# VAR (Vector Auto Regression)

In [None]:
# Rolling-origin evaluation: 10 folds, each holding out 5 rows.
n_splits=10
test_size=5

tscv = TimeSeriesSplit(n_splits=n_splits,test_size=test_size)

# Only the epidemiological columns are modelled here (no mobility series).
df = frame_interesting_columns

for train_index, test_index in tscv.split(df):
    train, test = df.iloc[train_index], df.iloc[test_index]

    # Fit a VAR on the training fold; fit() is called with no arguments.
    model = VAR(endog=train,freq='D')
    fit = model.fit()
    lag_order = fit.k_ar

    # Seed the forecast with the last `lag_order` training rows and predict
    # as many steps as the fold has test rows.
    seed_rows = train.values[-lag_order:]
    prediction = fit.forecast(seed_rows,len(test_index))
    pred_df = pd.DataFrame(prediction,columns=df.columns)

    print('train interval: ' + str(train.index[0]) + ' - ' + str(train.index[-1]))
    print('test interval: ' + str(test.index[0]) + ' - ' + str(test.index[-1]))
    for column in test.columns:
        column_mae = mean_absolute_error(pred_df[column], test[column])
        print('mae value for', column, 'is : ', column_mae)


# LSTM (Long Short-Term Memory)

In [None]:
# `DataFrame.drop` returns a new frame. The original call discarded it, so
# 'tamponi' was never removed. Bind the result so the column is really
# excluded from the scaler input and from later uses of `merged`.
# errors='ignore' keeps the cell re-runnable once the column is gone.
merged = merged.drop(columns='tamponi', errors='ignore')

# Rescale every remaining column to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(merged)

In [None]:
# Frame the scaled features as a supervised problem. The original passed the
# raw `merged` values, which left `scaled` unused and trained the network on
# unscaled data. Wrapping the array in a DataFrame keeps the index and column
# labels and gives the helper the same input type as before.
scaled_frame = pd.DataFrame(scaled, index=merged.index, columns=merged.columns)
reframed = series_to_supervised(scaled_frame, 1, 1)
values = reframed.values

# The first `n_train` rows are used for training, the rest for testing.
n_train = 400
train = values[:n_train, :]
test = values[n_train:, :]

# The last column is the target; every other column is an input.
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# Reshape to the 3-D (samples, 1, features) layout passed to the LSTM layer.
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
# Display the supervised frame that feeds the LSTM arrays.
reframed

In [None]:
# One LSTM layer (50 units) followed by a single-output dense layer.
lstm_input_shape = (train_X.shape[1], train_X.shape[2])
model = Sequential()
model.add(LSTM(50, input_shape=lstm_input_shape))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')

# Train without shuffling; the held-out rows double as validation data.
history = model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=72,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)

# Plot the training and validation loss curves.
for history_key, legend_label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[history_key], label=legend_label)
pyplot.legend()
pyplot.show()