In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import tensorflow as tf
import IPython
import IPython.display
from statsmodels.tsa.vector_ar.var_model import VAR
import sklearn
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.vector_ar.var_model import VAR
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

# fix for 'package not found' when installing in Anaconda environment
if 'google.colab' not in str(get_ipython()):
    import pip
    pip.main(['install', 'xgboost'])

if 'google.colab' in str(get_ipython()):
  !rm util.py
  !rm window.py
  !rm models.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/util.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/windows.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/models.py

from xgboost import XGBRegressor
from util import select_relevant_rows, select_attributes, read_movement_data,download_updated_mobility_data, download_updated_mobility_data, series_to_supervised
from windows import WindowGenerator
from models import MultiStepLastBaseline

# if sklearn.__version__ != '0.24':
#   !pip3 install scikit-learn==0.24

# Data Acquisition and Data Preparation

In [None]:
# Path to a local checkout of the pcm-dpc/COVID-19 per-region CSV; defined here but not read in this cell.
local_region_path = r'../COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv'
# Raw GitHub URL of the per-region CSV.
remote_region_path = r'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'

# Load the regional data directly from the remote URL.
regions_frame = pd.read_csv(remote_region_path)

In [None]:
# Region under study and the column that is matched against it.
region_focus = 'Emilia-Romagna'
attribute_focus = 'denominazione_regione'

# Presumably keeps only the rows whose `attribute_focus` equals `region_focus`;
# the helper is defined in util.py — TODO confirm.
region_focus_data = select_relevant_rows(
    regions_frame,
    attribute_focus,
    region_focus
    )

In [None]:
# Restrict the regional frame to the timestamp plus the epidemiological
# columns used in the rest of the notebook (helper defined in util.py;
# presumably a plain column selection — TODO confirm).
frame_interesting_columns = select_attributes(region_focus_data, [
    'data',
    'ricoverati_con_sintomi',
    'terapia_intensiva',
    'totale_ospedalizzati',
    'variazione_totale_positivi',
    'nuovi_positivi',
    'deceduti',
    'tamponi',
    'ingressi_terapia_intensiva'
    ])

In [None]:
# Show the last rows of the selected columns as a quick sanity check.
frame_interesting_columns.tail()

In [None]:
# Re-wrap the selection in a DataFrame, reduce the 'data' timestamps to
# plain 'YYYY-MM-DD' strings and replace every missing value with 0.
frame_interesting_columns = pd.DataFrame(frame_interesting_columns)
frame_interesting_columns['data'] = (
    pd.to_datetime(frame_interesting_columns['data'])
    .dt.strftime(r'%Y-%m-%d')
)
frame_interesting_columns = frame_interesting_columns.fillna(0)


In [None]:
# Remote sources (global CSV and per-region ZIP) and the local paths handed
# to the download helper.
mobility_data_url = r'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
file_path = r'../Global_Mobility_Report.csv'
mobility_data_zip_url = r'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'
zip_path = r'../Region_Mobility_Report_CSVs.zip'
region_mobility_path = r'../Region_Mobility_Report_CSVs'

# Helper from util.py; presumably refreshes the local copies when they are
# missing or outdated — TODO confirm against its implementation.
download_updated_mobility_data(
    mobility_data_url,
    file_path,
    region_mobility_path,
    mobility_data_zip_url,
    zip_path
    )

# Read the Italian regional mobility report for the region under study
# (helper from util.py).
mobility_df = read_movement_data(
    region_mobility_path,
    'IT_Region_Mobility_Report',
    region='Emilia-Romagna'
    )

In [None]:
# Keep the date and the six mobility categories; missing readings become 0.
mobility_df = mobility_df.loc[
    :,
    [
        'date',
        'retail_and_recreation_percent_change_from_baseline',
        'grocery_and_pharmacy_percent_change_from_baseline',
        'parks_percent_change_from_baseline',
        'transit_stations_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
        'residential_percent_change_from_baseline',
    ],
].fillna(0)

In [None]:
# Index both frames by date and join them on that index.
# The set_index calls are guarded so the cell can be re-run: once 'date' is
# already the index it is no longer a column and set_index would raise KeyError.
frame_interesting_columns = frame_interesting_columns.rename(columns={'data': 'date'})
if 'date' in frame_interesting_columns.columns:
    frame_interesting_columns = frame_interesting_columns.set_index('date')
if 'date' in mobility_df.columns:
    mobility_df = mobility_df.set_index('date')

# Inner join on the shared 'date' index level; gaps are filled with 0.
merged = pd.merge(
    frame_interesting_columns,
    mobility_df,
    on='date'
    ).fillna(0)

# Assign the converted index: the original called set_index without keeping
# the result, so `merged` silently kept its string index.
merged.index = pd.DatetimeIndex(merged.index)
merged.tail()

In [None]:
# One panel per epidemiological column on a 4x2 grid.
# zip() stops at the shorter of (axes, columns), so a frame with fewer columns
# than panels leaves panels empty instead of raising IndexError.
fig, axes = pyplot.subplots(nrows=4,ncols=2,figsize=(10,8))
for ax, column in zip(axes.flatten(), frame_interesting_columns.columns):
    # Plot the raw values against their row position (not the date index).
    ax.plot(frame_interesting_columns[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
# One panel per mobility category on a 3x2 grid.
# zip() stops at the shorter of (axes, columns), so a frame with fewer columns
# than panels leaves panels empty instead of raising IndexError.
fig, axes = pyplot.subplots(nrows=3,ncols=2,figsize=(10,8))
for ax, column in zip(axes.flatten(), mobility_df.columns):
    # Plot the raw values against their row position (not the date index).
    ax.plot(mobility_df[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
# Chronological 70 % / 20 % / 10 % partition of the merged frame.
df = merged
n = len(merged)
train_df = df.iloc[:int(n*0.7)]
val_df = df.iloc[int(n*0.7):int(n*0.9)]
test_df = df.iloc[int(n*0.9):]

# Number of columns in the merged frame.
num_features = df.shape[1]

# Univariate XGBOOST

In [None]:
# Univariate target and the number of lagged readings used as features.
column_univariate = 'nuovi_positivi'
look_back = 7

# Single-column frame scaled to [0, 1]; `values` is kept as an alias of the
# scaled array.
df = frame_interesting_columns[column_univariate].to_frame()
scaler = MinMaxScaler(feature_range=(0,1))
values = scaled = scaler.fit_transform(df)

# Turn the series into a supervised table (helper from util.py).
reframed = series_to_supervised(pd.DataFrame(values),window=look_back)

In [None]:
# Display the supervised-learning table produced by series_to_supervised.
reframed

In [None]:
def split_for_testing(data,n_test):
    """Split a 2-D array row-wise into (all but the last ``n_test`` rows, last ``n_test`` rows)."""
    cut = -n_test
    leading_rows = data[:cut, :]
    trailing_rows = data[cut:, :]
    return leading_rows, trailing_rows

def walk_forward_validation(data,n_test):
    """Evaluate one-step forecasts on the last ``n_test`` rows, walk-forward style.

    Each row of ``data`` holds the input features followed by the target in
    the last column. For every test row a model is fitted on all rows seen so
    far (via ``xgboost_forecast``), the row is predicted, and the true row is
    then appended to the history.

    Returns:
        (error, actual, predictions): the mean absolute error, the true
        targets of the test rows, and the list of predictions.
    """
    train, test = split_for_testing(data,n_test)
    history = list(train)
    predictions = []
    for row in test:
        # Features are everything but the last column.
        predictions.append(xgboost_forecast(history,row[:-1]))
        # Reveal the true observation before forecasting the next step.
        history.append(row)
    # The target is the LAST column; the original returned column 1 (a lag
    # feature), which made the "Expected" series disagree with the MAE.
    actual = test[:, -1]
    error = mean_absolute_error(actual,predictions)
    return error, actual, predictions

def xgboost_forecast(train,test_X):
    """Fit an XGBRegressor on ``train`` and return its prediction for ``test_X``.

    ``train`` is a sequence of rows whose last column is the target;
    ``test_X`` is a single feature row.
    """
    history = np.ascontiguousarray(train)
    query = np.ascontiguousarray(test_X)
    # Last column is the target, everything before it the features.
    features = history[:, :-1]
    targets = history[:, -1]
    regressor = XGBRegressor(objective='reg:squarederror',n_estimators=1000)
    regressor.fit(features, targets)
    # Wrap the single row in a list to obtain a batch of one sample.
    return regressor.predict([query])[0]

In [None]:
# Run the walk-forward evaluation on the supervised table.
# NOTE(review): `look_back` (7) is passed as the number of test rows here —
# confirm that reusing the lag count as the test size is intentional.
mae, y, yhat = walk_forward_validation(reframed.values,look_back)

In [None]:
# Print the walk-forward MAE and plot the `y` / `yhat` series returned by
# walk_forward_validation. `reframed` was built from the scaled series, so
# the values are in the scaler's 0-1 range, not in case counts.
print('MAE: %.3f' % mae)
pyplot.plot(y,label='Expected')
pyplot.plot(yhat,label='Predicted')
pyplot.legend()
pyplot.show()

# VAR (Vector Auto Regression)

In [None]:
# n_splits=10
# test_size=5

# tscv = TimeSeriesSplit(n_splits=n_splits,test_size=test_size)

# df = frame_interesting_columns

# for train_index, test_index in tscv.split(df):
#     train = df.iloc[train_index]
#     test = df.iloc[test_index]

#     model = VAR(endog=train,freq='D')
#     fit = model.fit()
#     lag_order = fit.k_ar

#     prediction = fit.forecast(train.values[-lag_order:],len(test_index))

#     pred_df = pd.DataFrame(prediction,columns=df.columns)
 
#     print('train interval: ' + str(train.index[0]) + ' - ' + str(train.index[-1]))
#     print('test interval: ' + str(test.index[0]) + ' - ' + str(test.index[-1]))
#     for i in test.columns:
#         print('mae value for', i, 'is : ', mean_absolute_error(pred_df[i], test[i]))


# LSTM Univariate


In [None]:
# --- data -------------------------------------------------------------
# Single target column, scaled to [0, 1].
column_univariate = 'nuovi_positivi'
df = frame_interesting_columns[column_univariate].to_frame()
scaler = MinMaxScaler(feature_range=(0,1))
scaled = scaler.fit_transform(df)

look_back = 7

# Chronological 80/20 split of the scaled series and of the matching dates.
split_percent = 0.80
split = int(split_percent*len(scaled))

train, test = scaled[:split], scaled[split:]
date_train, date_test = df.index[:split], df.index[split:]
date_prediction = df.index[split+look_back:]

# Supervised tables (helper from util.py): the last column is the target,
# the preceding columns are the inputs.
train_reframed = series_to_supervised(pd.DataFrame(train),look_back,1).values
test_reframed = series_to_supervised(pd.DataFrame(test),look_back,1).values

train_y = train_reframed[:,-1]
test_y = test_reframed[:,-1]

# Append a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
train_X = np.expand_dims(train_reframed[:,:-1], axis=-1)
test_X = np.expand_dims(test_reframed[:,:-1], axis=-1)

# --- model ------------------------------------------------------------
model = Sequential([
    LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2])),
    Dense(1),
])
model.compile(loss='mae',optimizer='adam')

# The test windows double as validation data; order is kept (shuffle=False).
history = model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=72,
    validation_data=(test_X,test_y),
    verbose=0,
    shuffle=False,
)

# --- learning curves --------------------------------------------------
pyplot.plot(history.history['loss'],label='Training loss')
pyplot.plot(history.history['val_loss'],label='Validation loss')
pyplot.legend()
pyplot.show()

In [None]:
prediction = model.predict(test_X)

# Bring every series back to original units. The train/test portions are
# re-sliced from `scaled` (rather than overwriting the already transformed
# variables) so the cell gives the same result when it is run again.
train = scaler.inverse_transform(scaled[:split]).reshape(-1)
test = scaler.inverse_transform(scaled[split:]).reshape(-1)
prediction = scaler.inverse_transform(prediction).reshape(-1)

# The targets must be de-scaled too: comparing the 0-1 scaled `test_y` with
# de-scaled predictions (as the original did) yields a meaningless MAE.
test_y_descaled = scaler.inverse_transform(np.asarray(test_y).reshape(-1, 1)).reshape(-1)

print('mae:')
print(mean_absolute_error(test_y_descaled,prediction))

data_trace = go.Scatter(x=date_train,y=train,mode='lines',name='Data')
prediction_trace = go.Scatter(x=date_prediction,y=prediction,mode='lines',name='Prediction')
truth_trace = go.Scatter(x=date_test,y=test,mode='lines',name='Ground Truth')
layout = go.Layout(title="nuovi_positivi",xaxis={'title':"Date"},yaxis={'title':"nuovi_positivi"})

fig = go.Figure(data=[data_trace,prediction_trace,truth_trace],layout=layout)
fig.show()

# Univariate LSTM with TimeseriesGenerator

In [None]:
# --- data -------------------------------------------------------------
# Single target column, scaled to [0, 1]; `values` aliases the scaled array.
df = frame_interesting_columns['nuovi_positivi'].to_frame()

scaler = MinMaxScaler(feature_range=(0,1))
values = scaled = scaler.fit_transform(df.values)

look_back = 7

# Chronological 80/20 split of the values and of the matching dates.
split_percent = 0.80
split = int(split_percent*len(values))

train, test = values[:split], values[split:]
date_train, date_test = df.index[:split], df.index[split:]
date_prediction = df.index[split+look_back:]

# Each sample is a window of `look_back` readings; the target is taken from
# the same series.
train_generator = TimeseriesGenerator(train,train,length=look_back,batch_size=20)
test_generator = TimeseriesGenerator(test,test,length=look_back,batch_size=1)

# --- model ------------------------------------------------------------
model = Sequential([
    LSTM(50,activation='relu',input_shape=(look_back,1)),
    Dense(1),
])
model.compile(optimizer='adam',loss='mae')

num_epochs = 50

history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=num_epochs,
    verbose=0,
)

# --- learning curves --------------------------------------------------
pyplot.plot(history.history['loss'],label='Training loss')
pyplot.plot(history.history['val_loss'],label='Validation loss')
pyplot.legend()
pyplot.show()

In [None]:
# Forecast every window produced by the test generator.
prediction = model.predict(test_generator)

# Map the scaled arrays back to original units and flatten them to 1-D.
# NOTE: `train` and `test` are overwritten here, so this cell cannot be run
# twice without re-running the preparation cell first.
train = scaler.inverse_transform(train).reshape(-1)
test = scaler.inverse_transform(test).reshape(-1)
prediction = scaler.inverse_transform(prediction).reshape(-1)

# Training data, forecasts (starting `look_back` readings into the test
# period) and the full test series.
data_trace = go.Scatter(x=date_train,y=train,mode='lines',name='Data')
prediction_trace = go.Scatter(x=date_prediction,y=prediction,mode='lines',name='Prediction')
truth_trace = go.Scatter(x=date_test,y=test,mode='lines',name='Ground Truth')
layout = go.Layout(title="nuovi_positivi",xaxis={'title':"Date"},yaxis={'title':"nuovi_positivi"})

fig = go.Figure(data=[data_trace,prediction_trace,truth_trace],layout=layout)
fig.show()

# The first `look_back` test readings only serve as lag inputs for the first
# forecast (so that no training data is needed at test time); they have no
# matching prediction and are therefore excluded from the MAE.
print('mae:')
print(mean_absolute_error(test[look_back:],prediction))


# Multivariate LSTM

In [None]:
# All epidemiological columns except 'tamponi', scaled column-wise to [0, 1].
df = frame_interesting_columns.drop(columns='tamponi')
scaler = MinMaxScaler(feature_range=(0,1))
scaled = scaler.fit_transform(df)

# Window length and forecast horizon (in readings).
look_back = 7
n_future = 1

# Chronological 80/20 split of the scaled matrix.
split_percent = 0.80
split = int(split_percent*len(scaled))
train, test = scaled[:split], scaled[split:]

# Position of the target column inside the scaled matrix.
column_to_predict = 'nuovi_positivi'
index_to_predict = df.columns.get_loc(column_to_predict)

# Dates matching the train rows, the test rows and the forecasts.
date_train, date_test = df.index[:split], df.index[split:]
date_prediction = df.index[split+look_back:]

In [None]:
# Sliding windows: each sample holds `look_back` consecutive rows (all
# columns); its target is the value of the target column `n_future` rows
# after the window, kept as a length-1 slice so the targets are 2-D.
train_X = np.array([
    train[i - look_back:i]
    for i in range(look_back,len(train) - n_future + 1)
])
train_y = np.array([
    train[i + n_future - 1:i + n_future,index_to_predict]
    for i in range(look_back,len(train) - n_future + 1)
])

test_X = np.array([
    test[i - look_back:i]
    for i in range(look_back,len(test) - n_future + 1)
])
test_y = np.array([
    test[i + n_future - 1:i + n_future,index_to_predict]
    for i in range(look_back,len(test) - n_future + 1)
])

# Two stacked LSTM layers, dropout, and a dense head sized to the target width.
model = Sequential([
    LSTM(64,activation='relu',input_shape=(train_X.shape[1],train_X.shape[2]),return_sequences=True),
    LSTM(32,activation='relu',return_sequences=False),
    Dropout(0.2),
    Dense(train_y.shape[1]),
])

model.compile(optimizer='adam',loss='mse')

# `num_epochs` comes from the univariate TimeseriesGenerator section above.
history = model.fit(
    train_X,
    train_y,
    epochs=num_epochs,
    batch_size=16,
    validation_split=0.2,
    verbose=0,
)

pyplot.plot(history.history['loss'],label='Training loss')
pyplot.plot(history.history['val_loss'],label='Validation loss')
pyplot.legend()

In [None]:
prediction = model.predict(test_X)

# The scaler was fitted on every column of `df`, so a single-column array is
# tiled to the full width before inverse_transform and the target column is
# read back afterwards.
prediction_copies = np.repeat(prediction,df.shape[1],axis=-1)
test_copies = np.repeat(np.asarray(test_y).reshape(-1,1),df.shape[1],axis=-1)

# Read the de-scaled values at `index_to_predict`: column 0 (used originally)
# applies the min/max of a different feature and returns wrong units.
prediction_descaled = scaler.inverse_transform(prediction_copies)[:,index_to_predict]
test_descaled = scaler.inverse_transform(test_copies)[:,index_to_predict]

# Only the training rows, so the series has the same length as date_train.
train = scaler.inverse_transform(scaled[:split])[:,index_to_predict].reshape(-1)
test = test_descaled.reshape((-1))
prediction = prediction_descaled.reshape((-1))

# `test` holds the targets of the test windows, which start `look_back`
# readings into the test period: it shares its dates with the predictions.
data_trace = go.Scatter(x=date_train,y=train,mode='lines',name='Data')
prediction_trace = go.Scatter(x=date_prediction,y=prediction,mode='lines',name='Prediction')
truth_trace = go.Scatter(x=date_prediction,y=test,mode='lines',name='Ground Truth')
layout = go.Layout(title="nuovi_positivi",xaxis={'title':"Date"},yaxis={'title':"nuovi_positivi"})

fig = go.Figure(data=[data_trace,prediction_trace,truth_trace],layout=layout)
fig.show()

# The first `look_back` test readings are only used as lag inputs, so the
# error is computed on the window targets. Both sides are de-scaled; the
# original compared scaled targets with de-scaled predictions.
print('mae:')
print(mean_absolute_error(test,prediction))

print(len(train))
print(len(test))

# Multivariate LSTM with TimeseriesGenerator

In [None]:
# All epidemiological columns except 'tamponi', scaled column-wise to [0, 1].
df = pd.DataFrame(frame_interesting_columns.drop('tamponi',axis=1))
scaler = MinMaxScaler(feature_range=(0,1))
scaled = scaler.fit_transform(df.values)

values = scaled

column_to_predict = 'nuovi_positivi'
index_to_predict = df.columns.get_loc(column_to_predict)

look_back = 7

# X contains all columns with the last `look_back` readings removed.
# y contains the column to be predicted without the first `look_back` readings.
# NOTE(review): TimeseriesGenerator already pairs a window data[i-length:i]
# with targets[i]; with y shifted by `look_back` as well, each target lies
# look_back + 1 readings after the last reading of its window — confirm that
# this forecast horizon is intended.
X = values[:-look_back]
y = values[look_back:,index_to_predict]

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.20,random_state=42,shuffle=False)

# Derive the split position from the partition actually produced above rather
# than reusing `split` from an earlier section. y[j] is the reading at
# df.index[j + look_back], so the first test target sits at this position.
split = len(train_X) + look_back

date_train = df.index[:split]
date_test = df.index[split:]
date_prediction = df.index[split+look_back:]

train_generator = TimeseriesGenerator(train_X,train_y,sampling_rate=1,length=look_back,batch_size=look_back)
test_generator = TimeseriesGenerator(test_X,test_y,sampling_rate=1,length=look_back,batch_size=look_back)

# Two stacked LSTM layers, dropout, and a single-output dense head.
model = Sequential()
model.add(LSTM(64,activation='relu',input_shape=(look_back,scaled.shape[1]),return_sequences=True))
model.add(LSTM(32,activation='relu',return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam',loss='mse')
model.summary()

In [None]:
# Train on the generator batches, validating on the test generator.
# `num_epochs` is defined in the univariate TimeseriesGenerator section, so
# that cell must have been run first.
history = model.fit(train_generator,validation_data=test_generator,epochs=num_epochs,verbose=0)

# Learning curves for the training and validation loss.
pyplot.plot(history.history['loss'],label='Training loss')
pyplot.plot(history.history['val_loss'],label='Validation loss')
pyplot.legend()

In [None]:
prediction = model.predict(test_generator)

# The scaler was fitted on every column of `df`, so a single-column array is
# tiled to the full width before inverse_transform and the target column is
# read back afterwards.
prediction_copies = np.repeat(prediction,df.shape[1],axis=-1)
test_copies = np.repeat(np.asarray(test_y).reshape(-1,1),df.shape[1],axis=-1)

# Read the de-scaled values at `index_to_predict`: column 0 (used originally)
# applies the min/max of a different feature and returns wrong units.
prediction_descaled = scaler.inverse_transform(prediction_copies)[:,index_to_predict]
test_descaled = scaler.inverse_transform(test_copies)[:,index_to_predict]

# Trim the de-scaled target series to the training dates so x and y match.
train = scaler.inverse_transform(values)[:len(date_train),index_to_predict].reshape(-1)
test = test_descaled.reshape((-1))
prediction = prediction_descaled.reshape((-1))

# `test_y` is the unshuffled tail of the target series and the predictions
# cover its last len(prediction) entries, so both are dated from the end of
# the index; this keeps the traces aligned with the actual split.
data_trace = go.Scatter(x=date_train,y=train,mode='lines',name='Data')
prediction_trace = go.Scatter(x=df.index[-len(prediction):],y=prediction,mode='lines',name='Prediction')
truth_trace = go.Scatter(x=df.index[-len(test):],y=test,mode='lines',name='Ground Truth')
layout = go.Layout(title="nuovi_positivi",xaxis={'title':"Date"},yaxis={'title':"nuovi_positivi"})

fig = go.Figure(data=[data_trace,prediction_trace,truth_trace],layout=layout)
fig.show()

# The first `look_back` test targets have no matching prediction (the
# generator needs that many readings to build its first window), so they are
# excluded from the MAE.
print('mae:')
print(mean_absolute_error(test[look_back:],prediction))