In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

# fix for 'package not found' when installing in Anaconda environment
if 'google.colab' not in str(get_ipython()):
    import pip
    pip.main(['install', 'xgboost'])

if 'google.colab' in str(get_ipython()):
  !rm util.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/util.py

from xgboost import XGBRegressor
from util import select_relevant_rows, select_attributes, read_movement_data, read_multiple_csv,download_updated_mobility_data, download_updated_mobility_data, time_series_cross_validation, train_and_predict, select_time_slot, series_to_supervised

In [None]:
# Where the per-region COVID-19 CSV can be found: the upstream repository and
# a local checkout. Only the remote copy is read in this cell.
remote_region_path = r'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
local_region_path = r'../COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv'

# Load the full table straight from the remote URL.
regions_frame = pd.read_csv(filepath_or_buffer=remote_region_path)

In [None]:
# Column to match on and the value to look for in it.
attribute_focus, region_focus = 'denominazione_regione', 'Emilia-Romagna'

# select_relevant_rows comes from util; presumably it keeps the rows whose
# `attribute_focus` value equals `region_focus` -- confirm against util.py.
region_focus_data = select_relevant_rows(regions_frame, attribute_focus, region_focus)

In [None]:
# Keep only the columns used further down. The trailing comments are literal
# English translations of the Italian column names.
frame_interesting_columns = select_attributes(
    region_focus_data,
    [
        'data',                        # date
        'ricoverati_con_sintomi',      # hospitalised with symptoms
        'terapia_intensiva',           # intensive care
        'totale_ospedalizzati',        # total hospitalised
        'variazione_totale_positivi',  # change in total positives
        'nuovi_positivi',              # new positives
        'deceduti',                    # deceased
        'tamponi',                     # swabs
        'ingressi_terapia_intensiva',  # intensive-care admissions
    ],
)

In [None]:
# Quick sanity check: show the first rows of the selected columns.
frame_interesting_columns.head()

In [None]:
# Normalise the 'data' column to plain 'YYYY-MM-DD' strings (parse first, then
# format) and replace every missing value with 0.
frame_interesting_columns = (
    pd.DataFrame(frame_interesting_columns)
    .assign(data=lambda frame: pd.to_datetime(frame['data']).dt.strftime(r'%Y-%m-%d'))
    .fillna(0)
)


In [None]:
# Remote sources of the Google mobility reports.
mobility_data_url = r'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
mobility_data_zip_url = r'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'

# Local counterparts, relative to the notebook's working directory.
file_path = r'../Global_Mobility_Report.csv'
zip_path = r'../Region_Mobility_Report_CSVs.zip'
region_mobility_path = r'../Region_Mobility_Report_CSVs'

# Both helpers are imported from util; going by their names the first one
# refreshes the local files and the second one reads the Italian regional
# report for the chosen region -- see util.py for the exact behaviour.
download_updated_mobility_data(
    mobility_data_url, file_path, region_mobility_path, mobility_data_zip_url, zip_path
)

mobility_df = read_movement_data(
    region_mobility_path, 'IT_Region_Mobility_Report', region='Emilia-Romagna'
)

In [None]:
# Restrict the mobility report to the date plus the six
# percent-change-from-baseline series, replacing missing values with 0.
mobility_df = mobility_df.loc[
    :,
    [
        'date',
        'retail_and_recreation_percent_change_from_baseline',
        'grocery_and_pharmacy_percent_change_from_baseline',
        'parks_percent_change_from_baseline',
        'transit_stations_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
        'residential_percent_change_from_baseline',
    ],
].fillna(0)

In [None]:
# Put both frames on a shared 'date' index so they can be joined day by day.
# Reassignment is used instead of inplace=True; the later plotting cells rely
# on 'date' being the index (not a column) of both frames.
frame_interesting_columns = (
    frame_interesting_columns
    .rename(columns={'data': 'date'})
    .set_index('date')
)
mobility_df = mobility_df.set_index('date')

# Inner join on the common 'date' index level; gaps are filled with 0.
merged = pd.merge(
    frame_interesting_columns,
    mobility_df,
    on='date'
    ).fillna(0)

# set_index returns a NEW frame: the result has to be assigned back, otherwise
# `merged` silently keeps its string index.
merged = merged.set_index(pd.DatetimeIndex(merged.index))
merged.head()

In [None]:
# One panel per column of the epidemiological frame, on a 4x2 grid.
fig, axes = pyplot.subplots(nrows=4, ncols=2, figsize=(10, 8))

# zip pairs each axis with a column and simply stops at the shorter of the two,
# so a frame with fewer columns than panels no longer raises IndexError.
for ax, column in zip(axes.flatten(), frame_interesting_columns.columns):
    # Plot the raw values against their position (0..n-1), as before.
    ax.plot(frame_interesting_columns[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
# One panel per column of the mobility frame, on a 3x2 grid.
fig, axes = pyplot.subplots(nrows=3, ncols=2, figsize=(10, 8))

# zip pairs each axis with a column and simply stops at the shorter of the two,
# so a frame with fewer columns than panels no longer raises IndexError.
for ax, column in zip(axes.flatten(), mobility_df.columns):
    # Plot the raw values against their position (0..n-1), as before.
    ax.plot(mobility_df[column].to_numpy())
    ax.set_title(column)

pyplot.tight_layout()


In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of shifted columns.

    Every column of ``data`` is repeated once per time offset: first the
    ``n_in`` past observations (t-n_in ... t-1), then the ``n_out`` current and
    future ones (t, t+1 ... t+n_out-1). Resulting columns are named
    ``<column>(t-k)``, ``<column>(t)`` and ``<column>(t+k)``.

    NOTE: this definition shadows the ``series_to_supervised`` name imported
    from ``util`` in the first cell.

    Args:
        data: a DataFrame, or anything ``pd.DataFrame`` accepts (list, Series,
            2-D array). Column labels are taken from the resulting frame, so
            unnamed inputs get the default integer labels.
        n_in: number of lagged steps to add as columns.
        n_out: number of steps, starting at t, to add as columns.
        dropnan: when true, drop every row containing a NaN (this removes the
            incomplete rows created by shifting, as well as rows that already
            had NaN in ``data``).

    Returns:
        DataFrame with ``n_vars * (n_in + n_out)`` columns.
    """
    # Build the frame first and take labels from it: `data` itself may be a
    # list or array without a `.columns` attribute.
    df = pd.DataFrame(data)
    labels = list(df.columns)

    cols, names = [], []
    # Past observations: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        # %s formatting also copes with non-string labels (e.g. 0, 1, ...).
        names += ['%s(t-%d)' % (label, i) for label in labels]
    # Present and future observations: t, t+1, ..., t+n_out-1
    for i in range(n_out):
        cols.append(df.shift(-i))
        suffix = '(t)' if i == 0 else '(t+%d)' % i
        names += ['%s%s' % (label, suffix) for label in labels]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg

# Supervised framing of the merged table: 15 lagged steps (t-15 ... t-1)
# followed by 5 steps starting at t (t ... t+4).
reframed = series_to_supervised(merged, n_in=15, n_out=5)

# Preview the first rows of the reframed table.
reframed.head()