In [None]:
import pandas as pd
import numpy as np
from datetime import date
import csv 
import requests
import glob
from pandas import DataFrame, concat
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import requests
import os.path, time

# fix for 'package not found' when installing in Anaconda environment
if 'google.colab' not in str(get_ipython()):
    import pip
    pip.main(['install', 'xgboost'])

if 'google.colab' in str(get_ipython()):
  !rm util.py
  !wget https://raw.githubusercontent.com/marco-mazzoli/progetto-tesi/master/util.py

from xgboost import XGBRegressor
from util import select_relevant_rows, select_attributes, read_movement_data, read_multiple_csv,download_updated_mobility_data, download_updated_mobility_data, train_and_predict, select_optimal_window


In [None]:
# Two locations for the per-region CSV of the pcm-dpc/COVID-19 repository:
# the raw GitHub URL and (presumably) a local checkout next to this project.
remote_region_path = r'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv'
local_region_path = r'../COVID-19/dati-regioni/dpc-covid19-ita-regioni.csv'

# Only the remote copy is read; local_region_path is not used in this cell.
regions_frame = pd.read_csv(filepath_or_buffer=remote_region_path)

In [None]:
# Column to match on and the value it must have.
attribute_focus = 'denominazione_regione'
region_focus = 'Emilia-Romagna'

# select_relevant_rows comes from util.py; presumably it keeps the rows of
# regions_frame whose `attribute_focus` column equals `region_focus` -- TODO confirm.
region_focus_data = select_relevant_rows(regions_frame, attribute_focus, region_focus)

In [None]:
# Display what iterating region_focus_data yields; assuming it is a DataFrame
# (TODO confirm against util.select_relevant_rows) these are its column labels.
list(region_focus_data)

In [None]:
# Attributes carried forward for modelling: the date column 'data' plus eight
# epidemiological measures. select_attributes is a util.py helper; presumably
# it returns region_focus_data restricted to these columns -- TODO confirm.
frame_interesting_columns = select_attributes(
    region_focus_data,
    [
        'data',
        'ricoverati_con_sintomi',
        'terapia_intensiva',
        'totale_ospedalizzati',
        'variazione_totale_positivi',
        'nuovi_positivi',
        'deceduti',
        'tamponi',
        'ingressi_terapia_intensiva',
    ],
)

In [None]:
# Preview the first rows of the selected attributes.
frame_interesting_columns.head()

In [None]:
# Re-wrap the selection in a DataFrame, then parse the 'data' column as
# datetimes and store it back as plain 'YYYY-MM-DD' strings.
frame_interesting_columns = pd.DataFrame(frame_interesting_columns)
frame_interesting_columns['data'] = (
    pd.to_datetime(frame_interesting_columns['data']).dt.strftime(r'%Y-%m-%d')
)


In [None]:
# 'tamponi' is cumulative (and 'deceduti' is handled the same way): replace
# each value with its difference from the previous row. The first row keeps
# its own value because the missing predecessor is treated as 0, and abs()
# makes any negative difference positive.
columns_to_normalize = ['deceduti', 'tamponi']

for column in columns_to_normalize:
    series = frame_interesting_columns[column]
    frame_interesting_columns[column] = (series - series.shift().fillna(0)).abs()


In [None]:
# Remote locations of the mobility reports (global CSV and per-region CSV
# archive) and the local paths they are stored under.
mobility_data_url = r'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
file_path = r'../Global_Mobility_Report.csv'
mobility_data_zip_url = r'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'
zip_path = r'../Region_Mobility_Report_CSVs.zip'
region_mobility_path = r'../Region_Mobility_Report_CSVs'

# util.py helper; presumably refreshes the local copies when needed -- TODO confirm.
download_updated_mobility_data(
    mobility_data_url,
    file_path,
    region_mobility_path,
    mobility_data_zip_url,
    zip_path
    )

# Read the mobility rows for the same region as the epidemiological data, so
# that changing region_focus cannot silently pair two different regions.
# NOTE(review): the name must be spelled the way the mobility report spells it;
# verify this when switching to a region other than 'Emilia-Romagna'.
mobility_df = read_movement_data(
    region_mobility_path,
    'IT_Region_Mobility_Report',
    region=region_focus
    )


In [None]:
# (rows, columns) of the regional mobility table.
mobility_df.shape

In [None]:
# Keep the date plus the six "<category>_percent_change_from_baseline" columns.
mobility_df = mobility_df[
    ['date'] + [
        f'{category}_percent_change_from_baseline'
        for category in (
            'retail_and_recreation',
            'grocery_and_pharmacy',
            'parks',
            'transit_stations',
            'workplaces',
            'residential',
        )
    ]
]

In [None]:
# Preview only the first rows instead of rendering the whole table.
mobility_df.head()

In [None]:
# Put both tables on a shared 'date' index and join them.
# No in-place mutation is used, and each set_index is guarded, so re-running
# this cell does not fail once 'date' has already been moved into the index.
frame_interesting_columns = frame_interesting_columns.rename(columns={'data': 'date'})
if 'date' in frame_interesting_columns.columns:
    frame_interesting_columns = frame_interesting_columns.set_index('date')
if 'date' in mobility_df.columns:
    mobility_df = mobility_df.set_index('date')

# on='date' resolves to the index level named 'date' on both sides; merge
# defaults to an inner join, so only dates present in both tables are kept.
merged = pd.merge(
    frame_interesting_columns,
    mobility_df,
    on='date'
    )

# Remaining missing values are replaced with 0.
merged = merged.fillna(0)

In [None]:
# Column labels of the merged table: the left frame's columns followed by the
# mobility columns.
list(merged)

In [None]:
# Raw array view of the merged table, kept available at notebook level.
values = merged.values


def plot_column_stack(frame, column_positions):
    """Draw one figure with a stacked line subplot per selected column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are plotted against row position.
    column_positions : sequence of int
        Positional indices of the columns to draw, from top to bottom.
        Each subplot is titled with the corresponding column label.
    """
    data = frame.values
    pyplot.figure()
    for row, position in enumerate(column_positions, start=1):
        pyplot.subplot(len(column_positions), 1, row)
        pyplot.plot(data[:, position])
        pyplot.title(frame.columns[position], y=0.5, loc='left')
    pyplot.show()


# Columns 0-7 and 8-13 are drawn as two separate figures (presumably the
# epidemiological columns first, then the mobility columns -- the split
# assumes the column order produced by the merge above).
plot_column_stack(merged, range(8))
plot_column_stack(merged, range(8, 14))

In [None]:
# try with mobility data
# The merged table (which includes the mobility columns) is the model input.
column_to_predict = 'terapia_intensiva'

# Passed straight to util.train_and_predict; presumably the number of days
# used and the number of predictions produced -- TODO confirm in util.py.
n_days, n_predictions = 375, 21

y_pred, y_test = train_and_predict(merged, column_to_predict, n_days, n_predictions)

In [None]:
# Score and plot the forecast obtained from the merged (with-mobility) table.
print('MAE: ', mean_absolute_error(y_test, y_pred))
pyplot.plot(y_test.values,label='real')
pyplot.plot(y_pred,label='predicted')
# Title the figure so it can be told apart from the no-mobility run.
pyplot.title(column_to_predict + ': real vs predicted (with mobility data)')
pyplot.ylabel(column_to_predict)
pyplot.legend(loc='best')
pyplot.show()

In [None]:
# try with no mobility data
# Same target and settings as the with-mobility run, but the model input is
# frame_interesting_columns alone.
column_to_predict = 'terapia_intensiva'

# Passed straight to util.train_and_predict; presumably the number of days
# used and the number of predictions produced -- TODO confirm in util.py.
n_days, n_predictions = 375, 21

y_pred, y_test = train_and_predict(frame_interesting_columns, column_to_predict, n_days, n_predictions)

In [None]:
# Score and plot the forecast obtained without the mobility columns.
print('MAE: ', mean_absolute_error(y_test, y_pred))
pyplot.plot(y_test.values,label='real')
pyplot.plot(y_pred,label='predicted')
# Title the figure so it can be told apart from the with-mobility run.
pyplot.title(column_to_predict + ': real vs predicted (no mobility data)')
pyplot.ylabel(column_to_predict)
pyplot.legend(loc='best')
pyplot.show()

In [None]:
# Run util.select_optimal_window for the same target column on both inputs:
# the merged table and the table without mobility columns.
# max_window_prediction is handed to the helper unchanged; presumably it is
# the largest window tried -- TODO confirm in util.py.
max_window_prediction = 25

result_with_mobility = select_optimal_window(merged, 'terapia_intensiva', max_window_prediction)
result_without_mobility = select_optimal_window(frame_interesting_columns, 'terapia_intensiva', max_window_prediction)

In [None]:
# Overlay the 'mae' entry of both window searches so the two inputs can be
# compared directly.
pyplot.plot(result_with_mobility['mae'],label='mobility')
pyplot.plot(result_without_mobility['mae'],label='no mobility')
pyplot.title('MAE with vs. without mobility data')
pyplot.ylabel('MAE')  # assumes the 'mae' entry holds mean absolute errors -- TODO confirm
pyplot.legend(loc='best')
pyplot.show()