In [1]:
import os
import re
import sys
import json
import codecs
import locale
import logging
import requests
import datetime as dt
from typing import Union, Optional, Tuple, List, cast

import tabula
from tabula import read_pdf

import pandas as pd
import numpy as np


In [3]:
#----------------------------------------------------------------
# Configurations section
#----------------------------------------------------------------

# Url of the pdf file to download:
#url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-latest.pdf"

#----------------------------------------------------------------
# Scratch directory, run timestamp and data file locations.
# Every name defined here is read by the cells further down.
#----------------------------------------------------------------

# Scratch directory where the downloaded pdf files are written.
temp_content_dir = os.path.join(os.sep, 'tmp')

# Timestamp of this run, formatted once for display and once as a
# compact tag used in the names of the dated csv files.
now = dt.datetime.now()
sample_date = now.strftime("%d/%m/%Y")
file_date_tag = now.strftime("%Y%m%d")

# Switch the process to the Italian locale.
# NOTE(review): presumably required by the DataDownloader helpers to parse
# the Italian dates of the pdf reports - confirm against that module.
# The locale may not be installed on every machine: warn instead of
# aborting, so that the path constants below are always defined and the
# cells that only read the csv files keep working.
try:
    locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')
except locale.Error as locale_error:
    print("WARNING: locale 'it_IT.UTF-8' is not available ({e}); "
          "keeping the current locale.".format(e=locale_error),
          file=sys.stderr)

# HTTP status codes treated as a successful response.
ok_statuses = [200, 201, 202]

# Directory holding the csv data files.
data_file_path = os.path.join("..", "data")
#data_file_path = os.path.join(os.sep,"tmp")

# Cumulative per-region csv file.
csv_data_file = os.path.join(data_file_path, "italy-regions.csv")

# Default local name for a downloaded pdf (later cells overwrite it).
pdf_file_name = os.path.join(temp_content_dir, "temp_data_file.pdf")

# National data: cumulative file and the dated file of this run.
it_data_file = os.path.join(data_file_path, "virus-it.csv")
it_tmp_data_file = os.path.join(data_file_path, "virus-it-{dt}.csv".format(dt=file_date_tag))

# Lombardia data: cumulative file and the dated file of this run.
lomb_data_file = os.path.join(data_file_path, "virus-lombardia.csv")
lomb_tmp_data_file = os.path.join(data_file_path, "virus-lombardia-{dt}.csv".format(dt=file_date_tag))



In [5]:
#----------------------------------------------------------------
# Make the project's 'src' directory importable, then pull in the
# DataDownloader helpers that download the pdf report and turn it
# into a dataframe.
#----------------------------------------------------------------

module_path = os.path.abspath(os.path.join('..', 'src'))
print(module_path)

# Register the directory only once, so re-running the cell does not
# keep growing sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from DataDownloader import (
    create_dataframe,
    get_web_file,
    save_content_to_file,
    pdf_to_dataframe,
    translate_to_date,
    refactor_region_df,
    save_df_to_csv,
    init_logger,
)



/home/giovanni/code-personal/python/CoronaVirus/src


In [12]:
from IPython.display import clear_output
clear_output(wait=True)

# Configure logging: DEBUG level for the log file in the scratch
# directory, ERROR level for what is echoed to standard output.
# NOTE(review): the captured output repeats every log line several times,
# which suggests handlers pile up when this cell is re-run - confirm in
# DataDownloader.init_logger.
init_logger(temp_content_dir, "virus-notebook.log", log_level=logging.DEBUG, std_out_log_level=logging.ERROR)

# Columns written to the csv file that feeds the report charts.
columns_report_charts = [
    "REPORT DATE",
    "Regione",
    "Ricoverati con sintomi",
    "Terapia intensiva",
    "Totale attualmente positivi",
    "Isolamento domiciliare",
    "CASI TOTALI - A",
    "Totale tamponi effettuati",
]

# Pdf format change since: 2020-12-03.
pdf_file_name = "dpc-covid19-ita-scheda-regioni-20201203.pdf"

# Local copy of the pdf and the url it is downloaded from.
pdf_file = os.path.join(temp_content_dir, pdf_file_name)
pdf_url = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

#rv, region_df = create_dataframe(pdf_url=pdf_url, local_file_path=pdf_file, pdf_version="v1")
# After 2020/12/03:
rv, region_df = create_dataframe(pdf_url=pdf_url, local_file_path=pdf_file, pdf_version="v6")

# Fail loudly here: the following cells read region_df and would otherwise
# break with a far less helpful error.
assert rv, "Unable to create the dataframe from the pdf file."

data_file = os.path.join(data_file_path, "virus_data_file.csv")
# The outcome is kept in rv, as before.
# NOTE(review): presumably a success flag like the other helpers - confirm
# before asserting on it.
rv = save_df_to_csv(region_df, data_file, columns_report_charts, "REPORT DATE")


2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,891 data_downloader INFO create_dataframe >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,897 save_df_to_csv INFO get_web_file >>
2020-12-27 19:41:54,904 save_df_to_csv INFO Url: https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-20201203.pdf
2020-12-27 19:41:54,904 save_df_to_csv INFO Url: https://raw.githubuse

In [8]:
# Columns used by the report charts.
columns_report_charts = [
    "REPORT DATE",
    "Regione",
    "Ricoverati con sintomi",
    "Terapia intensiva",
    "Totale attualmente positivi",
    "Isolamento domiciliare",
    "CASI TOTALI - A",
    "Totale tamponi effettuati",
]

# Preview the first three rows restricted to those columns.
report_view = region_df.loc[:, columns_report_charts]
report_view.head(3)

Unnamed: 0,REPORT DATE,Regione,Ricoverati con sintomi,Terapia intensiva,Totale attualmente positivi,Isolamento domiciliare,CASI TOTALI - A,Totale tamponi effettuati
0,2020-12-03,Lombardia,7025,836,118331,110470,419015,4179837
1,2020-12-03,Piemonte,4459,366,70006,65181,172931,1619813
2,2020-12-03,Campania,2041,162,102574,100371,160569,1639868


In [6]:
#----------------------------------------------------------------
# Download the new dataset and build the refactored per-region
# dataframe (df_regions).
# The step that appends the new data to the csv file is currently
# disabled (see the commented-out code at the bottom).
#----------------------------------------------------------------

pdf_file_name = "dpc-covid19-ita-scheda-regioni-20201202.pdf"

# Pdf format change since: 2020-06-25.
#pdf_file_name = "dpc-covid19-ita-scheda-regioni-20200623.pdf"

# Local copy of the pdf and the url it is downloaded from.
pdf_file = os.path.join(temp_content_dir, pdf_file_name)

url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

df_regions = None
report_date = None

# Each step is checked on its own, so that a failure is reported with the
# message of the step that actually failed.

# Step 1: download; result[0] is the success flag, result[1] the content.
result = get_web_file(url_region_pdf)
assert result[0], "File download failure."

# Step 2: store the downloaded content in the scratch directory.
rv = save_content_to_file(pdf_file, result[1])
assert rv, "Unable to save the downloaded pdf file."

# Step 3: parse the pdf into a dataframe and read its report date.
rv, df, report_date = pdf_to_dataframe(pdf_file)
assert rv, "Unable to transform pdf to dataframe."
df_regions = df

assert report_date is not None, "Unable to read data sample date."

# Step 4: refactor the dataframe according to the "v6" pdf layout.
rv, df_regions = refactor_region_df(df_regions, report_date, "v6")
assert rv, "Cannot refactor the dataframe."

#df_regions.sort_values(by=['REPORT DATE'], inplace=True)    
# if os.path.isfile(csv_data_file) == False:
#    df_regions.to_csv(csv_data_file, header = True, index=False)
#else:     
#    df_regions.to_csv(csv_data_file, mode='a', header = False, index=False)
    
    

NameError: name 'get_web_file' is not defined

In [7]:
# Load the cumulative csv file and summarise it: shape plus the first and
# last report date it contains.
print(csv_data_file)
df = pd.read_csv(csv_data_file, sep=',')
report_dates = df["REPORT DATE"]
(df.shape, report_dates.min(), report_dates.max())

../data/italy-regions.csv


((5481, 16), '2020-03-16', '2020-12-01')

In [17]:
# Display the per-region dataframe returned by refactor_region_df.
df_regions

Unnamed: 0,Regione,Ricoverati con sintomi,Terapia intensiva - TOTALE,Terapia intensiva - INGRESSI del GIORNO,Isolamento domiciliare,Totale attualmente positivi,DIMESSI/GUARITI,DECEDUTI,CASI TOTALI - A,INCREMENTO CASI TOTALI (rispetto al giorno precedente),Totale persone testate,Totale tamponi effettuati,INCREMENTO TAMPONI,REPORT DATE
0,Lombardia,7025,836,32,110470,118331,278058,22626,419015,3751,2383936,4179837,36271,2020-12-03
1,Piemonte,4459,366,10,65181,70006,96473,6452,172931,2230,982816,1619813,26163,2020-12-03
2,Campania,2041,162,0,100371,102574,56177,1818,160569,2295,11187787,1639868,24709,2020-12-03
3,Veneto,2501,307,30,67795,70603,79905,3982,154490,3581,1077351,2841581,21636,2020-12-03
4,Emilia-Romagna,2668,245,14,67324,70237,51674,5959,127870,1766,1097761,2174960,17979,2020-12-03
5,Lazio,3233,364,23,89282,92879,29616,2514,125009,1769,1852335,2288573,22793,2020-12-03
6,Toscana,1467,272,18,33691,35430,67618,2756,105804,929,971080,1606027,14469,2020-12-03
7,Sicilia,1465,221,15,38094,39780,26432,1650,67862,1294,674771,992813,10581,2020-12-03
8,Puglia,1621,226,33,39902,41749,15912,1588,59249,1602,555325,812781,8753,2020-12-03
9,Liguria,932,102,4,10424,11458,38887,2449,52794,422,294667,614263,4957,2020-12-03


In [176]:
#----------------------------------------------------------------
# Download an OLD-format (version v5) dataset and append the new
# data in the csv files.
#----------------------------------------------------------------

# Pdf format change since: 2020-06-25 (v2).
# Pdf format change since: 2020-06-06 (v3).
# Pdf format change since: 2020-04-   (v4).
# Pdf format change since: 2020-04-08 (v5).
pdf_file_name = "dpc-covid19-ita-scheda-regioni-20200426.pdf"

# Local copy of the pdf and the url it is downloaded from.
pdf_file = os.path.join(temp_content_dir, pdf_file_name)

url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

df_regions = None
report_date = None

# Each step is checked on its own, so that a failure is reported with the
# message of the step that actually failed.

# Step 1: download; result[0] is the success flag, result[1] the content.
result = get_web_file(url_region_pdf)
assert result[0], "File download failure."

# Step 2: store the downloaded content in the scratch directory.
rv = save_content_to_file(pdf_file, result[1])
assert rv, "Unable to save the downloaded pdf file."

# Step 3: parse the pdf into a dataframe and read its report date.
rv, df, report_date = pdf_to_dataframe(pdf_file)
assert rv, "Unable to transform pdf to dataframe."
df_regions = df

#report_date = None
assert report_date is not None, "Unable to read data sample date."

# Step 4: refactor the dataframe according to the "v5" pdf layout.
# Nothing is appended to the csv file unless the refactoring succeeded.
rv, df_regions = refactor_region_df(df_regions, report_date, "v5")
assert rv, "Cannot refactor the dataframe."

# Step 5: append to the cumulative csv file, creating it (with the header
# row) when it does not exist yet.
# NOTE(review): re-running this cell appends the same report date again,
# and the append assumes the column order of the existing file.
df_regions = df_regions.sort_values(by=['REPORT DATE'])
csv_exists = os.path.isfile(csv_data_file)
df_regions.to_csv(csv_data_file,
                  mode='a' if csv_exists else 'w',
                  header=not csv_exists,
                  index=False)
    

get_web_file >>
Url: https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-20200426.pdf
get_web_file (True) <<
pdf_to_dataframe (/tmp/dpc-covid19-ita-scheda-regioni-20200426.pdf) >>
RDate: 2020-04-26 00:00:00
pdf_to_dataframe (rv=True - report_date=2020-04-26 00:00:00) <<
refactor_region_df (v5 - 2020-04-26 00:00:00) >>
refactor_region_df (True) <<


In [13]:
#----------------------------------------------------------------
# Data quality tests.
# Every month listed below must have one distinct report date per
# calendar day in the cumulative csv file.
#----------------------------------------------------------------
df_test = pd.read_csv(csv_data_file, sep=',')

# One row per distinct report date: column 'index' holds the date string,
# column 'REPORT DATE' the number of csv rows carrying that date.
# The axis and the counts are named explicitly because the names produced
# by value_counts().reset_index() changed in pandas 2.0 (there is no
# 'index' column any more), which made the original lookups fail.
rep_date_count = (
    df_test['REPORT DATE']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='REPORT DATE')
    .sort_values(by=['index'])
)
#rep_date_count['REPORT DATE'] = rep_date_count['REPORT DATE'].astype(str)
#rep_date_count.loc[rep_date_count['REPORT DATE'] < 21]

# (month prefix, expected number of report dates, assertion message),
# checked in this order.
expected_days_per_month = [
    #("2020-11", 30, "Novembre"),
    ("2020-10", 31, "Ottobre"),
    ("2020-09", 30, "Settembre"),
    ("2020-08", 31, "Agosto"),
    ("2020-07", 31, "Luglio"),
    ("2020-06", 30, "Giugno"),
    ("2020-05", 31, "Maggio"),
    ("2020-04", 30, "Aprile"),
]

for month_prefix, expected_days, month_name in expected_days_per_month:
    # Count the distinct report dates that belong to the month.
    days_found = int(rep_date_count['index'].str.startswith(month_prefix).sum())
    assert days_found == expected_days, month_name

In [34]:
csv_data_file_reclaimed = os.path.join(data_file_path, "italy-regions-01.csv")

# Choose the file to repair: un-comment the first line to work on the
# reclaimed copy instead of the cumulative csv file.
#file_to_load = csv_data_file_reclaimed
file_to_load = csv_data_file

print("File to load: {f}".format(f=file_to_load))
df = pd.read_csv(file_to_load, sep=',')

df = df.sort_values(by=["REPORT DATE"])

# Row selectors: the report date to repair and, for ad-hoc inspection,
# the Lombardia rows of that date.
mask_region = df["Regione"] == "Lombardia"
mask_date = df["REPORT DATE"] == "2020-04-04"
mask_all = (mask_region) & (mask_date)

increment_col = "INCREMENTO CASI TOTALI (rispetto al giorno precedente)"
swabs_col = "Totale tamponi effettuati"

selected_cols = ["REPORT DATE"
                ,"Regione","Ricoverati con sintomi","Terapia intensiva", "Totale attualmente positivi"
                ,"DECEDUTI"
                ,"CASI TOTALI - A", increment_col
                ,swabs_col
                ]

reclaim = True
# Reclaim: fix the column swap. On the selected date the value stored in
# the increment column is moved to the swabs column, then the increment
# column and the two 'Casi identificati...' columns are blanked.
if reclaim:
    print("Swapping columns ...")
    # Move only the rows whose increment column still holds a value.
    # Once the repaired frame has been written back to the same file, the
    # increment column is empty, and an unguarded re-run would overwrite
    # the recovered swab totals with NaN.
    mask_to_fix = mask_date & df[increment_col].notna()
    df.loc[mask_to_fix, swabs_col] = df.loc[mask_to_fix, increment_col]
    df.loc[mask_date, increment_col] = np.nan
    df.loc[mask_date, "Casi identificatidal sospettodiagnostico"] = np.nan
    df.loc[mask_date, "Casi identificatida attività discreening"] = np.nan

#df[selected_cols].head(100)
df.loc[mask_date, selected_cols].head(100)
#df.loc[mask_all]

File to load: ../data/italy-regions.csv
Swapping columns ...


Unnamed: 0,REPORT DATE,Regione,Ricoverati con sintomi,Terapia intensiva,Totale attualmente positivi,DECEDUTI,CASI TOTALI - A,INCREMENTO CASI TOTALI (rispetto al giorno precedente),Totale tamponi effettuati
419,2020-04-04,Calabria,178,15.0,662,49,741.0,,12314.0
418,2020-04-04,Sardegna,123,24.0,789,41,874.0,,6789.0
417,2020-04-04,Umbria,167,44.0,927,41,1210.0,,11809.0
416,2020-04-04,Bolzano,291,61.0,1201,146,1592.0,,15045.0
415,2020-04-04,Basilicata,44,19.0,244,11,264.0,,2765.0
414,2020-04-04,Friuli VG,183,50.0,1336,145,1986.0,,21126.0
413,2020-04-04,Lombardia,12002,1326.0,27220,8656,49118.0,,141877.0
411,2020-04-04,Abruzzo,354,71.0,1356,153,1628.0,,12837.0
410,2020-04-04,Sicilia,553,74.0,1726,111,1932.0,,19896.0
412,2020-04-04,Valle d'Aosta,63,23.0,560,82,748.0,,2274.0


In [123]:
# (rows, columns) of the dataframe currently held in df.
df.shape

(5040, 16)

In [35]:
# Write the dataframe back to the file it was loaded from (the file is
# overwritten), then summarise what was written: shape plus the first and
# last report date.
print(file_to_load)
df.to_csv(file_to_load, index=False, header=True)
report_dates = df["REPORT DATE"]
(df.shape, report_dates.min(), report_dates.max())

../data/italy-regions.csv


((5145, 16), '2020-03-16', '2020-11-15')

((5082, 16), '2020-03-16', '2020-11-12')