In [1]:
import os
import re
import json
import codecs
import locale
import requests
import datetime
from typing import Any, Tuple, Dict

import tabula
from tabula import read_pdf

import pandas as pd
import numpy as np


In [2]:
#----------------------------------------------------------------
# Configurations section
#----------------------------------------------------------------

# Url of the pdf file to download (kept for reference only: the download
# cells below build their own url_region_pdf from the pdf file name):
#url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-latest.pdf"

#----------------------------------------------------------------
# Working folders, timestamps, locale and data file locations.
#----------------------------------------------------------------
# Scratch folder that receives the downloaded pdf files.
temp_content_dir = os.path.join(os.sep, 'tmp')

# Moment the notebook was started; it also stamps the dated csv file names.
now = datetime.datetime.now()
sample_date = now.strftime("%d/%m/%Y")

# The Italian locale is not installed on every host. A missing locale must
# not abort the whole configuration cell, so the failure is only reported.
try:
    locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')
except locale.Error as ex:
    print("Locale it_IT.UTF-8 not available, current locale left unchanged - {ex}".format(ex=ex))

# HTTP status codes accepted as a successful download.
ok_statuses = [200, 201, 202]

# Folder holding the csv data files.
data_file_path = os.path.join("..","data")
#data_file_path = os.path.join(os.sep,"tmp")

# Per-region csv file fed by the download cells.
csv_data_file = os.path.join(data_file_path, "italy-regions.csv")

# Default location of the downloaded pdf (the download cells assign their
# own value to pdf_file_name).
pdf_file_name = os.path.join(os.sep, "tmp", "temp_data_file.pdf")

# National data: main file and the copy stamped with today's date.
it_data_file = os.path.join(data_file_path, "virus-it.csv")
it_tmp_data_file = os.path.join(data_file_path, "virus-it-{dt}.csv".format(dt=now.strftime("%Y%m%d")))

# Lombardia data: main file and the copy stamped with today's date.
lomb_data_file = os.path.join(data_file_path, "virus-lombardia.csv")
lomb_tmp_data_file = os.path.join(data_file_path, "virus-lombardia-{dt}.csv".format(dt=now.strftime("%Y%m%d")))



In [3]:
#----------------------------------------------------------------
#
#----------------------------------------------------------------

def get_web_file(url:str, timeout=None):
    """
    Download the resource found at ``url`` and return its raw body.

    :param url: address of the file to download.
    :param timeout: optional timeout in seconds handed to ``requests.get``;
                    the default ``None`` keeps the previous behaviour
                    (no timeout).
    :return (rv, content): ``rv`` is True only when the request completed
                    and the status code is listed in ``ok_statuses``;
                    ``content`` is then the response body (bytes),
                    otherwise None.
    """
    print("get_web_file >>")
    print("Url: {u}".format(u=url))
    rv = False
    content = None
    try:
        # Request the address received as argument, not a notebook global,
        # so the function works with whatever url the caller passes.
        response = requests.get(url, timeout=timeout)
        if response.status_code in ok_statuses:
            content = response.content
            rv = True
        else:
            # A completed request with an unexpected status is a failure:
            # rv stays False and no content is handed back.
            print("Get data failed. Received error code: {er}".format(er=str(response.status_code)))
    except Exception as ex:
        print("get_web_file failed - {ex}".format(ex=ex))
    print("get_web_file ({rv}) <<".format(rv=rv))
    return (rv, content)
        
def save_content_to_file(file_name, content):
    """
    Write ``content`` to ``file_name`` in binary mode.

    :param file_name: path of the file to create or overwrite.
    :param content: bytes to store in the file.
    :return rv: True when the write succeeded; False when any exception
                was raised (the exception is printed, not propagated).
    """
    try:
        with open(file_name, "wb") as out_file:
            out_file.write(content)
    except Exception as err:
        print("save_content_to_file failed - {ex}".format(ex=err))
        return False
    return True
   
def pdf_to_dataframe(pdf_file_name:str) -> Tuple[bool, Any, Any]:
    """
    Extract the per-region rows and the report date from a summary pdf.

    The pdf is converted by tabula into a csv file stored next to it (same
    name, ``.csv`` extension). The csv is then scanned line by line: lines
    from the one starting with "Lombardia" up to (excluding) the one
    starting with "TOTALE" are cleaned and collected, and the report date
    is read from the line holding one of the known date markers.

    :param pdf_file_name: path of the pdf file to parse.
    :return rv, df, report_date: ``rv`` is True when no exception occurred;
            ``df`` is a dataframe with one row per collected line and one
            text cell per comma separated field (None on failure);
            ``report_date`` is a datetime, or None when no date could be
            read.
    """
    print("pdf_to_dataframe ({fn}) >>".format(fn=pdf_file_name))
    rv = False
    df = None
    report_date = None
    try:
        csv_file = os.path.splitext(pdf_file_name)[0] + ".csv"
        tabula.convert_into(pdf_file_name, csv_file, output_format="csv", pages='all')
        list_reg = []
        # A blank between two digits is turned into a comma. Look-arounds
        # leave the digits unconsumed, so runs of short numbers such as
        # "0 0 12" are split at every blank.
        cell_gap = re.compile(r"(?<=\d) (?=\d)")
        with open(csv_file, "r") as fh:
            start = False
            for line in fh:
                if line.startswith("Lombardia"):
                    start = True
                if line.startswith("TOTALE"):
                    # The totals row closes the region table.
                    start = False
                if start:
                    line = line.replace(".", "")
                    line = line.replace("+ ", "")
                    line = cell_gap.sub(",", line)
                    line = line.replace("\n", "")
                    list_reg.append(line)
                if 'Aggiornamento casi Covid-19' in line:
                    # Date written in words before the " - " separator.
                    parts = line.split(" - ")
                    if len(parts) > 1:
                        date_text = parts[0]
                        if date_text.startswith("\""):
                            # Drop the opening quote of the csv cell.
                            date_text = date_text[1:]
                        parsed_ok, parsed = translate_to_date(date_text.split(" "))
                        if parsed_ok:
                            report_date = parsed
                        else:
                            # On failure translate_to_date returns an
                            # exception object: never store it as the date.
                            print("Report date not recognised - {ex}".format(ex=parsed))
                elif 'AGGIORNAMENTO ' in line:
                    # Numeric date right after the marker word.
                    parts = line.split(" ")
                    if len(parts) > 1:
                        report_date = datetime.datetime.strptime(parts[1], '%d/%m/%Y')
                        print("RDate: {rd}".format(rd=report_date))

        df = pd.DataFrame([line.split(",") for line in list_reg])
        rv = True

    except Exception as ex:
        print("pdf_to_dataframe failed - {ex}".format(ex=ex))
    print("pdf_to_dataframe (rv={rv} - report_date={rd}) <<".format(rv=rv, rd=report_date))
    return (rv, df, report_date)

def translate_to_date(dt):
    """
    Build a datetime from the pieces of a date written in Italian.

    :param dt: sequence whose first three items are the day, the Italian
               month name (any letter case) and the year.
    :return (rv, date): (True, datetime) on success; otherwise
               (False, exception) where the exception describes the
               failure (it is returned, not raised).
    """
    italian_months = (
        "gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno",
        "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre",
    )
    month_by_name = {name: number for number, name in enumerate(italian_months, start=1)}

    # Fewer than three pieces cannot describe a full date.
    if len(dt) < 3:
        return (False, Exception("Wrong format: {dt}".format(dt=str(dt))))

    try:
        day, year = dt[0], dt[2]
        month = month_by_name.get(dt[1].lower())
        if month is None:
            return (False, Exception("Unknown month: {m}".format(m=dt[1])))
        return (True, datetime.datetime(year=int(year), month=int(month), day=int(day)))
    except Exception as ex:
        # Non numeric day/year or an impossible calendar date.
        print("Exception - {e}".format(e=ex))
        return (False, ex)
    
def _rename_leading_columns(frame, names):
    """Name the first len(names) columns of ``frame`` by position; later columns keep their labels."""
    if len(frame.columns) < len(names):
        raise ValueError("Expected at least {n} columns, found {f}".format(n=len(names), f=len(frame.columns)))
    frame.columns = list(names) + list(frame.columns[len(names):])
    return frame

def refactor_region_df(df:pd.DataFrame, report_date:datetime.datetime, pdf_version:str="v1") -> Tuple[bool, Any]:
    """
    Give the raw region dataframe its column names and add the report date.

    The input frame is left untouched: the work is done on a copy, so the
    call can be repeated on the same input.

    :param df: raw dataframe, one row per region, columns in pdf order.
    :param report_date: value stored in the "REPORT DATE" column of every row.
    :param pdf_version: valid values are v1, v2, v3;
    :return (rv, df_region): (True, renamed dataframe) on success, otherwise
                             (False, exception describing the failure).
    """
    print("refactor_region_df ({dt}) >>".format(dt=report_date))
    rv = False
    df_res = None
    # Names of the first nine columns, identical in every pdf layout.
    shared_names = [
        "Regione"
        ,"Ricoverati con sintomi"
        ,"Terapia intensiva"
        ,"Isolamento domiciliare"
        ,"Totale attualmente positivi"
        ,"DIMESSI/GUARITI"
        ,"DECEDUTI"
        ,"CASI TOTALI - A"
        ,"INCREMENTO CASI TOTALI (rispetto al giorno precedente)"
    ]
    # NOTE(review): the v2/v3 result names its tested-cases column
    # "Totale casitestati" ("Totale casi testati" in v1) and orders the
    # columns differently from v1. The calling cells append to the csv
    # without header, so rows of different layouts may not line up -
    # confirm which layout the csv file is meant to follow.
    try:
        if pdf_version == "v1":
            v1_names = shared_names + [
                "Casi identificatidal sospettodiagnostico"
                ,"Casi identificatida attività discreening"
                ,"CASI TOTALI - B"
                ,"Totale casi testati"
                ,"Totale tamponi effettuati"
                ,"INCREMENTO TAMPONI"
            ]
            df_res = _rename_leading_columns(df.copy(), v1_names)
            df_res["REPORT DATE"] = report_date #pd.to_datetime(report_date, format="%d/%m/%Y")
            rv = True
        elif pdf_version in ["v2", "v3"]:
            df_res = df.copy()
            if pdf_version == "v3" and len(df_res.columns) == 12:
                # Twelve-column v3 frames carry one extra column, labelled 10.
                df_res = df_res.drop(columns=[10])
            old_names = shared_names + [
                "Totale tamponi effettuati"
                ,"Totale casitestati"
            ]
            df_res = _rename_leading_columns(df_res, old_names)

            # Columns these layouts do not provide are added empty.
            df_res["Casi identificatidal sospettodiagnostico"] = np.nan
            df_res["Casi identificatida attività discreening"] = np.nan
            df_res["CASI TOTALI - B"] = np.nan
            df_res["INCREMENTO TAMPONI"] = np.nan

            df_res["REPORT DATE"] = report_date #pd.to_datetime(report_date, format="%d/%m/%Y")
            rv = True
        else:
            ex = Exception("Unknown pdf version: {pv}".format(pv=pdf_version))
            print("Error - {ex}".format(ex=ex))
            rv = False
            df_res = ex
    except Exception as ex:
        print("refactor_region_df failed - {ex}".format(ex=ex))
        rv = False
        df_res = ex
    print("refactor_region_df ({rv}) <<".format(rv=rv))
    return (rv, df_res)



In [69]:
#----------------------------------------------------------------
# Fetch the regional summary pdf, turn it into a dataframe and
# append its rows to the regions csv file.
#----------------------------------------------------------------

pdf_file_name = "dpc-covid19-ita-scheda-regioni-20200710.pdf"

# Pdf format changed on 2020-06-25; file of the day before:
#pdf_file_name = "dpc-covid19-ita-scheda-regioni-20200624.pdf"

pdf_file = os.path.join(temp_content_dir, pdf_file_name)

url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

df_regions = None

# Step 1: download the pdf and store it in the temporary folder.
rv = False
result = get_web_file(url_region_pdf)
assert result[0] == True, "File download failure."
rv = save_content_to_file(pdf_file, result[1])

# Step 2: parse the stored pdf (skipped when the file could not be saved).
report_date = None
if rv == True:
    rv, df, report_date = pdf_to_dataframe(pdf_file)
assert rv == True, "Unable to transform pdf to dataframe."
df_regions = df

assert report_date is not None, "Unable to read data sample date."

# Step 3: name the columns and add the report date.
rv, df_regions = refactor_region_df(df_regions, report_date)
assert rv, "Cannot refactor the dataframe."

# Step 4: append to the csv file, writing the header only on creation.
df_regions.sort_values(by=['REPORT DATE'], inplace=True)
if os.path.isfile(csv_data_file):
    df_regions.to_csv(csv_data_file, mode='a', header = False, index=False)
else:
    df_regions.to_csv(csv_data_file, header = True, index=False)
    
    

get_web_file >>
Url: https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-20200710.pdf
get_web_file (True) <<
pdf_to_dataframe (/tmp/dpc-covid19-ita-scheda-regioni-20200710.pdf) >>
pdf_to_dataframe (rv=True - report_date=2020-07-10 00:00:00) <<
refactor_region_df (2020-07-10 00:00:00) >>
refactor_region_df (True) <<


In [70]:
# Reload the regions csv (comma is read_csv's default separator) and show
# its shape together with the first and last report date it contains.
df = pd.read_csv(csv_data_file)
df.shape, df["REPORT DATE"].min(), df["REPORT DATE"].max()

((3969, 16), '2020-04-24', '2020-10-29')

In [65]:
#----------------------------------------------------------------
# Download a dataset published with an older pdf layout (parsed
# here as pdf_version "v3") and append its rows to the csv file.
#----------------------------------------------------------------

# Pdf format change since: 2020-06-25 (v2).
# Pdf format change since: 2020-06-06 (v3).
# NOTE(review): the file name below carries a date later than both layout
# changes listed above - confirm that "v3" is the intended layout for it.
pdf_file_name = "dpc-covid19-ita-scheda-regioni-20200710.pdf"

pdf_file = os.path.join(temp_content_dir, pdf_file_name)

url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

df_regions = None

# Download the pdf and store it in the temporary folder.
rv = False
result = get_web_file(url_region_pdf)
if result[0] == True:
    rv = save_content_to_file(pdf_file, result[1])
else:
    assert False, "File download failure."

# Parse the stored pdf (skipped when the file could not be saved).
report_date = None
if rv == True:
    rv, df, report_date = pdf_to_dataframe(pdf_file)
if rv == True:
    df_regions = df
else:
    assert False, "Unable to transform pdf to dataframe."

#report_date = None
assert report_date is not None, "Unable to read data sample date."

rv, df_regions = refactor_region_df(df_regions, report_date, "v3")
# On failure refactor_region_df returns an exception object instead of a
# dataframe: stop here rather than fail later inside sort_values.
assert rv, "Cannot refactor the dataframe."

# Append to the csv file, writing the header only when the file is created.
df_regions.sort_values(by=['REPORT DATE'], inplace=True)
if os.path.isfile(csv_data_file) == False:
    df_regions.to_csv(csv_data_file, header = True, index=False)
else:
    df_regions.to_csv(csv_data_file, mode='a', header = False, index=False)
    

get_web_file >>
Url: https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-20200710.pdf
get_web_file (True) <<
pdf_to_dataframe (/tmp/dpc-covid19-ita-scheda-regioni-20200710.pdf) >>
pdf_to_dataframe (rv=True - report_date=2020-07-10 00:00:00) <<
refactor_region_df (2020-07-10 00:00:00) >>
refactor_region_df (True) <<


(21, 16)

In [74]:
#----------------------------------------------------------------
# Data quality tests.
#----------------------------------------------------------------
df_test = pd.read_csv(csv_data_file, sep=',')

# One row per distinct report date: column 'index' holds the date text and
# column 'REPORT DATE' the number of csv rows carrying that date. The two
# names are set explicitly because the labels produced by value_counts()
# followed by reset_index() differ between pandas versions.
rep_date_count = (
    df_test['REPORT DATE']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='REPORT DATE')
    .sort_values(by=['index'])
)

# Number of distinct report dates expected for every fully covered month;
# the label is used as the assertion message.
expected_days_per_month = {
    #"2020-10": (31, "Ottobre"),   # disabled in the original checks
    "2020-09": (30, "Settembre"),
    "2020-08": (31, "Agosto"),
    "2020-07": (31, "Luglio"),
    "2020-06": (30, "Giugno"),
    "2020-05": (31, "Maggio"),
    #"2020-04": (30, "Aprile"),    # disabled in the original checks
}
for month_prefix, (expected_days, month_label) in expected_days_per_month.items():
    days_found = rep_date_count.loc[rep_date_count['index'].str.contains(month_prefix)].shape[0]
    assert days_found == expected_days, month_label

In [76]:
# Show the report dates of April 2020 together with their row counts.
rep_date_count[rep_date_count['index'].str.contains("2020-04")]

Unnamed: 0,index,REPORT DATE
122,2020-04-24,21
146,2020-04-25,21
157,2020-04-26,21
43,2020-04-27,21
92,2020-04-28,21
71,2020-04-29,21
118,2020-04-30,21
