In [1]:
import os
import re
import json
import codecs
import locale
import requests
import datetime

import tabula
from tabula import read_pdf

import pandas as pd


In [2]:
#----------------------------------------------------------------
# Configurations section
#----------------------------------------------------------------

# Url of the pdf file to download:
url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-latest.pdf"

#----------------------------------------------------------------
# Derived settings: run timestamp, locale, accepted HTTP statuses
# and the paths of the data files.
#----------------------------------------------------------------

# Timestamp of this run; sample_date is the same instant rendered as dd/mm/yyyy.
now = datetime.datetime.now()
sample_date = now.strftime("%d/%m/%Y")

# Switch every locale category to Italian (UTF-8).
# NOTE(review): setlocale raises locale.Error when this locale is not
# installed on the host - confirm it is available where the notebook runs.
locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')
# HTTP status codes that get_web_file accepts as a successful download.
ok_statuses = [200, 201, 202]
# Directory of the csv data files, relative to the current working directory.
data_file_path = os.path.join("..","data")
#data_file_path = os.path.join(os.sep,"tmp")

# Csv file the per-region rows are written to.
csv_data_file = os.path.join(data_file_path, "italy-regions.csv")

# Default location of the downloaded pdf: <root>/tmp/temp_data_file.pdf.
pdf_file_name = os.path.join(os.sep, "tmp", "temp_data_file.pdf")
# Csv file names for the Italy-wide data: the main file and a variant
# stamped with the run date (YYYYMMDD).
it_data_file = os.path.join(data_file_path, "virus-it.csv")
it_tmp_data_file = os.path.join(data_file_path, "virus-it-{dt}.csv".format(dt=now.strftime("%Y%m%d")))

# Same pair of file names for Lombardia.
lomb_data_file = os.path.join(data_file_path, "virus-lombardia.csv")
lomb_tmp_data_file = os.path.join(data_file_path, "virus-lombardia-{dt}.csv".format(dt=now.strftime("%Y%m%d")))



In [3]:
#----------------------------------------------------------------
#
#----------------------------------------------------------------

def get_web_file(url, timeout=60):
    """
    Download the resource found at ``url`` and return its raw bytes.

    :param url: address of the file to download.
    :param timeout: seconds to wait for the server before giving up
        (forwarded to requests.get).
    :return (rv, content): rv is True only when the request completed with
        a status code listed in ok_statuses; content then holds the body of
        the response as bytes. On any failure (network error, unexpected
        status code) the pair is (False, None).
    """
    rv = False
    content = None
    try:
        # Fetch the url received as argument, not the module level default.
        response = requests.get(url, timeout=timeout)
        if response.status_code in ok_statuses:
            content = response.content
            rv = True
        else:
            print("Get data failed. Received error code: {er}".format(er=str(response.status_code)))
    except Exception as ex:
        print("get_web_file failed - {ex}".format(ex=ex))
    return (rv, content)
        
def save_content_to_file(file_name, content):
    """
    Write ``content`` to ``file_name`` in binary mode, replacing any
    existing file.

    :param file_name: path of the file to write.
    :param content: bytes to store in the file.
    :return rv: True when the write succeeded, False when it raised (the
        error is printed, not propagated).
    """
    try:
        with open(file_name, "wb") as out_file:
            out_file.write(content)
    except Exception as ex:
        print("save_content_to_file failed - {ex}".format(ex=ex))
        return False
    return True
   
def pdf_to_dataframe(pdf_file_name):
    """
    Extract the regions table and the report date from the pdf report.

    The pdf is converted by tabula into a csv file saved next to it (same
    name, ".csv" extension). The csv is then scanned line by line: the rows
    from the one starting with "Lombardia" up to (excluding) the one
    starting with "TOTALE" are cleaned and collected, and the line holding
    'Aggiornamento casi Covid-19' provides the report date.

    :param pdf_file_name: path of the pdf file to parse.
    :return (rv, df, report_date): rv is True when the parsing completed;
        df is a dataframe of strings with one row per collected line (None
        on failure); report_date is the (rv, date) pair returned by
        translate_to_date, or None when no date line was found.
    """
    rv = False
    df = None
    report_date = None
    try:
        csv_file = os.path.splitext(pdf_file_name)[0] + ".csv"
        tabula.convert_into(pdf_file_name, csv_file, output_format="csv", pages='all')

        # A blank between two digits marks two values that ended up in the
        # same csv cell. Lookarounds are used so that no digit is consumed:
        # runs of single digit values ("5 3 28") are split at every blank.
        merged_values = re.compile(r"(?<=\d) (?=\d)")
        list_reg = []
        start = False
        with open(csv_file, "r") as fh:
            for line in fh:
                if line.startswith("Lombardia"):
                    start = True
                if line.startswith("TOTALE"):
                    start = False
                if start:
                    # Drop the thousands separators and the "+ " prefix of
                    # the increments, then split the merged values.
                    line = line.replace(".", "")
                    line = line.replace("+ ", "")
                    line = merged_values.sub(",", line)
                    line = line.replace("\n", "")
                    list_reg.append(line)
                if 'Aggiornamento casi Covid-19' in line:
                    parts = line.split(" - ")
                    if len(parts) > 1:
                        report_date = parts[0]
                        # The csv writer may have quoted the whole line.
                        if report_date.startswith("\""):
                            report_date = report_date[1:]
                        report_date = translate_to_date(report_date.split(" "))

        df = pd.DataFrame([line.split(",") for line in list_reg])
        rv = True

    except Exception as ex:
        print("pdf_to_dataframe failed - {ex}".format(ex=ex))
    return (rv, df, report_date)

def translate_to_date(dt):
    """
    Turn an Italian textual date, already split in tokens, into a datetime.

    :param dt: sequence of tokens [day, month name, year, ...]; the month
        name is matched case-insensitively, tokens after the third one are
        ignored.
    :return (rv, date): (True, datetime) on success; otherwise
        (False, exception) where the exception describes the failure
        (too few tokens, unknown month, or the error raised while building
        the datetime).
    """
    italian_months = (
        "gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno",
        "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre",
    )
    month_numbers = {name: number for number, name in enumerate(italian_months, start=1)}

    if len(dt) < 3:
        return (False, Exception("Wrong format: {dt}".format(dt=str(dt))))

    try:
        day_txt, month_txt, year_txt = dt[0], dt[1], dt[2]
        month_no = month_numbers.get(month_txt.lower())
        if month_no is None:
            return (False, Exception("Unknown month: {m}".format(m=month_txt)))
        parsed = datetime.datetime(year=int(year_txt), month=int(month_no), day=int(day_txt))
        return (True, parsed)
    except Exception as ex:
        print("Exception - {e}".format(e=ex))
        return (False, ex)
    
def refactor_region_df(df, report_date):
    """
    Give the regions dataframe its column names and add the report date.

    The first 15 columns of ``df`` are renamed by position and a
    "REPORT DATE" column holding ``report_date`` is added. The dataframe
    is modified in place and is also the one returned.

    :param df: dataframe with at least 15 columns, in the order listed below.
    :param report_date: value stored in the "REPORT DATE" column of every row.
    :return (rv, df_region): rv is True only when both the renaming and
        the new column succeeded; on failure the error is printed and rv
        is False.
    """
    column_names = [
        "Regione"
        ,"Ricoverati con sintomi"
        ,"Terapia intensiva"
        ,"Isolamento domiciliare"
        ,"Totale attualmente positivi"
        ,"DIMESSI/GUARITI"
        ,"DECEDUTI"
        ,"CASI TOTALI - A"
        ,"INCREMENTO CASI TOTALI (rispetto al giorno precedente)"
        ,"Casi identificatidal sospettodiagnostico"
        ,"Casi identificatida attività discreening"
        ,"CASI TOTALI - B"
        ,"Totale casi testati"
        ,"Totale tamponi effettuati"
        ,"INCREMENTOTAMPONI"
    ]
    rv = False
    df_res = None
    try:
        df_res = df
        # Indexing the columns by position raises IndexError when the
        # dataframe has fewer columns than expected.
        new_names = {df_res.columns[pos]: name for pos, name in enumerate(column_names)}
        df_res.rename(columns=new_names, inplace=True)
        df_res["REPORT DATE"] = report_date
        # Success is reported only once every step has been completed.
        rv = True

    except Exception as ex:
        print("refactor_region_df failed - {ex}".format(ex=ex))
    print("rv -> {rv}".format(rv=rv))
    return (rv, df_res)



In [8]:
#----------------------------------------------------------------
# Download the new dataset and append the new data in the csv
# file.
#
# Pay attention to the file name: "202010xx" looks like a
# placeholder, replace it by hand with the date of the report to
# download. The sample date is read from the pdf itself.
#
# Every step is checked before moving to the next one, so a failure
# stops the cell with a message naming the step that failed.
#----------------------------------------------------------------

# The pdf is saved under this name in the current working directory.
pdf_file_name = "dpc-covid19-ita-scheda-regioni-202010xx.pdf"
url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/{fn}".format(fn=pdf_file_name)

df_regions = None

rv = False
result = get_web_file(url_region_pdf)
assert result[0] == True, "File download failure."

rv = save_content_to_file(pdf_file_name, result[1])
assert rv == True, "Unable to save the pdf file."

df_list = pdf_to_dataframe(pdf_file_name)
assert df_list[0] == True, "Unable to transform pdf to dataframe."
df_regions = df_list[1]

# df_list[2] is the (rv, date) pair built by translate_to_date: when rv is
# False the second item is an exception, not a date.
date = None
assert df_list[2] is not None and df_list[2][0] == True, "Unable to read data sample date."
date = df_list[2][1]
print("Sample date: {dt}".format(dt=str(date)))

refactor_result = refactor_region_df(df_regions, date)
assert refactor_result[0] == True, "Unable to set the columns of the regions dataframe."
df_regions = refactor_result[1]

df_regions.sort_values(by=['REPORT DATE'], inplace=True)
# The header row is written only when the csv file is created.
if os.path.isfile(csv_data_file) == False:
    df_regions.to_csv(csv_data_file, header = True, index=False)
else:
    df_regions.to_csv(csv_data_file, mode='a', header = False, index=False)
    
    

Sample date: 2020-10-26 00:00:00
rv -> True


In [11]:
# Reload the accumulated csv file and show its shape together with the
# smallest and the largest value of the "REPORT DATE" column.
df = pd.read_csv(csv_data_file)
df.shape, df["REPORT DATE"].min(), df["REPORT DATE"].max()

((2058, 16), '2020-07-21', '2020-10-26')

In [6]:
# Show the current content of df.
df

Unnamed: 0,Regione,Ricoverati con sintomi,Terapia intensiva,Isolamento domiciliare,Totale attualmente positivi,DIMESSI/GUARITI,DECEDUTI,CASI TOTALI - A,INCREMENTO CASI TOTALI (rispetto al giorno precedente),Casi identificatidal sospettodiagnostico,Casi identificatida attività discreening,CASI TOTALI - B,Totale casi testati,Totale tamponi effettuati,INCREMENTOTAMPONI,REPORT DATE
0,Lombardia,1521,134,28235,29890,87591,17123,134604,4125,113622,20982,134604,1611623,2575003,36416,2020-10-21
1,Piemonte,1111,74,12250,13435,30467,4216,48118,1799,26093,22025,48118,529281,895399,13611,2020-10-21
2,Emilia-Romagna,596,86,10118,10800,27257,4531,42588,671,34999,7589,42588,777142,1417038,17165,2020-10-21
3,Veneto,439,56,10938,11433,24550,2282,38265,1422,23338,14927,38265,849385,2178114,19627,2020-10-21
4,Campania,996,85,20020,21101,8838,545,30484,1760,28264,2220,30484,548083,802965,13878,2020-10-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,Veneto,585,71,16316,16972,25036,2329,44337,1468,24005,20332,44337,871568,2238353,12061,2020-10-25
2054,Emilia-Romagna,757,88,13983,14828,27345,4561,46734,1192,36934,9800,46734,804679,1476451,9644,2020-10-25
2055,Piemonte,1601,94,18182,19877,31399,4259,55535,2287,28556,26979,55535,566796,944133,12657,2020-10-25
2056,Puglia,548,57,7267,7872,6011,663,14546,515,4146,10400,14546,370561,522951,4377,2020-10-25
