In [34]:
import os
import re
import sys
import json
import codecs
import locale
import logging
import requests
import datetime as dt
from typing import Union, Optional, Tuple, List, cast

import tabula
from tabula import read_pdf

import pandas as pd
import numpy as np


In [35]:
#----------------------------------------------------------------
# Configurations section
#----------------------------------------------------------------

# Url of the pdf file to download:
#url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-latest.pdf"

#----------------------------------------------------------------
# Paths, timestamps and locale shared by the cells below.
#----------------------------------------------------------------

# Scratch directory; both names are kept because later cells reference
# either one, but they always point at the same location.
temp_content_dir = os.path.join(os.sep, 'tmp')
tmp_data_file_path = temp_content_dir

# Timestamp taken once so every derived string refers to the same instant.
now = dt.datetime.now()
sample_date = now.strftime("%d/%m/%Y")
file_date_tag = now.strftime("%Y%m%d")

# The Italian locale is not installed on every machine; report the problem
# instead of aborting the whole configuration cell.
# NOTE(review): presumably the DataDownloader helpers rely on this locale to
# parse Italian dates -- confirm before trusting results when it is missing.
try:
    locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')
except locale.Error:
    logging.getLogger(__name__).warning(
        "Locale 'it_IT.UTF-8' is not available; keeping the current locale."
    )

# HTTP status codes treated as a successful response.
ok_statuses = [200, 201, 202]

# Directory holding the persistent CSV data files.
data_file_path = os.path.join("..", "data")

csv_data_file = os.path.join(data_file_path, "reduced_repord_data.csv")

pdf_file_name = os.path.join(temp_content_dir, "temp_data_file.pdf")

# National data: stable file plus a per-day dated copy.
it_data_file = os.path.join(data_file_path, "virus-it.csv")
it_tmp_data_file = os.path.join(data_file_path, "virus-it-{dt}.csv".format(dt=file_date_tag))

# Lombardia data: stable file plus a per-day dated copy.
lomb_data_file = os.path.join(data_file_path, "virus-lombardia.csv")
lomb_tmp_data_file = os.path.join(data_file_path, "virus-lombardia-{dt}.csv".format(dt=file_date_tag))



In [36]:
#----------------------------------------------------------------
# Make the sibling ../src directory importable, then pull in the
# DataDownloader procedures that download the pdf file and
# transform it into a dataframe.
#----------------------------------------------------------------

module_path = os.path.abspath(os.path.join('..', 'src'))
print(module_path)
# Register the directory only once so re-running the cell does not
# keep growing sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

from DataDownloader import (
    create_dataframe,
    get_web_file,
    save_content_to_file,
    pdf_to_dataframe,
    translate_to_date,
    refactor_region_df,
    save_df_to_csv,
    init_logger,
    load_date_range_reports,
)



/home/giovanni/code-personal/python/CoronaVirus/src


In [8]:
# Set up logging through the project helper (presumably a log file named
# virus-notebook.log under /tmp, with separate thresholds for the file and
# for standard output -- confirm against DataDownloader.init_logger).
init_logger(
    '/tmp',
    "virus-notebook.log",
    log_level=logging.DEBUG,
    std_out_log_level=logging.ERROR,
)

# Columns requested from the report loader.
columns_report_charts = [
    "REPORT DATE",
    "Regione",
    "Ricoverati con sintomi",
    "Terapia intensiva",
    "Totale attualmente positivi",
    "Isolamento domiciliare",
    "CASI TOTALI - A",
    "Totale tamponi effettuati",
]
temp_content_dir = os.path.join(os.sep, 'tmp')

# NOTE(review): "xx" is a placeholder for the day of the month. strptime
# raises ValueError on it, so both dates must be filled in before this cell
# can run.
report_date_format = '%d/%m/%Y'
range_start = dt.datetime.strptime("xx/12/2020", report_date_format)
range_stop = dt.datetime.strptime("xx/12/2020", report_date_format)

# Options dictionary handed to the loader as its third argument.
load_options = {
    "temp_dir": tmp_data_file_path,
    "data file": csv_data_file,
    "columns": columns_report_charts,
    "save": True,
    "sort column": "REPORT DATE",
}
rv, df = load_date_range_reports(range_start, range_stop, load_options)


In [37]:
# Load the reduced report CSV and show its size and the covered date span.
print(csv_data_file)
df = pd.read_csv(csv_data_file, sep=',')
report_dates = df["REPORT DATE"]
(df.shape, report_dates.min(), report_dates.max())

../data/reduced_repord_data.csv


((3885, 8), '2020-06-25', '2020-12-26')

In [42]:
# Rows for Lombardia on 2020-12-06 (at most 21 are displayed).
on_report_day = df["REPORT DATE"] == "2020-12-06"
in_lombardia = df["Regione"] == "Lombardia"
df[on_report_day & in_lombardia].head(21)

Unnamed: 0,REPORT DATE,Regione,Ricoverati con sintomi,Terapia intensiva,Totale attualmente positivi,Isolamento domiciliare,CASI TOTALI - A,Totale tamponi effettuati
3452,2020-12-06,Lombardia,6372,807,116379,109200,429109,4279332


In [23]:
#----------------------------------------------------------------
# Data quality tests.
#----------------------------------------------------------------
print(csv_data_file)
df_test = pd.read_csv(csv_data_file, sep=',')

# One row per distinct report date together with the number of CSV rows
# carrying that date. The column names are set explicitly ('index' = the
# date, 'REPORT DATE' = the row count) because the names produced by
# value_counts().to_frame().reset_index() differ between pandas 1.x and 2.x.
rep_date_count = (
    df_test['REPORT DATE']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='REPORT DATE')
    .sort_values(by=['index'])
)

# Expected number of distinct report days for each fully covered month,
# keyed by the "YYYY-MM" prefix of the report date. The value also carries
# the label used as the assertion message.
expected_report_days = {
    # "2020-12": (31, "Dicembre"),   # check disabled in the original cell
    "2020-11": (30, "Novembre"),
    "2020-10": (31, "Ottobre"),
    "2020-09": (30, "Settembre"),
    "2020-08": (31, "Agosto"),
    "2020-07": (31, "Luglio"),
    # "2020-06": (30, "Giugno"),     # check disabled in the original cell
    # "2020-05": (31, "Maggio"),     # check disabled in the original cell
    # "2020-04": (30, "Aprile"),     # check disabled in the original cell
}

for month_prefix, (expected_days, month_label) in expected_report_days.items():
    # Prefix match: only dates that belong to the month are counted.
    found_days = int(rep_date_count['index'].str.startswith(month_prefix).sum())
    assert found_days == expected_days, month_label

../data/reduced_repord_data.csv


In [33]:
# Re-write the data file with its rows ordered by report date.
print(csv_data_file)
df_test = pd.read_csv(csv_data_file, sep=',')
# A stable sort keeps the existing order of rows that share a report date,
# so running this cell again leaves the file unchanged.
df_test = df_test.sort_values(by=["REPORT DATE"], kind="mergesort")
# The source file is overwritten in place; the index is not written out.
df_test.to_csv(csv_data_file, mode='w', header=True, index=False)

../data/reduced_repord_data.csv


In [25]:
# Show the (rows, columns) shape of the frame currently bound to df_test.
# NOTE(review): the recorded output below has fewer rows than the 3885
# reported by the cell that loads the same file, and the execution counts are
# out of order -- presumably this output is stale; re-run after a kernel restart.
df_test.shape

(3864, 8)