In [1]:
import os
import re
import sys
import json
import codecs
import locale
import logging
import requests
import datetime as dt
from typing import Union, Optional, Tuple, List, cast

import tabula
from tabula import read_pdf

import pandas as pd
import numpy as np


In [2]:
#----------------------------------------------------------------
# Configurations section
#----------------------------------------------------------------

# Url of the pdf file to download:
#url_region_pdf = "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/schede-riepilogative/regioni/dpc-covid19-ita-scheda-regioni-latest.pdf"

#----------------------------------------------------------------
# Paths, timestamps and locale used by the rest of the notebook
#----------------------------------------------------------------

# Scratch directory for temporary files.
temp_content_dir = os.path.join(os.sep, 'tmp')

# Timestamp taken once so that every derived string agrees.
now = dt.datetime.now()
sample_date = now.strftime("%d/%m/%Y")

# Switch to the Italian locale. A machine without it_IT.UTF-8 installed
# raises locale.Error here; report it loudly instead of aborting, so the
# cells that do not need the locale can still run.
# NOTE(review): helpers that parse Italian dates may still need this
# locale -- confirm before relying on their results after the warning.
try:
    locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')
except locale.Error:
    logging.warning(
        "Locale %s is not available; keeping the current locale.",
        'it_IT.UTF-8',
    )

# HTTP status codes treated as a successful response.
ok_statuses = [200, 201, 202]

# Directory holding the csv data files, relative to the notebook.
data_file_path = os.path.join("..", "data")
# Same location as temp_content_dir; kept as a separate name for
# cells that refer to it.
tmp_data_file_path = temp_content_dir

csv_data_file = os.path.join(data_file_path, "reduced_report_data.csv")

pdf_file_name = os.path.join(temp_content_dir, "temp_data_file.pdf")

# Date stamp (YYYYMMDD) shared by the dated copies of the data files.
file_date_stamp = now.strftime("%Y%m%d")

it_data_file = os.path.join(data_file_path, "virus-it.csv")
it_tmp_data_file = os.path.join(data_file_path, "virus-it-{dt}.csv".format(dt=file_date_stamp))

lomb_data_file = os.path.join(data_file_path, "virus-lombardia.csv")
lomb_tmp_data_file = os.path.join(data_file_path, "virus-lombardia-{dt}.csv".format(dt=file_date_stamp))



In [3]:
#----------------------------------------------------------------
# Put the sibling ../src directory on the import path (only once),
# then import the DataDownloader procedures that download and
# transform the pdf file into a dataframe.
#----------------------------------------------------------------

module_path = os.path.abspath(os.path.join('..', 'src'))
print(module_path)
if module_path not in sys.path:
    # Appended, not inserted: already-importable modules keep priority.
    sys.path.append(module_path)

from DataDownloader import (
    create_dataframe,
    get_web_file,
    save_content_to_file,
    pdf_to_dataframe,
    translate_to_date,
    refactor_region_df,
    save_df_to_csv,
    init_logger,
    load_date_range_reports,
)



/home/giovanni/code-personal/python/CoronaVirus/src


In [5]:
# Load the reduced report data and display its shape together with the
# earliest and latest value of the "REPORT DATE" column.
print(csv_data_file)
df = pd.read_csv(csv_data_file, sep=',')
report_date_col = df["REPORT DATE"]
(df.shape, report_date_col.min(), report_date_col.max())

../data/reduced_report_data.csv


((6783, 10), '2020-05-01', '2021-03-20')

In [6]:
#----------------------------------------------------------------
# Data quality tests: every fully covered month must contain one
# distinct report date per calendar day.
#----------------------------------------------------------------
print(csv_data_file)
df_test = pd.read_csv(csv_data_file, sep=',')

# One row per distinct report date: column 'index' holds the date string,
# column 'REPORT DATE' holds the number of rows carrying that date.
# The column names are set explicitly because the names produced by
# value_counts().to_frame().reset_index() changed in pandas 2.0
# (no 'index' column any more), which made the sort below raise KeyError.
rep_date_count = (
    df_test['REPORT DATE']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='REPORT DATE')
    .sort_values(by=['index'])
)

# Month prefix -> (expected number of distinct report dates, label used
# in the assertion message). The months at either end of the data set
# (2021-03, 2020-04) are left out on purpose: they are not fully covered.
expected_days_by_month = {
    "2021-02": (28, "Febbraio"),
    "2021-01": (31, "Gennaio"),
    "2020-12": (31, "Dicembre"),
    "2020-11": (30, "Novembre"),
    "2020-10": (31, "Ottobre"),
    "2020-09": (30, "Settembre"),
    "2020-08": (31, "Agosto"),
    "2020-07": (31, "Luglio"),
    "2020-06": (30, "Giugno"),
    "2020-05": (31, "Maggio"),
}

for month_prefix, (expected_days, month_label) in expected_days_by_month.items():
    # Dates are 'YYYY-MM-DD' strings, so a prefix match selects the month.
    found_days = int(rep_date_count['index'].str.startswith(month_prefix).sum())
    assert found_days == expected_days, (
        "{label}: expected {exp} report dates, found {got}".format(
            label=month_label, exp=expected_days, got=found_days
        )
    )



../data/reduced_report_data.csv


In [8]:
# Check that every report date has exactly 21 rows, one for each region.
expected_rows_per_date = 21

df_test = pd.read_csv(csv_data_file, sep=',')
df_test.sort_values(by=["REPORT DATE"], inplace=True)
date_list = df_test["REPORT DATE"].unique()

# A missing date can never satisfy the per-date check, so fail early
# with an explicit message.
assert df_test["REPORT DATE"].notna().all(), "Missing REPORT DATE values"

# Count the rows of every date in a single pass instead of filtering the
# whole frame once per date.
rows_per_date = df_test["REPORT DATE"].value_counts().sort_index()

# Trace of the dates that were checked, in chronological order.
for current_dt in date_list:
    print(current_dt)

# Report every offending date together with its row count, not only the
# first one encountered.
wrong_dates = rows_per_date[rows_per_date != expected_rows_per_date]
assert wrong_dates.empty, "Wrong date {d}".format(d=wrong_dates.to_dict())

2020-05-01
2020-05-02
2020-05-03
2020-05-04
2020-05-05
2020-05-06
2020-05-07
2020-05-08
2020-05-09
2020-05-10
2020-05-11
2020-05-12
2020-05-13
2020-05-14
2020-05-15
2020-05-16
2020-05-17
2020-05-18
2020-05-19
2020-05-20
2020-05-21
2020-05-22
2020-05-23
2020-05-24
2020-05-25
2020-05-26
2020-05-27
2020-05-28
2020-05-29
2020-05-30
2020-05-31
2020-06-01
2020-06-02
2020-06-03
2020-06-04
2020-06-05
2020-06-06
2020-06-07
2020-06-08
2020-06-09
2020-06-10
2020-06-11
2020-06-12
2020-06-13
2020-06-14
2020-06-15
2020-06-16
2020-06-17
2020-06-18
2020-06-19
2020-06-20
2020-06-21
2020-06-22
2020-06-23
2020-06-24
2020-06-25
2020-06-26
2020-06-27
2020-06-28
2020-06-29
2020-06-30
2020-07-01
2020-07-02
2020-07-03
2020-07-04
2020-07-05
2020-07-06
2020-07-07
2020-07-08
2020-07-09
2020-07-10
2020-07-11
2020-07-12
2020-07-13
2020-07-14
2020-07-15
2020-07-16
2020-07-17
2020-07-18
2020-07-19
2020-07-20
2020-07-21
2020-07-22
2020-07-23
2020-07-24
2020-07-25
2020-07-26
2020-07-27
2020-07-28
2020-07-29
2020-07-30