# Deaths Italian Dataset Ingestion

In [1]:
import numpy as np
import pandas as pd
import requests
import zipfile
import os
import os.path

import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
import locale

# Locale applied process-wide (LC_ALL) before anything else runs.
# NOTE(review): 'es_ES.UTF-8' is a Spanish locale while the dataset is Italian, and
# the date patterns below are purely numeric - confirm this locale is really needed.
locale_ingest_str = 'es_ES.UTF-8'
locale.setlocale(locale.LC_ALL, locale_ingest_str)

# Date patterns: the raw 'GE' column is parsed as month+day ('%m%d'); the
# standard representation is 'YYYY-MM-DD'.
date_format_ingest_raw = '%m%d'
date_format_ingest_std = '%Y-%m-%d'

## Dataset Description

Número de muertes en Italia, desagregadas por sexo, comune y edad.

## Aggregated Raw Dataset Download

The dataset can be found ["here"](https://www.istat.it/it/files//2020/03/dati-giornalieri-comune-16aprile.zip).

In [2]:
# Download ZIP Raw Dataset file
url = 'https://www.istat.it/it/files//2020/03/dati-giornalieri-comune-16aprile.zip'
ingest_raw_path = '../../data/raw'
zip_file_name = 'DeathsIT.zip'

# Fail fast instead of saving an error page or an empty body as the ZIP:
# the timeout avoids hanging forever and raise_for_status() rejects HTTP 4xx/5xx.
req = requests.get(url, allow_redirects = True, timeout = 60)
req.raise_for_status()
if not req.content:
    raise ValueError('Downloaded file {0} is empty'.format(url))

# Make sure the target folder exists, then write through a context manager so
# the file handle is closed even if the write fails.
os.makedirs(ingest_raw_path, exist_ok = True)
with open(os.path.join(ingest_raw_path, zip_file_name), 'wb') as f:
    f.write(req.content)

In [3]:
# Unzip file
zip_file_path = os.path.join(ingest_raw_path, zip_file_name)
try:
    # The context manager closes the archive even when extraction fails.
    with zipfile.ZipFile(zip_file_path, "r") as zip_file:
        zip_file.extractall(path = ingest_raw_path)
except (zipfile.BadZipFile, OSError):
    # Report which archive failed and re-raise: the following cells cannot run
    # without the extracted CSV, so swallowing the error would only hide it.
    print('Error unzipping file {0}'.format(zip_file_name))
    raise
else:
    # Remove Zip file only after a successful extraction, so a failed run
    # leaves the download on disk for inspection.
    os.remove(zip_file_path)

In [4]:
# Raw dataset load: every column is read as text and NA detection is disabled
df_deaths = pd.read_csv(
    '../../data/raw/dati-giornalieri-comune/comune_giorno.csv',
    sep = ',',
    quotechar = "\"",
    encoding = 'ISO-8859-1',
    dtype = 'str',
    na_filter = False,
    low_memory = False,
)

In [5]:
# Inspect the distinct raw values of the 'GE' and 'DATA_INIZIO_DIFF' columns.
# Only the last expression of a cell is echoed, so the 'GE' set is computed
# but not displayed.
set(df_deaths['GE'].unique())
set(df_deaths['DATA_INIZIO_DIFF'].unique())

{'1 aprile', '16 aprile', '8 aprile', 'Dati 2020 n.d.'}

In [6]:
# Row counts: the whole dataset first, then one line per 'DATA_INIZIO_DIFF' value.
print(len(df_deaths))
for inizio_label in ('1 aprile', '8 aprile', '16 aprile', 'Dati 2020 n.d.'):
    rows_with_label = df_deaths[df_deaths['DATA_INIZIO_DIFF'] == inizio_label]
    print(len(rows_with_label))

849120
195361
61733
34413
557613


In [7]:
df_deaths

Unnamed: 0,REG,PROV,NOME_REGIONE,NOME_PROVINCIA,NOME_COMUNE,COD_PROVCOM,DATA_INIZIO_DIFF,CL_ETA,GE,MASCHI_15,...,FEMMINE_17,FEMMINE_18,FEMMINE_19,FEMMINE_20,TOTALE_15,TOTALE_16,TOTALE_17,TOTALE_18,TOTALE_19,TOTALE_20
0,01,001,Piemonte,Torino,Agliè,001001,1 aprile,17,0102,0,...,0,1,0,0,0,0,0,1,0,0
1,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0104,0,...,0,0,0,0,0,1,0,0,0,0
2,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0105,0,...,0,0,0,0,0,0,0,0,0,1
3,01,001,Piemonte,Torino,Agliè,001001,1 aprile,17,0106,1,...,0,0,0,0,1,0,0,0,0,0
4,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0106,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,16,0411,0,...,0,0,0,9999,0,1,0,0,0,9999
849116,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,17,0411,0,...,0,0,0,9999,1,0,0,0,0,9999
849117,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,18,0411,1,...,0,0,0,9999,1,0,0,0,0,9999
849118,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,13,0413,0,...,0,0,0,9999,1,0,0,0,0,9999


## Standard Datasets Generation

### Raw Dataset Validation

Remove invalid bottom rows due to dataset comments. 

In [8]:
# Delete from the dataset the rows with empty values in key columns (1st or 2nd columns).
## df_deaths = df_deaths.mask(df_deaths.eq('')).dropna(axis = 0, how = 'any', subset = ['region_str', 'date_str'], inplace = False)

In [9]:
# Replace NaN values with 0
## df_covid19_agg = df_covid19_agg.fillna(0, axis = 0, inplace = False)

In [10]:
df_deaths

Unnamed: 0,REG,PROV,NOME_REGIONE,NOME_PROVINCIA,NOME_COMUNE,COD_PROVCOM,DATA_INIZIO_DIFF,CL_ETA,GE,MASCHI_15,...,FEMMINE_17,FEMMINE_18,FEMMINE_19,FEMMINE_20,TOTALE_15,TOTALE_16,TOTALE_17,TOTALE_18,TOTALE_19,TOTALE_20
0,01,001,Piemonte,Torino,Agliè,001001,1 aprile,17,0102,0,...,0,1,0,0,0,0,0,1,0,0
1,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0104,0,...,0,0,0,0,0,1,0,0,0,0
2,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0105,0,...,0,0,0,0,0,0,0,0,0,1
3,01,001,Piemonte,Torino,Agliè,001001,1 aprile,17,0106,1,...,0,0,0,0,1,0,0,0,0,0
4,01,001,Piemonte,Torino,Agliè,001001,1 aprile,18,0106,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,16,0411,0,...,0,0,0,9999,0,1,0,0,0,9999
849116,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,17,0411,0,...,0,0,0,9999,1,0,0,0,0,9999
849117,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,18,0411,1,...,0,0,0,9999,1,0,0,0,0,9999
849118,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,13,0413,0,...,0,0,0,9999,1,0,0,0,0,9999


In [11]:
# Defining data conversion and extraction functions

# Data conversion function
def data_convert_date(value_str, date_format_origin, date_format_target, year = 2020):
    """Re-format a raw date string that carries no year.

    `year` is prepended to the raw text before parsing (default 2020, which
    keeps the previous behaviour).

    Parameters
    ----------
    value_str : str
        Raw date text laid out as `date_format_origin` (e.g. '0102').
    date_format_origin : str
        strptime pattern of `value_str`, without the year (e.g. '%m%d').
    date_format_target : str
        strftime pattern of the returned text (e.g. '%Y-%m-%d').
    year : int or str, optional
        Four-digit year prepended to `value_str` before parsing.

    Returns
    -------
    str or None
        The re-formatted date, or None when the value cannot be parsed.
    """
    try:
        parsed = pd.to_datetime(str(year) + value_str, format = "%Y" + date_format_origin)
        return parsed.strftime(date_format_target)
    except (ValueError, TypeError):
        # ValueError: text does not match the pattern; TypeError: value is not
        # a string. Other exceptions (e.g. KeyboardInterrupt) are not swallowed.
        return None
    
# Total conversion function
def data_convert_total(value_str):
    """Normalise a raw count cell to the canonical text of an integer.

    Returns "0" for an empty string, the canonical decimal text otherwise
    (e.g. ' 07' -> '7'), and None when the value is not an integer.
    Every integer is passed through as-is, including 9999.
    NOTE(review): 9999 looks like a "no data" marker in the 2020 columns -
    confirm whether it should be mapped to a missing value here.
    """
    if value_str == '':
        return "0"
    try:
        return str(int(str(value_str)))
    except ValueError:
        # int() of a string can only fail with ValueError; nothing else is hidden.
        return None
    
# Age conversion function
def data_convert_age(value_str):
    """Convert a raw age-class code to a numeric age, returned as text.

    Codes up to 1 are returned unchanged; any code n >= 2 becomes (n - 1) * 5
    (e.g. '2' -> '5', '17' -> '80').
    NOTE(review): presumably the lower bound of a five-year age class - confirm.

    Returns None for an empty string or a value that is not an integer.
    """
    if value_str == '':
        return None
    try:
        age_class = int(value_str)
    except (ValueError, TypeError):
        # Non-numeric text or a non-convertible object such as None.
        return None
    if age_class <= 1:
        return str(age_class)
    return str((age_class - 1) * 5)

In [12]:
# Applying conversion and extraction functions to dataset.

# Day and age-class columns each have their own converter.
df_deaths['GE'] = df_deaths['GE'].apply(data_convert_date, args = (date_format_ingest_raw, date_format_ingest_std))
df_deaths['CL_ETA'] = df_deaths['CL_ETA'].apply(data_convert_age)

# Yearly count columns, suffixes 15 to 20: all male columns first, then the female ones.
for sex_prefix in ('MASCHI', 'FEMMINE'):
    for year_suffix in range(15, 21):
        count_column = '{0}_{1}'.format(sex_prefix, year_suffix)
        df_deaths[count_column] = df_deaths[count_column].apply(data_convert_total)

In [13]:
df_deaths

Unnamed: 0,REG,PROV,NOME_REGIONE,NOME_PROVINCIA,NOME_COMUNE,COD_PROVCOM,DATA_INIZIO_DIFF,CL_ETA,GE,MASCHI_15,...,FEMMINE_17,FEMMINE_18,FEMMINE_19,FEMMINE_20,TOTALE_15,TOTALE_16,TOTALE_17,TOTALE_18,TOTALE_19,TOTALE_20
0,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-02,0,...,0,1,0,0,0,0,0,1,0,0
1,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-04,0,...,0,0,0,0,0,1,0,0,0,0
2,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-05,0,...,0,0,0,0,0,0,0,0,0,1
3,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-06,1,...,0,0,0,0,1,0,0,0,0,0
4,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-06,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,75,2020-04-11,0,...,0,0,0,9999,0,1,0,0,0,9999
849116,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,80,2020-04-11,0,...,0,0,0,9999,1,0,0,0,0,9999
849117,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,85,2020-04-11,1,...,0,0,0,9999,1,0,0,0,0,9999
849118,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,60,2020-04-13,0,...,0,0,0,9999,1,0,0,0,0,9999


In [14]:
# # None values mark rows with standardization problems.
# # Check for None values in all columns.

# # Erros on province column.
# prov_ko_count = len(df_covid19_agg[df_covid19_agg['region'] == None])
# if prov_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'region_str\' column.'.format(sex_ko_count))

# # Erros on date column.
# date_ko_count = len(df_covid19_agg[df_covid19_agg['date'] == None])
# if date_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'date_str\' column.'.format(date_ko_count))
    
# # Erros on num_cases column.
# num_cases_ko_count = len(df_covid19_agg[df_covid19_agg['num_cases'] == None])
# if num_cases_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'num_cases_str\' column.'.format(num_cases_ko_count))

# # Erros on num_hosp column.
# num_hosp_ko_count = len(df_covid19_agg[df_covid19_agg['num_hosp'] == None])
# if num_hosp_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'num_hosp_str\' column.'.format(num_hosp_ko_count))

# # Erros on num_icu column.
# num_icu_ko_count = len(df_covid19_agg[df_covid19_agg['num_icu'] == None])
# if num_icu_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'num_icu_str\' column.'.format(num_icu_ko_count))
    
# # Erros on num_deaths column.
# num_deaths_ko_count = len(df_covid19_agg[df_covid19_agg['num_deaths'] == None])
# if num_deaths_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'num_deaths_str\' column.'.format(num_deaths_ko_count))
    
# # Erros on num_recov column.
# num_recov_ko_count = len(df_covid19_agg[df_covid19_agg['num_recov'] == None])
# if num_recov_ko_count != 0: 
#     sys.exit('Found {0} rows with incorrect values on \'num_recov_str\' column.'.format(num_recov_ko_count))

### Conversion to Standard Datasets

In [15]:
# Applying type conversion functions to dataset.

# Text columns: standard (lower-case) name -> raw source column.
text_column_map = {
    'id_regione': 'REG',
    'regione': 'NOME_REGIONE',
    'id_provincia': 'PROV',
    'provincia': 'NOME_PROVINCIA',
    'id_comune': 'COD_PROVCOM',
    'comune': 'NOME_COMUNE',
    'data_inizio_diff': 'DATA_INIZIO_DIFF',
}
for std_column, raw_column in text_column_map.items():
    df_deaths[std_column] = df_deaths[raw_column].astype(str)

# Age becomes an integer and the day a timestamp.
df_deaths['eta'] = df_deaths['CL_ETA'].apply(int)
df_deaths['date'] = df_deaths['GE'].apply(lambda ge: pd.to_datetime(str(ge), format = date_format_ingest_std))

# Yearly counts: 'MASCHI_15' -> 'maschi_2015', ..., 'FEMMINE_20' -> 'femmine_2020'.
for sex_prefix in ('MASCHI', 'FEMMINE'):
    for year_suffix in range(15, 21):
        raw_column = '{0}_{1}'.format(sex_prefix, year_suffix)
        std_column = '{0}_20{1}'.format(sex_prefix.lower(), year_suffix)
        df_deaths[std_column] = df_deaths[raw_column].apply(int)

In [16]:
df_deaths

Unnamed: 0,REG,PROV,NOME_REGIONE,NOME_PROVINCIA,NOME_COMUNE,COD_PROVCOM,DATA_INIZIO_DIFF,CL_ETA,GE,MASCHI_15,...,maschi_2017,maschi_2018,maschi_2019,maschi_2020,femmine_2015,femmine_2016,femmine_2017,femmine_2018,femmine_2019,femmine_2020
0,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-02,0,...,0,0,0,0,0,0,0,1,0,0
1,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-04,0,...,0,0,0,0,0,1,0,0,0,0
2,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-05,0,...,0,0,0,1,0,0,0,0,0,0
3,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-06,1,...,0,0,0,0,0,0,0,0,0,0
4,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-06,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,75,2020-04-11,0,...,0,0,0,9999,0,0,0,0,0,9999
849116,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,80,2020-04-11,0,...,0,0,0,9999,1,0,0,0,0,9999
849117,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,85,2020-04-11,1,...,0,0,0,9999,0,0,0,0,0,9999
849118,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,60,2020-04-13,0,...,0,0,0,9999,1,0,0,0,0,9999


In [17]:
# Split dataset in zone dataset and data dataset

# Zone dataset: the distinct region / province / comune combinations.
zone_columns = ['id_regione', 'regione', 'id_provincia', 'provincia', 'id_comune', 'comune']
df_zones = df_deaths.loc[:, zone_columns].drop_duplicates()
df_zones

Unnamed: 0,id_regione,regione,id_provincia,provincia,id_comune,comune
0,01,Piemonte,001,Torino,001001,Agliè
86,01,Piemonte,001,Torino,001002,Airasca
143,01,Piemonte,001,Torino,001003,Ala di Stura
155,01,Piemonte,001,Torino,001004,Albiano d'Ivrea
207,01,Piemonte,001,Torino,001006,Almese
...,...,...,...,...,...,...
848768,20,Sardegna,111,Sud Sardegna,111103,Villaputzu
848867,20,Sardegna,111,Sud Sardegna,111104,Villasalto
848902,20,Sardegna,111,Sud Sardegna,111105,Villasimius
848962,20,Sardegna,111,Sud Sardegna,111106,Villasor


In [18]:
# Saving the standard zones dataset (de-duplicated region / province / comune rows) as CSV, without the index
df_zones.to_csv('../../data/standard/ZonesIT.csv', index = False)

In [19]:
df_deaths

Unnamed: 0,REG,PROV,NOME_REGIONE,NOME_PROVINCIA,NOME_COMUNE,COD_PROVCOM,DATA_INIZIO_DIFF,CL_ETA,GE,MASCHI_15,...,maschi_2017,maschi_2018,maschi_2019,maschi_2020,femmine_2015,femmine_2016,femmine_2017,femmine_2018,femmine_2019,femmine_2020
0,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-02,0,...,0,0,0,0,0,0,0,1,0,0
1,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-04,0,...,0,0,0,0,0,1,0,0,0,0
2,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-05,0,...,0,0,0,1,0,0,0,0,0,0
3,01,001,Piemonte,Torino,Agliè,001001,1 aprile,80,2020-01-06,1,...,0,0,0,0,0,0,0,0,0,0
4,01,001,Piemonte,Torino,Agliè,001001,1 aprile,85,2020-01-06,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,75,2020-04-11,0,...,0,0,0,9999,0,0,0,0,0,9999
849116,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,80,2020-04-11,0,...,0,0,0,9999,1,0,0,0,0,9999
849117,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,85,2020-04-11,1,...,0,0,0,9999,0,0,0,0,0,9999
849118,20,111,Sardegna,Sud Sardegna,Villaspeciosa,111107,Dati 2020 n.d.,60,2020-04-13,0,...,0,0,0,9999,1,0,0,0,0,9999


In [20]:
# Remove raw columns and zone columns

# Zone descriptor columns; 'id_comune' is deliberately not in this list.
zone_columns_to_delete = ['id_regione', 'regione', 'id_provincia', 'provincia', 'comune']
# Raw source columns are the ones whose name is unchanged by upper-casing.
raw_columns_to_delete = [name for name in df_deaths.columns if name == name.upper()]
columns_to_delete = zone_columns_to_delete + raw_columns_to_delete
columns_to_delete

['id_regione',
 'regione',
 'id_provincia',
 'provincia',
 'comune',
 'REG',
 'PROV',
 'NOME_REGIONE',
 'NOME_PROVINCIA',
 'NOME_COMUNE',
 'COD_PROVCOM',
 'DATA_INIZIO_DIFF',
 'CL_ETA',
 'GE',
 'MASCHI_15',
 'MASCHI_16',
 'MASCHI_17',
 'MASCHI_18',
 'MASCHI_19',
 'MASCHI_20',
 'FEMMINE_15',
 'FEMMINE_16',
 'FEMMINE_17',
 'FEMMINE_18',
 'FEMMINE_19',
 'FEMMINE_20',
 'TOTALE_15',
 'TOTALE_16',
 'TOTALE_17',
 'TOTALE_18',
 'TOTALE_19',
 'TOTALE_20']

In [21]:
# Drop the collected raw and zone columns from df_deaths in place.
df_deaths.drop(columns = columns_to_delete, inplace = True)

In [22]:
df_deaths

Unnamed: 0,id_comune,data_inizio_diff,eta,date,maschi_2015,maschi_2016,maschi_2017,maschi_2018,maschi_2019,maschi_2020,femmine_2015,femmine_2016,femmine_2017,femmine_2018,femmine_2019,femmine_2020
0,001001,1 aprile,80,2020-01-02,0,0,0,0,0,0,0,0,0,1,0,0
1,001001,1 aprile,85,2020-01-04,0,0,0,0,0,0,0,1,0,0,0,0
2,001001,1 aprile,85,2020-01-05,0,0,0,0,0,1,0,0,0,0,0,0
3,001001,1 aprile,80,2020-01-06,1,0,0,0,0,0,0,0,0,0,0,0
4,001001,1 aprile,85,2020-01-06,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849115,111107,Dati 2020 n.d.,75,2020-04-11,0,1,0,0,0,9999,0,0,0,0,0,9999
849116,111107,Dati 2020 n.d.,80,2020-04-11,0,0,0,0,0,9999,1,0,0,0,0,9999
849117,111107,Dati 2020 n.d.,85,2020-04-11,1,0,0,0,0,9999,0,0,0,0,0,9999
849118,111107,Dati 2020 n.d.,60,2020-04-13,0,0,0,0,0,9999,1,0,0,0,0,9999


In [23]:
# Order dataset in place by its first four columns
sort_keys = df_deaths.columns[: 4].tolist()
df_deaths.sort_values(by = sort_keys, inplace = True)

In [24]:
# Persist the standard deaths dataset as CSV, leaving the index out
df_deaths.to_csv(path_or_buf = '../../data/standard/DeathsIT.csv', index = False)

### Gap Analysis

In [25]:
# Rows whose 2020 male or female count equals 9999, showing only when the
# comune's data starts and the day.
is_9999_in_2020 = df_deaths['maschi_2020'].eq(9999) | df_deaths['femmine_2020'].eq(9999)
df_deaths.loc[is_9999_in_2020, ['data_inizio_diff', 'date']]

Unnamed: 0,data_inizio_diff,date
85,1 aprile,2020-04-26
83,1 aprile,2020-04-10
79,1 aprile,2020-04-06
81,1 aprile,2020-04-08
80,1 aprile,2020-04-06
...,...,...
849100,Dati 2020 n.d.,2020-03-02
849111,Dati 2020 n.d.,2020-03-26
849094,Dati 2020 n.d.,2020-02-15
849097,Dati 2020 n.d.,2020-02-21


In [26]:
def _count_comunes(frame):
    """Number of distinct 'id_comune' values in `frame`."""
    return len(frame['id_comune'].unique())

# Rows flagged as having no 2020 data vs. the rest.
has_no_data = df_deaths['data_inizio_diff'] == 'Dati 2020 n.d.'

print("TOTAL COMUNES: {0}".format(_count_comunes(df_deaths)))
print(" - TOTAL comunes sin datos: {0}".format(_count_comunes(df_deaths[has_no_data])))
print(" - TOTAL comunes con datos: {0}".format(_count_comunes(df_deaths[~has_no_data])))
print("   - comunes con datos desde el 1 de abril: {0}".format(_count_comunes(df_deaths[df_deaths['data_inizio_diff'] == '1 aprile'])))
print("   - comunes con datos desde el 8 de abril: {0}".format(_count_comunes(df_deaths[df_deaths['data_inizio_diff'] == '8 aprile'])))
print("   - comunes con datos desde el 16 de abril: {0}".format(_count_comunes(df_deaths[df_deaths['data_inizio_diff'] == '16 aprile'])))

TOTAL COMUNES: 7899
 - TOTAL comunes sin datos: 6210
 - TOTAL comunes con datos: 1689
   - comunes con datos desde el 1 de abril: 1084
   - comunes con datos desde el 8 de abril: 366
   - comunes con datos desde el 16 de abril: 239


### Filling Gaps on TimeSeries 1

In [None]:
# Remember the full column order before the frame is reshaped.
all_columns = df_deaths.columns.tolist()
all_columns

In [None]:
# The first two columns are the ones merged into the 'zona' key below.
fixed_columns = df_deaths.columns[: 2].tolist()
fixed_columns

In [None]:
def _join_zone_parts(row):
    """Join the non-null values of one row with '||'."""
    return "||".join(str(part) for part in row.dropna().values.tolist())

# Build the composite 'zona' key from the fixed columns and put it first.
zona_values = df_deaths[fixed_columns].apply(_join_zone_parts, axis = 1)
df_deaths.insert(loc = 0, column = 'zona', value = zona_values)

In [None]:
df_deaths

In [None]:
# The fixed columns are now encoded in 'zona', so drop them in place.
df_deaths.drop(fixed_columns, axis = 1, inplace = True)
df_deaths

In [None]:
# The first three columns identify a row and become the index levels.
index_columns = df_deaths.columns[: 3].tolist()
index_columns

In [None]:
# Move the identifying columns into a MultiIndex (in place); the remaining
# columns are the values.
df_deaths.set_index(index_columns, inplace = True)
df_deaths

In [None]:
# Columns left after set_index, kept as a pandas Index (not a list).
var_columns = df_deaths.columns
var_columns

In [None]:
# Cartesian product of the index levels: every combination of level values,
# whether or not it occurs in the data.
new_index = pd.MultiIndex.from_product(df_deaths.index.levels)

In [None]:
df_deaths.index.duplicated()

In [None]:
# Positions of the rows whose index entry repeats an earlier one.
[position for position, is_repeat in enumerate(df_deaths.index.duplicated()) if is_repeat]

In [None]:
# Show the rows at positions 121837 to 121842, i.e. around position 121840.
# NOTE(review): 121840 is hard-coded - presumably one of the duplicated positions
# found above; confirm it is still meaningful for the current data.
df_deaths.iloc[ 121840-3:121840+3 , : ]

In [None]:
# Expand the frame to every combination in new_index; combinations that were
# absent get NaN in the value columns.
# NOTE(review): reindexing expects a unique index and the preceding cells look
# for duplicated index entries - confirm they are resolved before this runs.
df_deaths = df_deaths.reindex(new_index)

In [None]:
df_deaths

In [None]:
# Turn the index levels back into ordinary columns (in place).
df_deaths.reset_index(inplace = True)
df_deaths

In [None]:
# Rebuild the fixed columns from 'zona'.
# - The separator is escaped: a multi-character pattern is treated as a regular
#   expression, where '||' is an empty alternation rather than two literal pipes.
# - The target is the plain column list: wrapping it in another list is not a
#   valid column selection.
df_deaths[fixed_columns] = df_deaths['zona'].str.split(r'\|\|', expand = True)

In [None]:
df_deaths

In [None]:
# Preview the frame restricted to the original column order; the reassignment
# itself is left disabled, so df_deaths is not modified here.
# df_deaths = df_deaths[all_columns]
df_deaths[all_columns]

In [None]:
# Scratch comparison of the value columns against the current columns.
#df_deaths = df_deaths[var_columns + [item for item in df_deaths.columns if item not in var_columns]]
# The list below is built and discarded: it is not the last expression of the
# cell, so only the two print calls produce output.
[item for item in df_deaths.columns.tolist() if item not in var_columns.tolist()]
#df_deaths.columns.tolist()
print(var_columns.tolist())
print(df_deaths.columns.tolist())

In [None]:
# Order dataset 
#df_deaths.sort_values(list(df_deaths.columns[: 6]), inplace = True)

In [None]:
# Saving the reindexed dataset as CSV without the index, under a separate
# file name (DeathsIT1.csv)
df_deaths.to_csv('../../data/standard/DeathsIT1.csv', index = False)

### DMM

In [None]:
# Reload DeathsIT1.csv, parsing 'date' as datetime. This replaces the
# in-memory df_deaths with the file contents.
df_deaths = pd.read_csv('../../data/standard/DeathsIT1.csv', parse_dates = ['date'])

In [None]:
# Per-column dtypes (a DataFrame exposes `dtypes`; it has no `dtype` attribute).
df_deaths.dtypes

In [None]:
# Set missing counts to 0 and cast the count columns to integers.
# - Only the 'maschi_*' / 'femmine_*' columns are touched: the frame also holds
#   text and date columns that cannot be cast to int.
# - The result is assigned back: fillna(..., inplace = False) returns a new
#   object and leaves df_deaths untouched.
count_columns = [name for name in df_deaths.columns if name.startswith(('maschi_', 'femmine_'))]
df_deaths[count_columns] = df_deaths[count_columns].fillna(0).astype(int)
df_deaths

In [None]:
fixed_columns

In [None]:
# Rebuild the fixed columns from 'zona'.
# - The separator is escaped: a multi-character pattern is treated as a regular
#   expression, where '||' is an empty alternation rather than two literal pipes.
# - The target is the plain column list: wrapping it in another list is not a
#   valid column selection.
df_deaths[fixed_columns] = df_deaths['zona'].str.split(r'\|\|', expand = True)

In [None]:
# Per-column dtypes (a DataFrame exposes `dtypes`; it has no `dtype` attribute).
df_deaths.dtypes

In [None]:
# Cast every 'eta' value to a Python int.
df_deaths['eta'] = df_deaths['eta'].map(int)

In [None]:
# Per-column dtypes (a DataFrame exposes `dtypes`; it has no `dtype` attribute).
df_deaths.dtypes

In [None]:
# The first '||'-separated part of 'zona' becomes the leading 'regione' column.
# NOTE(review): df4 is not defined in any cell visible in this notebook -
# confirm where it is created.
df4.insert(loc = 0, column = 'regione', value = df4['zona'].apply(lambda x: x.split('||')[0]))

In [None]:
# The second '||'-separated part of 'zona' becomes 'provincia', at position 1.
df4.insert(loc = 1, column = 'provincia', value = df4['zona'].apply(lambda x: x.split('||')[1]))

In [None]:
# Remaining '||'-separated parts of 'zona': comune, data_inizio_diff and eta.
# 'provincia' is not inserted again here: the previous cell already adds it,
# and DataFrame.insert raises ValueError for a column name that already exists.
# NOTE(review): df4 is not defined in any cell visible in this notebook -
# confirm where it is created.
df4.insert(loc = 2, column = 'comune', value = df4['zona'].apply(lambda x: x.split('||')[2]))
df4.insert(loc = 3, column = 'data_inizio_diff', value = df4['zona'].apply(lambda x: x.split('||')[3]))
df4.insert(loc = 4, column = 'eta', value = df4['zona'].apply(lambda x: int(x.split('||')[4])))
#df0.drop(['regione', 'provincia', 'comune', 'data_inizio_diff', 'eta'], axis = 1, inplace = True)

In [None]:
# 'zona' has been split into separate columns, so remove it in place.
df4.drop(columns = ['zona'], inplace = True)

In [None]:
df4

In [None]:
# Order dataset in place by its leading key columns.
# NOTE(review): the sort keys are the first six columns of df_deaths, not of
# df4 - confirm both frames share those columns.
df4.sort_values(list(df_deaths.columns[: 6]), inplace = True)

# Saving standard dataset with aggregated values.
# Fixed: the '.' between 'df4' and 'to_csv' was missing, which raised NameError.
df4.to_csv('../../data/standard/DeathsIT2.csv', index = False)