In [4]:
#############################################
# Data source: Dipartimento della Protezione Civile COVID-19 repository
# https://github.com/pcm-dpc/COVID-19
#############################################
path = 'dpc-covid19-ita-andamento-nazionale.csv'

# National time series
ita = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv')

# Per-region time series
regioni = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv')

# Per-province time series; `url` is left bound to the province address
url = 'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv'
prov = pd.read_csv(url)

#############################################
# Manipulate dataframes
#############################################
# Italia #################
# Rename the Italian column names used downstream to English and tag every
# national row with the pseudo-region 'Italia'.
ita = ita.rename(columns={
    'data': 'date',
    'totale_casi': 'cases',
    'deceduti' : 'deaths',
    'tamponi' : 'tests',
    'dimessi_guariti': 'recovered',
}).assign(region='Italia')
# Timestamps are parsed from strings in the form YYYY-MM-DDTHH:MM:SS
ita['date'] = ita['date'].map(lambda stamp: datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'))

# North, Centre, South #################
nord = [
    'Lombardia', 'Emilia-Romagna', 'Friuli Venezia Giulia', 'Liguria',
    'P.A. Bolzano', 'P.A. Trento', 'Piemonte', 'Valle d\'Aosta', 'Veneto',
]
centro = ['Abruzzo', 'Lazio', 'Marche', 'Molise', 'Toscana', 'Umbria']
sud = ['Basilicata', 'Calabria', 'Campania', 'Puglia', 'Sardegna', 'Sicilia']
def zone(region):
    """Return the macro-area label ('Nord', 'Centro' or 'Sud') of a region.

    The region name is looked up in the module-level lists `nord`, `centro`
    and `sud`, in that order. A name found in none of them prints an error
    message and yields None.
    """
    for label, members in (('Nord', nord), ('Centro', centro), ('Sud', sud)):
        if region in members:
            return label
    print('Error: No zone assigned to region!')


# Regioni #################
# Rename the Italian column names used downstream to English.
regioni = regioni.rename(columns={
    'data': 'date',
    'totale_casi': 'cases',
    'deceduti' : 'deaths',
    'denominazione_regione' : 'region',
    'tamponi' : 'tests',
    'dimessi_guariti': 'recovered'
})
regioni['date'] = regioni['date'].map(lambda date: datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S'))

# Label every row with its macro-area, then order rows by region and date.
regioni['zone'] = regioni['region'].map(zone)
regioni = regioni.sort_values(['region', 'date'])

# Aree (Nord, Centro, Sud) #################
# One row per (zone, date) holding the sum over the regions of that zone.
# numeric_only=True restricts the sum to numeric columns; without it current
# pandas also tries to add up the text columns of the regional file.
aree = regioni.groupby(['zone', 'date'], as_index=False).sum(numeric_only=True)
aree['region'] = aree['zone']

# merge dataframes #################
# Stack national, macro-area and regional rows into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append no longer exists
# in pandas 2.0 and later.
data = pd.concat([ita, aree, regioni], ignore_index=True)

# ISTAT population data as of 01/01/2019
population = {
    'Lombardia': 10.04e6,
    'Emilia-Romagna': 4.45e6,
    'Veneto': 4.91e6,
    'Piemonte': 4.38e6,
    'Marche': 1.53e6,
    'Toscana': 3.74e6,
    'Liguria': 1.56e6,
    'Lazio': 5.90e6,
    'Campania': 5.83e6,
    'Friuli Venezia Giulia': 1.22e6,
    'P.A. Trento': 541e3,
    'P.A. Bolzano': 531e3,
    'Puglia': 4.05e6,
    'Abruzzo': 1.32e6,
    'Sicilia': 4999891,
    'Umbria': 885e3,
    'Sardegna': 1.65e6,
    'Valle d\'Aosta': 126e3,
    'Calabria': 1.96e6,
    'Basilicata': 567e3,
    'Molise': 308e3,
    'Nord': 27758000,
    'Centro': 13683000,
    'Sud': 19056891,
    'Italia': 60497891
}
# add population information
# Look every row's region up in the table. `.at` only addresses a single
# cell, so it cannot be used with a boolean row mask; mapping the column
# through the dict fills all rows at once and leaves NaN where the region
# is not listed.
data['population'] = data['region'].map(population)
# Unlisted regions get a huge sentinel population so that the per-100k
# ratios computed later come out close to zero rather than NaN.
data['population'] = data['population'].replace(np.nan, 9999999999)
# Every remaining NaN in the frame becomes a tiny positive number.
# NOTE(review): presumably to keep later ratios away from 0/0 - confirm.
data = data.replace(np.nan,0.0000000001)

# add differential data
# `data` stacks the rows of many regions one after another, so the
# day-over-day change has to be taken within each region. An ungrouped diff
# would subtract the last row of one region from the first row of the next.
# The first row of every region therefore gets NaN.
for col in ['cases', 'recovered', 'deaths','tests','terapia_intensiva','totale_positivi','ricoverati_con_sintomi']:
    data['new_'+col] = data.groupby('region')[col].diff()

# Three variants built from the daily change in 'terapia_intensiva':
#   max: change + new deaths + new recovered scaled by the previous row's
#        terapia_intensiva / ricoverati_con_sintomi ratio
#   (plain): change + new deaths, floored at 0
#   min: change + half of the new deaths, floored at 0
# The shift(1) below is positional, but on the first row of a region the
# new_* terms are NaN, so no value from the neighbouring region reaches the
# result.
data['new_terapia_intensiva_netta_max'] = data['new_terapia_intensiva'] + data['new_deaths'] + data['new_recovered']*data['terapia_intensiva'].shift(1)/data['ricoverati_con_sintomi'].shift(1)
data['new_terapia_intensiva_netta'] = (data['new_terapia_intensiva'] + data['new_deaths']).clip(lower=0)
data['new_terapia_intensiva_netta_min'] = (data['new_terapia_intensiva'] + data['new_deaths']*0.5).clip(lower=0)

# add data per 100k inhabitants
# Column arithmetic instead of a row-by-row apply: same formula, computed
# for the whole column at once.
for var in ['cases','new_cases','deaths','new_deaths','recovered','new_recovered','tests','new_tests','terapia_intensiva','new_terapia_intensiva','ricoverati_con_sintomi','new_ricoverati_con_sintomi']:
    data[var+'_per100k'] = data[var] / data['population'] * 1e5
    
# add growth rates
def delta(data, variables):
    '''Average growth rate over the last 1, 2, 3 and 4 days.

    For every name in `variables` and every span n in 1..4 a column
    `<name>_delta<n>` is added to `data`. It holds the mean of the n most
    recent ratios new_<name>[row] / <name>[row - 1], where "previous row" is
    positional (Series.shift). `data` must already contain both `<name>` and
    `new_<name>`. The frame is modified in place and also returned.
    '''
    for name in variables:
        for span in (1, 2, 3, 4):
            ratios = (
                data['new_' + name].shift(lag) / data[name].shift(lag + 1)
                for lag in range(span)
            )
            data[name + '_delta' + str(span)] = sum(ratios) / span
    return data
data = delta(data, ['cases', 'deaths', 'recovered', 'tests'])

# cast some columns to int
# NaN cannot be cast to int, so the remaining NaN values are zeroed first.
data = data.replace(np.nan,0.0000000000)
int_columns = ['cases','new_cases','deaths','new_deaths','recovered','new_recovered','population','tests','new_tests','terapia_intensiva','new_terapia_intensiva']
data[int_columns] = data[int_columns].astype(int)

# weekid = ISO year * 100 + ISO week number.
# isocalendar() returns (ISO year, ISO week, weekday). The week number must
# be paired with the ISO year: combined with the calendar year, 1 Jan 2021
# (ISO week 53 of 2020) would be labelled 202153 instead of 202053.
data['weekid'] = data['date'].map(lambda date: date.isocalendar()[0]*100 + date.isocalendar()[1])

# ordering and casting/formatting columns
data = data[['date','weekid','region','cases','new_cases','deaths','new_deaths','recovered','new_recovered','population',
             'tests','new_tests','terapia_intensiva','new_terapia_intensiva','new_terapia_intensiva_netta_min','new_terapia_intensiva_netta','new_terapia_intensiva_netta_max',
             'totale_ospedalizzati','ricoverati_con_sintomi','new_ricoverati_con_sintomi','isolamento_domiciliare','totale_positivi','new_totale_positivi',
             'cases_per100k','new_cases_per100k','deaths_per100k','new_deaths_per100k','recovered_per100k','new_recovered_per100k',
             'tests_per100k','new_tests_per100k','terapia_intensiva_per100k','new_terapia_intensiva_per100k','ricoverati_con_sintomi_per100k','new_ricoverati_con_sintomi_per100k',
             'cases_delta1','cases_delta2','cases_delta3','cases_delta4',
             'deaths_delta1','deaths_delta2','deaths_delta3','deaths_delta4',
             'recovered_delta1','recovered_delta2','recovered_delta3','recovered_delta4',
             'tests_delta1','tests_delta2','tests_delta3','tests_delta4',
            ]]

# store dataframe
data.to_pickle('dataframes/mydata_dpc_ita.p')

In [3]:
#############################################
# Read data from github repo: https://github.com/pcm-dpc/COVID-19
# Dipartimento della Protezione Civile
#############################################
url = 'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv'
prov = pd.read_csv(url)

# Province #################
# Rename the Italian column names used downstream to English.
prov = prov.rename(columns={
    'data': 'date',
    'totale_casi': 'cases',
    'denominazione_provincia': 'provincia',
    'denominazione_regione': 'region'
})
prov['date'] = prov['date'].map(lambda date: datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S'))
# weekid = ISO year * 100 + ISO week number.
# isocalendar() returns (ISO year, ISO week, weekday). The week number must
# be paired with the ISO year: combined with the calendar year, 1 Jan 2021
# (ISO week 53 of 2020) would be labelled 202153 instead of 202053.
prov['weekid'] = prov['date'].map(lambda date: date.isocalendar()[0]*100 + date.isocalendar()[1])
prov = prov.sort_values(['provincia', 'date', 'region'])

# remove "In fase di definizione/aggiornamento" rows
# regex=False: the placeholder name is matched as literal text.
prov = prov[~prov.provincia.str.contains("In fase di definizione/aggiornamento", regex=False)]

# add differential data
# The frame holds all provinces stacked together, so the daily change is
# taken within each (region, province) pair. An ungrouped diff would
# subtract one province's last row from the next province's first row.
# The first row of every province gets NaN.
prov['new_cases'] = prov.groupby(['region', 'provincia'])['cases'].diff()

# add growth rates
def delta(data, variables):
    '''Average growth rate over the last 1, 2, 3 and 4 days.

    For every name in `variables` and every span n in 1..4 a column
    `<name>_delta<n>` is added to `data`. It holds the mean of the n most
    recent ratios new_<name>[row] / <name>[row - 1], where "previous row" is
    positional (Series.shift). `data` must already contain both `<name>` and
    `new_<name>`. The frame is modified in place and also returned.
    '''
    for name in variables:
        for span in (1, 2, 3, 4):
            ratios = (
                data['new_' + name].shift(lag) / data[name].shift(lag + 1)
                for lag in range(span)
            )
            data[name + '_delta' + str(span)] = sum(ratios) / span
    return data
prov = delta(prov, ['cases'])

# Provincial population ###############
population_prov = {
    # sicilia
    'Enna': 164788,
    'Catania': 1107702,
    'Caltanissetta': 262458,
    'Palermo': 1252588,
    'Messina': 626876,
    'Agrigento': 434870,
    'Siracusa': 399224,
    'Ragusa': 320893,
    'Trapani': 430492,

    #'Bologna': 1e6,
    #'Bergamo': 1.11e6,
    #'Brescia': 1.26e6,
    #'Milano': 3.26e6,
    #'Rimini': 335e3,
    #'Roma': 4.35e6,
    #'Napoli': 3.12e6
}
# add population information
# Look every row's province up in the table. `.at` only addresses a single
# cell, so it cannot be used with a boolean row mask; mapping the column
# through the dict fills all rows at once and leaves NaN where the province
# is not listed.
prov['population'] = prov['provincia'].map(population_prov)
# Temporary sentinel so that the per-100k ratios of unlisted provinces come
# out close to zero rather than NaN; it is turned back into NaN below.
prov['population'] = prov['population'].replace(np.nan, 9999999999)

# add data per 100k inhabitants
# Column arithmetic instead of a row-by-row apply: same formula, computed
# for the whole column at once.
for var in ['cases','new_cases']:
    prov[var+'_per100k'] = prov[var] / prov['population'] * 1e5

prov['population'] = prov['population'].replace(9999999999, np.nan)

#store dataframe
prov.to_pickle('dataframes/mydata_dpc_ita_province.p')


# Utility functions ###############
def sorted_set_prov(df, province, var):
    '''Return the given provinces ordered by `var` on the last day.

    Keeps the rows of `df` whose `provincia` is in `province` and whose
    `date` equals the module-level `lastday`, sorts them by column `var` in
    descending order and returns the unique province names in that order
    (as returned by Series.unique()).
    '''
    # Both conditions are combined into one mask built on the same frame.
    # Chaining df[mask_a][mask_b] applies a mask computed on the full frame
    # to an already filtered one, which pandas has to re-align.
    selected = df.provincia.isin(province) & (df.date == lastday)
    df = df[selected].sort_values(var, ascending=False)
    sortedset = df.provincia.unique()
    return sortedset