This notebook handles the data preprocessing for the regional datasets of the countries Germany, Italy and Sweden.

Sources:

Germany: RKI https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/nCoV.html 
         under section "Daten zum Download" with the link "Dashboard-Daten"
         This file includes all the data that is used for the RKI Dashboard. The structure of each row are information
         about the amount of infections and deaths within an agegroup with a specific sex and within a specific county 
         for each day since January.
         Goal of preprocessing: To summarize the data to a format on a Bundesland level only considering the combined 
         amount of cases not broken down into age and sex.
         Finished!
         
Italy:   Dipartimento della Protezione Civile https://github.com/pcm-dpc/COVID-19
         under COVID-19/legacy/dati-regioni/dpc-covid19-ita-regioni.csv
         File contains the amount of new cases and other data for each day and for each Italian region

Sweden:  Folkhälsomyndigheten 
         https://www.folkhalsomyndigheten.se/smittskydd-beredskap/utbrott/aktuella-utbrott/covid-19/statistik-och-analyser/bekraftade-fall-i-sverige/
         under section Ladda ner data => Data som statistiken ovan bygger på kan laddas ner här (Excel)
         File contains the amount of new cases for each day for the whole country and for each Swedish region.

In [49]:
#import necessary libraries
import pandas as pd
import xlrd

In [9]:
#data for German Regions
#load data, select necessary columns and split data into sub dataframes based on Bundesland
data = pd.read_csv("https://opendata.arcgis.com/datasets/dd4580c810204019a7b8eb3e0b329dd6_0.csv")
data = data[["IdBundesland", "Bundesland", "AnzahlFall", "Meldedatum"]]
data["Meldedatum"] = data["Meldedatum"].apply(pd.to_datetime)


export = pd.DataFrame(columns=["Bundesland", "Cases", "Date"])

for i in range (1,17):
    
        df = pd.DataFrame(columns=["Bundesland", "Cases", "Date"])
        
        for date in data['Meldedatum'].drop_duplicates():
                
            case_aggregation = data.loc[(data['IdBundesland'] == i) & (data['Meldedatum'] == date)].reset_index(drop=True)
            cases_bundesland = case_aggregation["AnzahlFall"].sum()
            
            #try except in case that there is no data for a given date, it adds a row for no new cases on that given date
            try:
                new_row = {'Bundesland': case_aggregation.loc[0, "Bundesland"],
                           'Cases':cases_bundesland,
                           'Date':date}
            except KeyError:
                new_row = {'Bundesland': df.iloc[-1,0],
                           'Cases':0,
                           'Date':date} 
            df = df.append(new_row, ignore_index=True)
            
        export = export.append(df.sort_values(by=['Date']), ignore_index=True)
        
    
export.to_csv('German_Regional_Data_processed.csv', index=False)


Unnamed: 0,Bundesland,Cases,Date
0,Schleswig-Holstein,0,2020-01-28
1,Schleswig-Holstein,0,2020-01-29
2,Schleswig-Holstein,0,2020-01-31
3,Schleswig-Holstein,0,2020-02-03
4,Schleswig-Holstein,0,2020-02-04
...,...,...,...
3899,Thüringen,54,2020-10-10
3900,Thüringen,8,2020-10-11
3901,Thüringen,56,2020-10-12
3902,Thüringen,57,2020-10-13


In [41]:
#data for Italian Regions
#load data, and selecect necessary rows

ita_data = pd.read_csv("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/legacy/dati-province/dpc-covid19-ita-province.csv")
ita_data["data"] = ita_data["data"].apply(pd.to_datetime)

ita_export = ita_data[["data", "codice_regione", "denominazione_regione", "nuovi_positivi"]]
ita_export.to_csv('Italian_Regional_Data_processed.csv', index=False)
ita_export

Unnamed: 0,data,codice_regione,denominazione_regione,nuovi_positivi
0,2020-02-24 18:00:00,13,Abruzzo,0
1,2020-02-24 18:00:00,17,Basilicata,0
2,2020-02-24 18:00:00,21,P.A. Bolzano,0
3,2020-02-24 18:00:00,18,Calabria,0
4,2020-02-24 18:00:00,15,Campania,0
...,...,...,...,...
4930,2020-10-15 17:00:00,19,Sicilia,399
4931,2020-10-15 17:00:00,9,Toscana,581
4932,2020-10-15 17:00:00,10,Umbria,263
4933,2020-10-15 17:00:00,2,Valle d'Aosta,67


In [70]:
#data for Swedish Regions
#load data, and selecect necessary rows

swe_data = pd.read_excel("https://www.arcgis.com/sharing/rest/content/items/b5e7488e117749c19881cce45db13f7e/data", sheet_name='Antal per dag region')
swe_data.columns = swe_data.columns.astype(str)
swe_data['Statistikdatum'] = swe_data['Statistikdatum'].apply(pd.to_datetime)
print(swe_data.columns)
swe_export = pd.DataFrame(columns=["Statistikdatum", "Region", "Cases"])

for i in range (2,23): #iterate over the different region
    
    for j in range (1,len(swe_data)): #iterate over the dates
        new_row = {'Statistikdatum': swe_data.loc[j, 'Statistikdatum'],
                           'Region': swe_data.columns.values[i],
                           'Cases': swe_data.iloc[j,i]}
    
        swe_export = swe_export.append(new_row, ignore_index=True )

swe_export.to_csv('Swedish_Regional_Data_processed.csv', index=False)
swe_export

Index(['Statistikdatum', 'Totalt_antal_fall', 'Blekinge', 'Dalarna', 'Gotland',
       'Gävleborg', 'Halland', 'Jämtland_Härjedalen', 'Jönköping', 'Kalmar',
       'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Sörmland', 'Uppsala',
       'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland',
       'Västra_Götaland', 'Örebro', 'Östergötland'],
      dtype='object')


Unnamed: 0,Statistikdatum,Region,Cases
0,2020-02-05,Blekinge,0
1,2020-02-06,Blekinge,0
2,2020-02-07,Blekinge,0
3,2020-02-08,Blekinge,0
4,2020-02-09,Blekinge,0
...,...,...,...
5308,2020-10-10,Östergötland,16
5309,2020-10-11,Östergötland,9
5310,2020-10-12,Östergötland,46
5311,2020-10-13,Östergötland,85


Unnamed: 0,ObjectId,IdBundesland,Bundesland,Landkreis,Altersgruppe,Geschlecht,AnzahlFall,AnzahlTodesfall,Meldedatum,IdLandkreis,Datenstand,NeuerFall,NeuerTodesfall,Refdatum,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn,Altersgruppe2
0,44082977,1,Schleswig-Holstein,SK Flensburg,A00-A04,M,1,0,2020/09/30 00:00:00,1001,"15.10.2020, 00:00 Uhr",0,-9,2020/09/30 00:00:00,0,1,0,Nicht übermittelt
1,44082978,1,Schleswig-Holstein,SK Flensburg,A00-A04,W,1,0,2020/08/24 00:00:00,1001,"15.10.2020, 00:00 Uhr",0,-9,2020/08/24 00:00:00,0,1,0,Nicht übermittelt
2,44082979,1,Schleswig-Holstein,SK Flensburg,A00-A04,W,1,0,2020/09/26 00:00:00,1001,"15.10.2020, 00:00 Uhr",0,-9,2020/09/26 00:00:00,0,1,0,Nicht übermittelt
3,44082980,1,Schleswig-Holstein,SK Flensburg,A05-A14,M,1,0,2020/09/25 00:00:00,1001,"15.10.2020, 00:00 Uhr",0,-9,2020/09/21 00:00:00,0,1,1,Nicht übermittelt
4,44082981,1,Schleswig-Holstein,SK Flensburg,A05-A14,M,1,0,2020/09/26 00:00:00,1001,"15.10.2020, 00:00 Uhr",0,-9,2020/09/21 00:00:00,0,1,1,Nicht übermittelt
