# <u> Covid-19 ECDC-Data Exploration </u>

In [1]:
import os
import wget
import pandas as pd
import matplotlib.pyplot as plt

## Download Most Recent and Load all Data Sets

In [2]:
# create data directory if it doesnt exist yet
if not 'data' in os.listdir():
    ! mkdir data

data_dir = './data/'

# clean data dir from old data (some sets are updated on a daily bases)
! rm data/*


# add urls to data file for new data sources here

# From European Centre for Disease Prevention and Control:
cases_deaths_url = 'https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv/data.csv'
hospitalization_url = 'https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv'
tests_url = 'https://opendata.ecdc.europa.eu/covid19/testing/csv/data.csv'
variants_url = 'https://opendata.ecdc.europa.eu/covid19/virusvariant/csv/data.csv'
vaccinations_url = 'https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv'

# add them to the dictionary and specify the desired file name
download_dict = {'cases_deaths.csv': cases_deaths_url, 'hospitalizations.csv': hospitalization_url,
                'tests.csv': tests_url, 'vaccinations.csv': vaccinations_url, 'variants.csv': variants_url}

for dict_item in download_dict.items():
    # download data file
    wget.download(dict_item[1], data_dir + dict_item[0])
    # load data frame named by filename
    df_name = dict_item[0].split('.')[0]
    globals()[df_name] = pd.read_csv(data_dir + str(dict_item[0])) # use string as variable name

rm: data/*: No such file or directory


In [3]:
print('Loaded: \n')
for filename in download_dict.keys():
    print(filename + ', with variables:' + '\n')
    print(globals()[filename.split('.')[0]].columns.values, '\n \n')

Loaded: 

cases_deaths.csv, with variables:

['dateRep' 'day' 'month' 'year' 'cases' 'deaths' 'countriesAndTerritories'
 'geoId' 'countryterritoryCode' 'popData2020' 'continentExp'] 
 

hospitalizations.csv, with variables:

['country' 'indicator' 'date' 'year_week' 'value' 'source' 'url'] 
 

tests.csv, with variables:

['country' 'country_code' 'year_week' 'level' 'region' 'region_name'
 'new_cases' 'tests_done' 'population' 'testing_rate' 'positivity_rate'
 'testing_data_source'] 
 

vaccinations.csv, with variables:

['YearWeekISO' 'FirstDose' 'FirstDoseRefused' 'SecondDose' 'UnknownDose'
 'NumberDosesReceived' 'Region' 'Population' 'ReportingCountry'
 'TargetGroup' 'Vaccine' 'Denominator'] 
 

variants.csv, with variables:

['country' 'country_code' 'year_week' 'source' 'new_cases'
 'number_sequenced' 'percent_cases_sequenced' 'valid_denominator'
 'variant' 'number_detections_variant' 'percent_variant'] 
 



## Overview over ECDPC Hospital Admission Data

In [5]:
hosp_indicators = hospitalizations.indicator.unique()
print('\n \n More specifically, we have data on: \n', hospitalizations.indicator.unique())

# isolate weekly icu admissions per 100k
hosp_by_indicator = hospitalizations.groupby('indicator')
weekly_icu = hosp_by_indicator.get_group(hosp_indicators[-1])

weekly_icu.info()
# join weekly_icu with testing data set
# weekly_icu.join(tests, on='year_week')


 
 More specifically, we have data on: 
 ['Daily hospital occupancy' 'Daily ICU occupancy'
 'Weekly new hospital admissions per 100k'
 'Weekly new ICU admissions per 100k']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 5047 to 25612
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1100 non-null   object 
 1   indicator  1100 non-null   object 
 2   date       1100 non-null   object 
 3   year_week  1100 non-null   object 
 4   value      1100 non-null   float64
 5   source     1100 non-null   object 
 6   url        0 non-null      object 
dtypes: float64(1), object(6)
memory usage: 68.8+ KB


In [6]:
# ICU-Data for how many and which countries?
countries = hospitalizations.country.unique()
icu_countries = weekly_icu.country.unique()
print('No. Countries Appearing in the Hospitalization Data Set:', len(countries), '\n \n', countries)
print('\n \n No. Countries with weekly ICU admission Data:', len(icu_countries), '\n \n', icu_countries)

No. Countries Appearing in the Hospitalization Data Set: 30 
 
 ['Austria' 'Belgium' 'Bulgaria' 'Croatia' 'Cyprus' 'Czechia' 'Denmark'
 'Estonia' 'Finland' 'France' 'Germany' 'Greece' 'Hungary' 'Iceland'
 'Ireland' 'Italy' 'Latvia' 'Liechtenstein' 'Lithuania' 'Luxembourg'
 'Malta' 'Netherlands' 'Norway' 'Poland' 'Portugal' 'Romania' 'Slovakia'
 'Slovenia' 'Spain' 'Sweden']

 
 No. Countries with weekly ICU admission Data: 15 
 
 ['Cyprus' 'Czechia' 'Estonia' 'France' 'Greece' 'Iceland' 'Ireland'
 'Latvia' 'Lithuania' 'Netherlands' 'Norway' 'Slovakia' 'Slovenia' 'Spain'
 'Sweden']


## Use Our World in Data Data Set Instead. It encompasses a wider range of variables.