In [3]:
import os

import pandas as pd

In [4]:
# Sub-folders of ../data: tables are read from the first and written to the second.
INPUT_FOLDER, OUTPUT_FOLDER = "raw", "preproc"

In [5]:
# Create the output directory before any table is written. The path is built
# from OUTPUT_FOLDER (instead of a hard-coded "preproc") so that changing the
# constant cannot leave the `to_csv` cells pointing at a folder that was never created.
os.makedirs(f"../data/{OUTPUT_FOLDER}", exist_ok=True)

In [6]:
# Map each table name to its archive file in the input folder.
# `os.path.splitext` removes only the trailing extension, whereas a blanket
# `.replace(".zip", "")` would also strip a ".zip" appearing inside the name.
# Sorting makes the mapping independent of the order `os.listdir` happens to return.
datasources = {
    os.path.splitext(source)[0]: source
    for source in sorted(os.listdir(f"../data/{INPUT_FOLDER}"))
    if source.endswith(".zip")
}
datasources

{'demographics': 'demographics.zip',
 'epidemiology': 'epidemiology.zip',
 'health': 'health.zip',
 'hospitalizations': 'hospitalizations.zip',
 'index': 'index.zip',
 'vaccinations': 'vaccinations.zip'}

## PREPROC

This part is a very important step in the process. In the preprocessing we filter tables individually, create directly derived variables, remove bad rows, remove unused columns and change bad values, among others.

In this case, we're going to perform all of these operations:
 - **epidemiology**
    - _Impute missing values_: Impute `new_confirmed` and `new_deceased`. Assume that these missings are 0.
    - _New derived variables_: Calculate `week` and `new_deceased_confirmed_ratio`
    - _Pick desired columns_: `date`, `week`, `location_key`, `new_confirmed`, `new_deceased`, `new_deceased_confirmed_ratio`
 - **demographics**
    - _Pick desired columns_: `location_key`, `population`, `population_age_*`
 - **health**
    - _Pick desired columns_: `location_key`, `life_expectancy`
 - **hospitalizations**
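    - _Impute missing values_: Impute `new_hospitalized_patients`. Assume that these missings are 0.
    - _New derived variables_: Calculate `week`
    - _Pick desired columns_: `date`, `week`, `location_key`, `new_hospitalized_patients` (the other candidates — `new_intensive_care_patients`, `new_ventilator_patients`, `current_hospitalized_patients`, `current_intensive_care_patients`, `current_ventilator_patients` — are not kept by the code below; those whose counts are shown in the table summary are empty or mostly empty)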
 - **vaccinations**
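    - _Impute missing values_: Impute `new_persons_fully_vaccinated`. Assume that these missings are 0.
    - _New derived variables_: Calculate `week`
    - _Pick desired columns_: `date`, `week`, `location_key`, `new_persons_fully_vaccinated` (`new_persons_vaccinated` and `cumulative_persons_vaccinated` are entirely empty in this extract, so they are not kept)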
 - **index**
    - _Pick desired columns_: `location_key`, `country_name`

#### Table: `epidemiology`

In [9]:
# Load the raw epidemiology archive, parsing `date` into datetimes at read time.
key = "epidemiology"
data = pd.read_csv(
    filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip",
    parse_dates=["date"],
)
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3161033 entries, 0 to 3161032
Data columns (total 10 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   date                  3161033 non-null  datetime64[ns]
 1   location_key          3161033 non-null  object        
 2   new_confirmed         3157609 non-null  float64       
 3   new_deceased          2730137 non-null  float64       
 4   new_recovered         298634 non-null   float64       
 5   new_tested            54782 non-null    float64       
 6   cumulative_confirmed  3161033 non-null  float64       
 7   cumulative_deceased   2733220 non-null  float64       
 8   cumulative_recovered  298634 non-null   float64       
 9   cumulative_tested     54840 non-null    float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 241.2+ MB


In [10]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,date,location_key,new_confirmed,new_deceased,new_recovered,new_tested,cumulative_confirmed,cumulative_deceased,cumulative_recovered,cumulative_tested
0,2020-03-15,DE_BB_12051,2.0,0.0,2.0,,2.0,0.0,2.0,
1,2020-03-17,DE_BB_12051,1.0,0.0,1.0,,3.0,0.0,3.0,
2,2020-03-19,DE_BB_12051,2.0,0.0,2.0,,5.0,0.0,5.0,
3,2020-03-20,DE_BB_12051,1.0,0.0,1.0,,6.0,0.0,6.0,
4,2020-03-22,DE_BB_12051,2.0,0.0,2.0,,8.0,0.0,8.0,


##### _Impute missing values_: Impute `new_confirmed` and `new_deceased`

In [11]:
# Missing daily counts are assumed to be zero (see the plan above);
# only these two columns are filled, every other column keeps its NaNs.
data = data.fillna(value=dict.fromkeys(("new_confirmed", "new_deceased"), 0))

##### _New derived variables_: Calculate `week`, `new_deceased_confirmed_ratio`

In [12]:
# Weekly period containing each observation's date.
data["week"] = pd.DatetimeIndex(data["date"]).to_period("W")

# Deaths per confirmed case for the same row. A row with zero confirmed cases
# has no defined ratio: masking the zero denominator yields NaN there instead
# of +/-inf (non-zero deaths divided by 0), which would otherwise distort any
# later sum or mean over this column.
data["new_deceased_confirmed_ratio"] = (
    data["new_deceased"] / data["new_confirmed"].mask(data["new_confirmed"] == 0)
)

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_confirmed`, `new_deceased`, `new_deceased_confirmed_ratio`

In [13]:
# Keep only the selected columns, in this order.
data = data.loc[
    :,
    [
        "date",
        "week",
        "location_key",
        "new_confirmed",
        "new_deceased",
        "new_deceased_confirmed_ratio",
    ],
]

##### _Check data and save_

In [14]:
# Final check before saving: column dtypes and explicit non-null counts.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3161033 entries, 0 to 3161032
Data columns (total 6 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   date                          3161033 non-null  datetime64[ns]
 1   week                          3161033 non-null  period[W-SUN] 
 2   location_key                  3161033 non-null  object        
 3   new_confirmed                 3161033 non-null  float64       
 4   new_deceased                  3161033 non-null  float64       
 5   new_deceased_confirmed_ratio  2197437 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1), period[W-SUN](1)
memory usage: 144.7+ MB


In [15]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `demographics`

In [16]:
# Load the raw demographics archive (no date column to parse here).
key = "demographics"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5097 entries, 0 to 5096
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   location_key                 5097 non-null   object 
 1   population                   5097 non-null   float64
 2   population_male              3743 non-null   float64
 3   population_female            3743 non-null   float64
 4   population_rural             0 non-null      float64
 5   population_urban             0 non-null      float64
 6   population_largest_city      0 non-null      float64
 7   population_clustered         0 non-null      float64
 8   population_density           507 non-null    float64
 9   human_development_index      0 non-null      float64
 10  population_age_00_09         3743 non-null   float64
 11  population_age_10_19         3743 non-null   float64
 12  population_age_20_29         3743 non-null   float64
 13  population_age_30_

##### _Pick desired columns_: `location_key`, `population`, `population_age_*`

In [17]:
# Column filter by regex: keeps `location_key`, the plain `population` total
# (the `$` anchor excludes names such as `population_male` or
# `population_density`) and every `population_age_*` bucket.
data = data.filter(
    regex=r"(location_key|population$|population_age)",
    axis="columns",
)

##### _Check table and save_

In [18]:
# Final check before saving: column dtypes and explicit non-null counts.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5097 entries, 0 to 5096
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   location_key                 5097 non-null   object 
 1   population                   5097 non-null   float64
 2   population_age_00_09         3743 non-null   float64
 3   population_age_10_19         3743 non-null   float64
 4   population_age_20_29         3743 non-null   float64
 5   population_age_30_39         3743 non-null   float64
 6   population_age_40_49         3743 non-null   float64
 7   population_age_50_59         3743 non-null   float64
 8   population_age_60_69         3743 non-null   float64
 9   population_age_70_79         3743 non-null   float64
 10  population_age_80_and_older  3743 non-null   float64
dtypes: float64(10), object(1)
memory usage: 438.1+ KB


In [19]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `health`

In [20]:
# Load the raw health archive (no date column to parse here).
key = "health"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   location_key                          3022 non-null   object 
 1   life_expectancy                       3022 non-null   float64
 2   smoking_prevalence                    0 non-null      float64
 3   diabetes_prevalence                   0 non-null      float64
 4   infant_mortality_rate                 0 non-null      float64
 5   adult_male_mortality_rate             0 non-null      float64
 6   adult_female_mortality_rate           0 non-null      float64
 7   pollution_mortality_rate              0 non-null      float64
 8   comorbidity_mortality_rate            0 non-null      float64
 9   hospital_beds_per_1000                0 non-null      float64
 10  nurses_per_1000                       0 non-null      float64
 11  physicians_per_10

##### _Pick desired columns_: `location_key`, `life_expectancy`

In [24]:
# Keep only the location key and its life expectancy.
data = data.loc[:, ["location_key", "life_expectancy"]]

##### _Check table and save_

In [25]:
# Final check before saving: column dtypes and explicit non-null counts.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location_key     3022 non-null   object 
 1   life_expectancy  3022 non-null   float64
dtypes: float64(1), object(1)
memory usage: 47.3+ KB


In [26]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `hospitalizations`

In [27]:
# Load the raw hospitalizations archive, parsing `date` into datetimes at read time.
key = "hospitalizations"
data = pd.read_csv(
    filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip",
    parse_dates=["date"],
)
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6297 entries, 0 to 6296
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   date                                6297 non-null   datetime64[ns]
 1   location_key                        6297 non-null   object        
 2   new_hospitalized_patients           5418 non-null   float64       
 3   cumulative_hospitalized_patients    5418 non-null   float64       
 4   current_hospitalized_patients       879 non-null    float64       
 5   new_intensive_care_patients         0 non-null      float64       
 6   cumulative_intensive_care_patients  0 non-null      float64       
 7   current_intensive_care_patients     879 non-null    float64       
 8   new_ventilator_patients             0 non-null      float64       
 9   cumulative_ventilator_patients      0 non-null      float64       
 10  current_ventilator_patie

##### _Impute missing values_: Impute `new_hospitalized_patients`

In [28]:
# Fill missing new-hospitalization counts with zero; other columns are left untouched.
data = data.fillna(value={"new_hospitalized_patients": 0})

##### _New derived variables_: Calculate `week`

In [29]:
# Append a `week` column holding the weekly period that contains each date.
data = data.assign(week=pd.DatetimeIndex(data["date"]).to_period("W"))

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_hospitalized_patients`

In [30]:
# Keep only the selected columns, in this order.
data = data.loc[:, ["date", "week", "location_key", "new_hospitalized_patients"]]

##### _Check table and save_

In [31]:
# Final check before saving: column dtypes and explicit non-null counts.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6297 entries, 0 to 6296
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date                       6297 non-null   datetime64[ns]
 1   week                       6297 non-null   period[W-SUN] 
 2   location_key               6297 non-null   object        
 3   new_hospitalized_patients  6297 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1), period[W-SUN](1)
memory usage: 196.9+ KB


In [32]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `vaccinations`

In [33]:
# Load the raw vaccinations archive, parsing `date` into datetimes at read time.
key = "vaccinations"
data = pd.read_csv(
    filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip",
    parse_dates=["date"],
)
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1562414 entries, 0 to 1562413
Data columns (total 32 columns):
 #   Column                                         Non-Null Count    Dtype         
---  ------                                         --------------    -----         
 0   date                                           1562414 non-null  datetime64[ns]
 1   location_key                                   1562414 non-null  object        
 2   new_persons_vaccinated                         0 non-null        float64       
 3   cumulative_persons_vaccinated                  0 non-null        float64       
 4   new_persons_fully_vaccinated                   1559194 non-null  float64       
 5   cumulative_persons_fully_vaccinated            1562414 non-null  float64       
 6   new_vaccine_doses_administered                 0 non-null        float64       
 7   cumulative_vaccine_doses_administered          0 non-null        float64       
 8   new_persons_vaccinated_pfizer   

##### _Impute missing values_: Impute `new_persons_fully_vaccinated`

In [35]:
# Fill missing fully-vaccinated counts with zero; other columns are left untouched.
data = data.fillna(value={"new_persons_fully_vaccinated": 0})

##### _New derived variables_: Calculate `week`

In [36]:
# Weekly period containing each observation's date, stored as a new `week` column.
data["week"] = pd.DatetimeIndex(data["date"]).to_period(freq="W")

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_persons_fully_vaccinated`

In [37]:
# Keep only the selected columns, in this order.
data = data.loc[:, ["date", "week", "location_key", "new_persons_fully_vaccinated"]]

##### _Check table and save_

In [None]:
# Final check before saving. `show_counts=True` matches the other check cells
# and forces the non-null counts to be printed regardless of pandas' display
# options, which can suppress them for large frames.
data.info(show_counts=True)

In [38]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `index`

In [39]:
# Load the raw index archive (no date column to parse here).
key = "index"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
# Print dtypes together with explicit non-null counts for every column.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5121 entries, 0 to 5120
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   location_key        5121 non-null   object
 1   place_id            5080 non-null   object
 2   wikidata_id         5098 non-null   object
 3   datacommons_id      3329 non-null   object
 4   country_code        5121 non-null   object
 5   country_name        5121 non-null   object
 6   subregion1_code     5121 non-null   object
 7   subregion1_name     5121 non-null   object
 8   subregion2_code     5109 non-null   object
 9   subregion2_name     5109 non-null   object
 10  locality_code       12 non-null     object
 11  locality_name       12 non-null     object
 12  iso_3166_1_alpha_2  5121 non-null   object
 13  iso_3166_1_alpha_3  5121 non-null   object
 14  aggregation_level   5121 non-null   int64 
dtypes: int64(1), object(14)
memory usage: 600.2+ KB


##### _Pick desired columns_: `location_key`, `country_name`

In [40]:
# Keep only the location key and the country it belongs to.
data = data.loc[:, ["location_key", "country_name"]]

##### _Check table and save_

In [41]:
# Final check before saving. `show_counts=True` keeps this cell consistent
# with the other check cells and makes the non-null counts explicit.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5121 entries, 0 to 5120
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   location_key  5121 non-null   object
 1   country_name  5121 non-null   object
dtypes: object(2)
memory usage: 80.1+ KB


In [42]:
# Write the preprocessed table to the output folder, without the row index.
data.to_csv(path_or_buf=f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

In [43]:
# Display the saved index table; the rendered output elides the middle rows.
data

Unnamed: 0,location_key,country_name
0,DE_BB_12051,Germany
1,DE_BB_12052,Germany
2,DE_BB_12053,Germany
3,DE_BB_12054,Germany
4,DE_BB_12060,Germany
...,...,...
5116,US_WY_56037,United States of America
5117,US_WY_56039,United States of America
5118,US_WY_56041,United States of America
5119,US_WY_56043,United States of America
