In [45]:
import os

import pandas as pd

In [46]:
INPUT_FOLDER = "raw"
OUTPUT_FOLDER = "preproc"

In [49]:
os.makedirs("../data/preproc", exist_ok=True)

In [50]:
datasources = {source.replace(".zip",""):source for source in os.listdir(f"../data/{INPUT_FOLDER}") if source.endswith(".zip")}
datasources

{'demographics': 'demographics.zip',
 'epidemiology': 'epidemiology.zip',
 'health': 'health.zip',
 'hospitalizations': 'hospitalizations.zip',
 'index': 'index.zip',
 'vaccinations': 'vaccinations.zip'}

## PREPROC

This part is a very important step in the process. In the preprocessing we're filtering tables individually, cretaing direct derived variables, remove bad rows, remove usudes columns and changing bad values, among others.

In this case, we're going to perform all of this operations
 - **epidemiology**
    - _Impute missing values_: Impute `new_confirmed` and `new_deceased`. Assume that these missings are 0.
    - _New derived variables_: Calculate `week` and `new_deceased_confirmed_ratio`
    - _Pick desired columns_: `date`, `location_key`, `new_confirmed`, `new_deceased`, `new_deceased_confirmed_ratio`
 - **demographics**
    - _Pick desired columns_: `location_key`, `population`, `population_age_*`
 - **health**
    - _Pick desired columns_: `location_key`, `life_expectancy`
 - **hospitalizations**
    - _New derived variables_: Calculate `week`
    - _Pick desired columns_: `date`, `week`, `location_key`, `new_hospitalized_patients`, `new_intensive_care_patients`, `new_ventilator_patients`, `current_hospitalized_patients`,`current_intensive_care_patients`, `current_ventilator_patients`
 - **vaccinations**
    - _New derived variables_: Calculate `week`
    - _Pick desired columns_: `date`, `week`, `location_key`, `new_persons_vaccinated`, `cumulative_persons_vaccinated`
 - **index**
    - _Pick desired columns_: `location_key`, `country_name`

#### Table: `epidemiology`

In [51]:
key = "epidemiology"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip", parse_dates=["date"])
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3161033 entries, 0 to 3161032
Data columns (total 10 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   date                  3161033 non-null  datetime64[ns]
 1   location_key          3161033 non-null  object        
 2   new_confirmed         3157609 non-null  float64       
 3   new_deceased          2730137 non-null  float64       
 4   new_recovered         298634 non-null   float64       
 5   new_tested            54782 non-null    float64       
 6   cumulative_confirmed  3161033 non-null  float64       
 7   cumulative_deceased   2733220 non-null  float64       
 8   cumulative_recovered  298634 non-null   float64       
 9   cumulative_tested     54840 non-null    float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 241.2+ MB


In [52]:
data.head()

Unnamed: 0,date,location_key,new_confirmed,new_deceased,new_recovered,new_tested,cumulative_confirmed,cumulative_deceased,cumulative_recovered,cumulative_tested
0,2020-03-15,DE_BB_12051,2.0,0.0,2.0,,2.0,0.0,2.0,
1,2020-03-17,DE_BB_12051,1.0,0.0,1.0,,3.0,0.0,3.0,
2,2020-03-19,DE_BB_12051,2.0,0.0,2.0,,5.0,0.0,5.0,
3,2020-03-20,DE_BB_12051,1.0,0.0,1.0,,6.0,0.0,6.0,
4,2020-03-22,DE_BB_12051,2.0,0.0,2.0,,8.0,0.0,8.0,


##### _Impute missing values_: Impute `new_confirmed` and `new_deceased`

In [53]:
data = data.fillna({
    "new_confirmed": 0,
    "new_deceased": 0
})

##### _New derived variables_: Calculate `week`, `new_deceased_confirmed_ratio`

In [54]:
data["week"] = pd.DatetimeIndex(data.date).to_period("W")
data["new_deceased_confirmed_ratio"] = data["new_deceased"] / data["new_confirmed"]

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_confirmed`, `new_deceased`, `new_deceased_confirmed_ratio`

In [55]:
data = data[["date", "week", "location_key", "new_confirmed", "new_deceased", "new_deceased_confirmed_ratio"]]

##### _Check data and save_

In [56]:
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3161033 entries, 0 to 3161032
Data columns (total 6 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   date                          3161033 non-null  datetime64[ns]
 1   week                          3161033 non-null  period[W-SUN] 
 2   location_key                  3161033 non-null  object        
 3   new_confirmed                 3161033 non-null  float64       
 4   new_deceased                  3161033 non-null  float64       
 5   new_deceased_confirmed_ratio  2197437 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1), period[W-SUN](1)
memory usage: 144.7+ MB


In [57]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `demographics`

In [None]:
key = "demographics"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### _Pick desired columns_: `location_key`, `population`, `population_age_*`

In [None]:
data = data.filter(regex=r"(location_key|population$|population_age)")

##### _Check table and save_

In [None]:
data.info(show_counts=True)

In [None]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### Table: `health`

In [None]:
key = "health"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### _Pick desired columns_: `location_key`, `life_expectancy`

In [None]:
data = data[["location_key","life_expectancy"]]

##### _Check table and save_

In [None]:
data.info(show_counts=True)

In [None]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### `hospitalizations`

In [None]:
key = "hospitalizations"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip", parse_dates=["date"])
data.info(show_counts=True)

##### _Impute missing values_: Impute `new_confirmed` and `new_deceased`

In [None]:
data = data.fillna({
    "new_hospitalized_patients": 0
})

##### _New derived variables_: Calculate `week`

In [None]:
data["week"] = pd.DatetimeIndex(data.date).to_period("W")

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_hospitalized_patients`

In [None]:
data = data[["date", "week", "location_key", "new_hospitalized_patients"]]

##### _Check table and save_

In [None]:
data.info(show_counts=True)

In [None]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### `vaccinations`

In [None]:
key = "vaccinations"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip", parse_dates=["date"])
data.info(show_counts=True)

##### _Impute missing values_: Impute `new_persons_fully_vaccinated`

In [None]:
data = data.fillna({
    "new_persons_fully_vaccinated": 0,
})

##### _New derived variables_: Calculate `week`

In [None]:
data["week"] = pd.DatetimeIndex(data.date).to_period("W")

##### _Pick desired columns_: `date`, `week`, `location_key`, `new_persons_fully_vaccinated`

In [None]:
data = data[["date", "week", "location_key", "new_persons_fully_vaccinated"]]

##### _Check table and save_

In [None]:
data.info()

In [None]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)

#### `index`

In [None]:
key = "index"
data = pd.read_csv(f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### Pick just the country name

In [None]:
data = data[["location_key","country_name"]]

##### _Check table and save_

In [None]:
data.info()

In [None]:
data.to_csv(f"../data/{OUTPUT_FOLDER}/{key}.zip", index=False)