In [81]:
import os

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport

In [82]:
# Sub-folders of ../data: tables are read from INPUT_FOLDER and the
# aggregated results are written to OUTPUT_FOLDER.
INPUT_FOLDER: str = "enrich"
OUTPUT_FOLDER: str = "aggregate"

In [83]:
# Create the output directory up front. The path is derived from OUTPUT_FOLDER
# so it always matches the location the tables are saved to later on.
os.makedirs(f"../data/{OUTPUT_FOLDER}", exist_ok=True)

In [84]:
# Map each dataset name (archive file name without its extension) to the
# archive file name, for every .zip found in the input folder.
# splitext removes only the trailing extension; str.replace would also strip
# a ".zip" occurring in the middle of a file name.
datasources = {
    os.path.splitext(source)[0]: source
    for source in os.listdir(f"../data/{INPUT_FOLDER}")
    if source.endswith(".zip")
}
# Empty container; nothing in the visible part of this notebook fills it.
datasets = {}

## AGGREGATE

### Create BASE TABLE

The base table is just a table with all the primary keys we want to maintain in the macro table. In this case we want to build a table indexed by `week` and `country_name`.
 - `week` - We will take the `epidemiology` table to get the start and end dates.
 - `country_name` - We get the `country_name` from the `index` table.
 
So, in this case, we have to build an index with all dates and all country names.

<div class="alert alert-warning">
    <b>Special mention to datetime indexes</b>: Be careful when building datetime indexes from some of the provided data tables, as they may contain gaps in the timestamps. My recommendation is to build the date ranges yourself; that way you can spot missing dates and fix them.
</div>

#### Get date ranges

In [None]:
# Read the epidemiology table with `date` parsed as datetime, then report the
# first and last dates it contains.
key = "epidemiology"
data = pd.read_csv(
    filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip",
    parse_dates=["date"],
)

start_date, end_date = data["date"].min(), data["date"].max()
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

In [None]:
# Build one label per calendar week between start_date and end_date.
# period_range includes the weeks that contain both endpoints, whereas
# date_range(freq="W") only emits the Sundays inside the range and therefore
# drops the trailing partial week whenever end_date is not a Sunday.
# The labels keep the same "<week start>/<week end>" string format as before.
dates = pd.period_range(start=start_date, end=end_date, freq="W").astype(str)
dates = pd.DataFrame(dates)

#### Get country names

In [None]:
# Read the `index` table from the input folder.
key = "index"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")

In [None]:
# One-column frame holding each distinct country name once.
countries = pd.DataFrame(data=data["country_name"].unique())

In [None]:
# Display the weekly labels built earlier for a visual check.
dates

#### Build the index

In [None]:
# Cartesian product of weeks and countries, used as the (week, country_name)
# index of the base table.
index_columns = ["week", "country_name"]
base = dates.merge(countries, how="cross")
base.columns = index_columns
base = base.set_index(index_columns)

In [None]:
# Show the first rows of the base table.
base.head()

In [None]:
# Show the (rows, columns) dimensions of the base table.
base.shape

In [None]:
# Start the macro table from an independent copy of the base index.
macro = base.copy(deep=True)

### Create MACRO TABLE

#### Incorporate `epidemiology`

In [None]:
# Read the epidemiology table and print its column summary with non-null counts.
key = "epidemiology"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### _Aggregate_: (sum) `new_confirmed`, `new_deceased`; (mean) `new_deceased_confirmed_ratio`

In [None]:
# Weekly per-country totals of cases and deaths, plus the weekly mean of the
# deceased/confirmed ratio.
aux = data.groupby(["week", "country_name"]).agg(
    new_confirmed=("new_confirmed", "sum"),
    new_deceased=("new_deceased", "sum"),
    new_deceased_confirmed_ratio=("new_deceased_confirmed_ratio", "mean"),
)

##### _Include in the base table_

In [None]:
# Left-join the epidemiology aggregates onto the (week, country_name) index.
macro = macro.join(aux, how="left")

In [None]:
# Print the macro table summary after joining the epidemiology aggregates.
macro.info()

#### `demographics`

In [None]:
# Read the demographics table and print its column summary.
key = "demographics"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
data.info()

##### _Aggregate_: (sum)

In [None]:
# Per-country totals of the numeric demographic columns.
# numeric_only=True keeps the result stable across pandas versions: from
# pandas 2.0 a bare .sum() would also "sum" (concatenate) string columns and
# carry them into the macro table.
aux = data.groupby("country_name").sum(numeric_only=True)

In [None]:
# Attach the per-country demographic totals to every week of that country.
macro = macro.join(other=aux, on="country_name", how="left")

In [None]:
# Print the macro table summary after joining the demographics aggregates.
macro.info()

#### `health`

In [None]:
# Read the health table and print its column summary.
key = "health"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
data.info()

##### _Aggregate_: (mean)

In [None]:
# Per-country mean life expectancy, kept as a one-column DataFrame.
aux = data.groupby(["country_name"])[["life_expectancy"]].mean()

##### _Include in the base table_

In [None]:
# Attach the per-country life expectancy to every week of that country.
macro = macro.join(other=aux, on="country_name", how="left")

In [None]:
# Print the macro table summary after joining the health aggregates.
macro.info()

#### `hospitalizations`

In [None]:
# Read the hospitalizations table and print its column summary with non-null counts.
key = "hospitalizations"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### _Aggregate_: (sum)

In [None]:
# Weekly per-country totals of the numeric hospitalization columns.
# numeric_only=True prevents pandas >= 2.0 from concatenating string columns
# into the aggregate that is joined onto the macro table.
aux = data.groupby(["week", "country_name"]).sum(numeric_only=True)

In [None]:
# Left-join the hospitalization aggregates onto the (week, country_name) index.
macro = macro.join(aux, how="left")

In [None]:
# Print the macro table summary after joining the hospitalization aggregates.
macro.info()

#### `vaccinations`

In [None]:
# Read the vaccinations table and print its column summary with non-null counts.
key = "vaccinations"
data = pd.read_csv(filepath_or_buffer=f"../data/{INPUT_FOLDER}/{key}.zip")
data.info(show_counts=True)

##### _Aggregate_: (sum)

In [None]:
# Weekly per-country total of newly fully vaccinated persons.
# The former `sort_index(level="date")` call was dropped: the frame comes
# straight from read_csv, so it has no "date" index level to sort on, and a
# sum does not depend on row order anyway.
# The aggregation is named with the string "sum" (as in the other cells)
# because passing the Python builtin is deprecated in recent pandas.
aux = data.groupby(["week", "country_name"]).agg({
    "new_persons_fully_vaccinated": "sum",
})

In [None]:
# Left-join the vaccination aggregates onto the (week, country_name) index.
macro = macro.join(aux, how="left")

In [None]:
# Print the macro table summary after joining the vaccination aggregates.
macro.info()

### Save `macrotable`

In [None]:
# Write the macro table, including its (week, country_name) index, to the
# output folder.
macro.to_csv(
    path_or_buf=f"../data/{OUTPUT_FOLDER}/macrotable.zip",
    index=True,
)

In [None]:
# Generate a minimal profiling report of the macro table as HTML.
# The target directory is created first; nothing in the visible part of this
# notebook creates it, so to_file could otherwise fail on a fresh checkout.
os.makedirs("../profiling", exist_ok=True)
profile = ProfileReport(macro, minimal=True)
profile.to_file("../profiling/macro.html")

In [None]:
# Print the macro table summary once more, as input to the missing-values review.
macro.info()

### Final missing values review

At this point, some missing values may have arisen from the aggregations. This is the time to decide what to do with them in order to build a clean table for EDA.

In this case:
 - `new_confirmed` and `new_deceased`: These values are missing because some countries' records start later than those of the country with the earliest record.
    - **Action: Remove all missing rows**
 - `new_deceased_confirmed_ratio`: This value doesn't have missing values, but infinite values due to dividing by 0.
    - **Action: Impute the value as 0**
 - `life_expectancy`: These missings are because this variable just applies to the United States. For those missings related to the United States, impute values to the mean
    - **Action: Impute values to the mean, but just for the United States**
 - `new_hospitalized_patients`: These missings are because this variable just applies to the United States. For those missings related to the United States, impute values to the mean
    - **Action: Impute values to the mean, but just for the United States**
 - `new_persons_fully_vaccinated`: These missings are because this variable just applies to the United States. For those missings related to the United States, impute values to 0 (there was no vaccination.) 
    - **Action: Impute values to 0, but just for the United States**

In [None]:
# Work on an independent copy so the raw macro table stays untouched.
clean_macro = macro.copy(deep=True)

Removing rows with missing `new_confirmed`

In [None]:
# Keep only the rows that have a value for new_confirmed.
clean_macro = clean_macro.loc[clean_macro["new_confirmed"].notna()]

Imputing `new_deceased_confirmed_ratio`

In [None]:
# Impute new_deceased_confirmed_ratio with 0 wherever it is undefined.
# The review notes above describe infinities from divisions by zero; fillna
# alone only touches NaN, so +/-inf are first converted to NaN and then
# everything undefined is filled with 0.
clean_macro = clean_macro.assign(
    new_deceased_confirmed_ratio=(
        clean_macro["new_deceased_confirmed_ratio"]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
)

Imputing `life_expectancy` and `new_hospitalized_patients`

In [None]:
# Select the two columns to impute, restricted to the United States rows
# (matched on the country_name index level), and preview them.
us_rows = clean_macro.index.get_level_values("country_name") == "United States of America"
us_missings = clean_macro.loc[us_rows, ["life_expectancy", "new_hospitalized_patients"]]
us_missings.head(5)

In [None]:
# Collapse the US rows to a single row of column means, indexed by country_name.
us_missings = us_missings.groupby(level="country_name").agg("mean")
us_missings

In [None]:
# Fill remaining NaNs in life_expectancy / new_hospitalized_patients from the
# US means held in us_missings. Passing a DataFrame to fillna fills by matching
# labels, so only rows and columns present in us_missings can be affected.
# NOTE(review): us_missings is indexed by country_name only, while clean_macro
# is indexed by (week, country_name); this relies on pandas aligning the two on
# the shared level name - confirm on the pandas version in use.
clean_macro = clean_macro.fillna(us_missings)

Imputing `new_persons_fully_vaccinated`

In [None]:
# Set missing new_persons_fully_vaccinated to 0, but only for United States rows.
is_us = clean_macro.index.get_level_values("country_name") == "United States of America"
vaccination_missing = clean_macro["new_persons_fully_vaccinated"].isna()
clean_macro.loc[is_us & vaccination_missing, "new_persons_fully_vaccinated"] = 0

In [None]:
# Print the cleaned table summary to verify the remaining non-null counts.
clean_macro.info()

### Save cleaned macrotable

In [None]:
# Write the cleaned macro table, including its index, to the output folder.
clean_macro.to_csv(
    path_or_buf=f"../data/{OUTPUT_FOLDER}/macrotable_c.zip",
    index=True,
)