# Create final dataset for DLL COVID-19 competition

## Functions for preparing data

In [1]:
import os
from pathlib import Path
import urllib.request

import pandas as pd
import numpy as np
import io
import requests


def download_world_bank_indicator(indicator_name: str, directory: str = "."):
    """Download one World Bank indicator as a CSV file.

    The file is fetched from the World Bank API and stored as
    ``<directory>/<indicator_name>.csv``. The target directory is created
    if it does not exist yet; an existing file is overwritten.

    Args:
        indicator_name: World Bank indicator code, e.g. ``"NY.GDP.MKTP.CD"``.
        directory: Folder in which the CSV file is saved.
    """
    target_dir = Path(directory)
    os.makedirs(target_dir, exist_ok=True)

    destination = target_dir.joinpath(indicator_name + ".csv")
    source_url = f"https://api.worldbank.org/indicator/{indicator_name}?format=csv"
    urllib.request.urlretrieve(source_url, destination)


def extract_series_of_newest_data(csv_path: str):
    """Return the most recent non-missing value for every row of a CSV file.

    The file is read with its first line as header and its first two columns
    as a two-level row index. Every remaining column is treated as one
    observation, ordered as in the file (for World Bank indicator files
    these are presumably the years - confirm against the downloaded data).
    For each row the value of the right-most non-missing column is kept.

    Args:
        csv_path: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        An unnamed ``pandas.Series`` indexed by the two-level index built
        from the first two CSV columns. Rows that contain no value at all
        are left out.
    """
    # After the transpose each original row is a column and the
    # observations run along the row axis.
    df = pd.read_csv(csv_path, index_col=[0, 1], header=0).T

    has_value = df.notna().to_numpy()
    keep = has_value.any(axis=0)
    if not keep.any():
        # Nothing to look up (no observations, or every entry is empty).
        return pd.Series([], index=df.columns[keep], dtype=float)

    kept_cols = np.flatnonzero(keep)
    # argmax over the row-reversed mask yields the distance of the last
    # valid observation from the bottom of each kept column.
    n_rows = has_value.shape[0]
    last_row = n_rows - 1 - has_value[::-1, kept_cols].argmax(axis=0)

    # Positional lookup replaces DataFrame.lookup, which was deprecated in
    # pandas 1.2 and removed in pandas 2.0.
    newest = df.to_numpy()[last_row, kept_cols]

    # The result stays unnamed so that reset_index() labels the value
    # column 0; infer_objects() restores a numeric dtype if the frame had
    # to be read as object dtype.
    return pd.Series(newest, index=df.columns[keep]).infer_objects()


def add_new_feature(
    main_df: pd.DataFrame, additional_feature: pd.Series, name: str
) -> pd.DataFrame:
    """Attach a per-country feature to every matching row of ``main_df``.

    Rows are matched by comparing the ``countryterritoryCode`` column of
    ``main_df`` with the ``Country Code`` index level of
    ``additional_feature``. Rows without a matching code get ``NaN``.

    Args:
        main_df: Frame with a ``countryterritoryCode`` column. It is not
            modified.
        additional_feature: Series whose index has a level named
            ``Country Code``. The series may be named or unnamed.
        name: Name of the column that receives the feature values.

    Returns:
        A new frame with ``countryterritoryCode`` as first column and the
        new column ``name`` appended.
    """
    tmp_df = main_df.set_index("countryterritoryCode")

    # Re-index the feature by country code only. Taking the level directly
    # avoids relying on reset_index() naming the value column 0, which only
    # holds for unnamed series.
    by_code = additional_feature.copy()
    by_code.index = additional_feature.index.get_level_values("Country Code")

    # Column assignment aligns the feature on the country-code index.
    tmp_df[name] = by_code
    return tmp_df.reset_index()

### Add new World Bank features here (https://data.worldbank.org/indicator?tab=all)

In [2]:
# Maps the column name a feature gets in the final dataset (key) to the
# World Bank indicator code that is downloaded for it (value).
indicator_names = {
    "GDP (current US$)": "NY.GDP.MKTP.CD",
    "GDP per capita (current US$)": "NY.GDP.PCAP.CD",
    "Access to electricity (% of population)": "EG.ELC.ACCS.ZS",
    "Current health expenditure per capita (current US$)": "SH.XPD.CHEX.PC.CD",
    "Current health expenditure (% of GDP)": "SH.XPD.CHEX.GD.ZS",
    "Hospital beds (per 1,000 people)": "SH.MED.BEDS.ZS",
}

### Data dir and output dataset path

In [3]:
# Directory in which the downloaded World Bank indicator CSV files are stored.
data_dir = Path("data")
# File the assembled dataset is written to.
dataset_output_path = "DLL_COVID_TRAIN.csv"

In [5]:
# Fetch the ECDC case-distribution CSV. The timeout keeps the notebook from
# hanging forever on a stalled connection.
response = requests.get(
    "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv",
    timeout=60,
)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
response.raise_for_status()

# NOTE(review): dateRep looks like a day-first date in the ECDC export; the
# parse relies on format inference - confirm, and consider dayfirst=True.
df = pd.read_csv(
    io.StringIO(response.content.decode("utf-8")),
    usecols=[
        "dateRep",
        "cases",
        "deaths",
        "countriesAndTerritories",
        "popData2018",
        "countryterritoryCode",
    ],
    parse_dates=["dateRep"],
    infer_datetime_format=True,
)

# Download every configured indicator and append its newest value per
# country as a new column named after the dictionary key.
for feature_label, indicator_code in indicator_names.items():
    download_world_bank_indicator(indicator_code, directory=data_dir)
    csv_path = data_dir / (indicator_code + ".csv")
    new_feature = extract_series_of_newest_data(csv_path)
    df = add_new_feature(df, new_feature, feature_label)

# Rows without a country code cannot be matched to any indicator; drop them
# before writing the final dataset.
df = df.dropna(subset=["countryterritoryCode"])
df.to_csv(dataset_output_path, index=False)

In [6]:
df

Unnamed: 0,countryterritoryCode,dateRep,cases,deaths,countriesAndTerritories,popData2018,GDP (current US$),GDP per capita (current US$),Access to electricity (% of population),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),"Hospital beds (per 1,000 people)"
0,AFG,2020-04-17,10,4,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.12265,11.777194,0.5
1,AFG,2020-04-16,70,2,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.12265,11.777194,0.5
2,AFG,2020-04-15,49,2,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.12265,11.777194,0.5
3,AFG,2020-04-14,58,3,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.12265,11.777194,0.5
4,AFG,2020-04-13,52,0,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.12265,11.777194,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
11353,ZWE,2020-03-25,0,0,Zimbabwe,14439018.0,3.100052e+10,2146.996385,40.421368,110.14962,6.635916,1.7
11354,ZWE,2020-03-24,0,1,Zimbabwe,14439018.0,3.100052e+10,2146.996385,40.421368,110.14962,6.635916,1.7
11355,ZWE,2020-03-23,0,0,Zimbabwe,14439018.0,3.100052e+10,2146.996385,40.421368,110.14962,6.635916,1.7
11356,ZWE,2020-03-22,1,0,Zimbabwe,14439018.0,3.100052e+10,2146.996385,40.421368,110.14962,6.635916,1.7
