In [1]:
# libraries 
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# root folder holding one sub-folder per country
data_dir = "../../basins/"

# countries whose epidemic data is exported (names double as folder names)
countries = [
    "Sri Lanka",
    "El Salvador",
    "Morocco",
    "Bolivia",
    "Honduras",
    "Philippines",
    "Indonesia",
    "Pakistan",
    "Rwanda",
    "Bangladesh",
    "Kyrgyzstan",
    "Egypt",
    "Mozambique",
    "Afghanistan",
    "Angola",
    "Ghana",
    "Zambia",
    "Côte d'Ivoire",
    "Kenya",
    "Uganda",
]


def parse_date(date_str):
    """
    Convert a slash-separated date string into a datetime.
    :param date_str: date string laid out as mm/dd/yy (the two-digit year is offset by 2000)
    :return: datetime for that day (time fields left at their defaults)
    """
    fields = date_str.split("/")
    year = 2000 + int(fields[2])
    month = int(fields[0])
    day = int(fields[1])
    return datetime(year=year, month=month, day=day)


def get_epi_data(country, df, date_columns=None):
    """
    This function returns epidemic data for a given country (cases or deaths)
    :param country: country name
    :param df: global epi data DataFrame; must have a 'Country/Region' column
               plus one column per date, labelled mm/dd/yy
    :param date_columns: optional list of date column labels to use; when None,
                         every column of df that is not one of 'Province/State',
                         'Country/Region', 'Lat', 'Long' is treated as a date
    :return: DataFrame with columns 'date', 'cumulative' and 'daily'; 'daily' is
             the day-to-day difference of 'cumulative', so its first entry is NaN
    :raises IndexError: if df has no row for the requested country
    """
    # names that are spelled differently in the 'Country/Region' column
    jhu_names = {"Myanmar": "Burma", "Côte d'Ivoire": "Cote d'Ivoire"}
    country_jhu = jhu_names.get(country, country)

    # derive the date columns from df itself rather than from a notebook global,
    # so the function does not depend on cell execution order
    if date_columns is None:
        metadata_cols = {'Province/State', 'Country/Region', 'Lat', 'Long'}
        date_columns = [c for c in df.columns if c not in metadata_cols]
    else:
        date_columns = list(date_columns)

    rows = df.loc[df['Country/Region'] == country_jhu]
    if rows.empty:
        # same exception type as the former `.values[0]` failure, with a usable message
        raise IndexError(
            "no rows found for country {!r} (looked up as {!r})".format(country, country_jhu)
        )

    # a country can occupy several rows (one per 'Province/State'); add them up
    # instead of silently keeping only the first row. min_count=1 keeps a date
    # NaN when no row has a value for it (identical to the old result for one row).
    cumulative = rows[date_columns].sum(axis=0, min_count=1).values
    dates = [parse_date(c) for c in date_columns]

    df_country = pd.DataFrame(data={"date": dates, "cumulative": cumulative})
    df_country["daily"] = df_country.cumulative.diff()
    return df_country

In [2]:
# download the global time-series tables (deaths and confirmed cases)
url_deaths = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
url_cases = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

deaths = pd.read_csv(url_deaths)
cases = pd.read_csv(url_cases)

# datetime columns: everything in the deaths table that is not location metadata
location_cols = ('Province/State', 'Country/Region', 'Lat', 'Long')
date_cols = [col for col in deaths.columns if col not in location_cols]

In [3]:
# write one deaths file and one cases file into each country's folder
for country in countries:
    print(country)
    country_dir = os.path.join(data_dir, country)

    deaths_country = get_epi_data(country, deaths)
    deaths_path = os.path.join(country_dir, "epidemic-data/deaths.csv")
    deaths_country.to_csv(deaths_path, index=False)

    cases_country = get_epi_data(country, cases)
    cases_path = os.path.join(country_dir, "epidemic-data/cases.csv")
    cases_country.to_csv(cases_path, index=False)

Kyrgyzstan


In [31]:
# correct errors
#
# The per-country deaths files are patched in place. The work is done inside
# helpers with local variables so the global `deaths` table loaded from the
# time-series URL is not overwritten, and the cell can be re-run safely
# (a file with nothing left to correct is simply rewritten unchanged).


def _replace_from_earlier_day(daily_df, is_anomalous, lag_days):
    """
    Replace anomalous 'daily' values with the value recorded `lag_days` earlier.
    :param daily_df: DataFrame with 'date' and 'daily' columns
    :param is_anomalous: callable taking the 'daily' Series and returning a boolean mask
    :param lag_days: number of days to look back for the replacement value
    :return: corrected copy of daily_df, with 'date' converted to datetime
    """
    fixed = daily_df.copy()
    fixed["date"] = pd.to_datetime(fixed["date"])

    # chronological order: a later correction can then draw on an already corrected day
    anomalous_dates = fixed.loc[is_anomalous(fixed["daily"]), "date"].sort_values()
    for anomalous_date in anomalous_dates:
        donor = fixed.loc[fixed["date"] == anomalous_date - timedelta(days=lag_days), "daily"]
        if donor.empty:
            # nothing to copy from (anomaly too close to the start of the series)
            print("no value {} day(s) before {}; left unchanged".format(lag_days, anomalous_date.date()))
            continue
        fixed.loc[fixed["date"] == anomalous_date, "daily"] = donor.values[0]
    return fixed


def _fix_deaths_file(country, is_anomalous, lag_days):
    """
    Apply _replace_from_earlier_day to a country's deaths.csv and save it back.
    :param country: country name (folder name under data_dir)
    :param is_anomalous: callable taking the 'daily' Series and returning a boolean mask
    :param lag_days: number of days to look back for the replacement value
    :return: None
    """
    path = os.path.join(data_dir, country, "epidemic-data/deaths.csv")
    if not os.path.isfile(path):
        # some countries listed below are not exported by this notebook;
        # report and move on instead of aborting the remaining corrections
        print("{}: {} not found, skipped".format(country, path))
        return
    deaths_country = pd.read_csv(path)
    _replace_from_earlier_day(deaths_country, is_anomalous, lag_days).to_csv(path, index=False)


### ANOMALIES ###
# implausibly large daily counts (country-specific threshold) take the value from two days earlier
for country, threshold in [("Bolivia", 1000), ("Uganda", 800)]:
    _fix_deaths_file(country, lambda daily, limit=threshold: daily > limit, lag_days=2)


### NEGATIVE VALUES ###
# negative daily counts take the value from the previous day
for country in ["Mozambique", "Philippines", "Uganda", "Angola", "Nigeria", "Myanmar", "Honduras", "Kyrgyzstan"]:
    _fix_deaths_file(country, lambda daily: daily < 0, lag_days=1)