In [1]:
# libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
sys.path.append("../functions/")
from plot_style import plot_style
colors = plot_style("../functions/fonts/")
from import_functions import *

# Study period: first and last Monday covered by the analysis.
start_date = datetime(2020, 3, 2)
end_date = datetime(2020, 12, 21)

# Length of one analysis window (one week).
window = timedelta(days=7)

# Start of every weekly window; the stop is pushed one window past
# `end_date` so that the last Monday itself is included.
dates = pd.to_datetime(
    np.arange(start_date, end_date + window, window)
)

# Root folder of the input data, relative to this notebook.
path_to_data = "../../data/"

The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  font_list = font_manager.createFontList(font_files)


# Create datasets for regression

In [2]:
def _load_mobility_inputs(country):
    """
    Load and filter the inputs shared by the static and weekly dataset builders.

    :param country: name of the country
    :return: tuple (gadm2, maps, epidata) where
        - gadm2 is restricted to the GID_2 units present in the mobility data,
        - maps is restricted to 2020 and to weekdays (Monday-Friday),
        - epidata is returned as loaded.
    """
    gadm2 = import_gadm(country, path_to_data=path_to_data)
    maps = import_range_maps(country, path_to_data=path_to_data)
    epidata = import_epi(country, path_to_data=path_to_data)
    # NOTE: the policy table used to be loaded here as well, but it was never
    # read by either dataset builder, so the load has been dropped.

    # keep gadm2 for which we have mobility data
    gadm2 = gadm2.loc[gadm2.GID_2.isin(maps.polygon_id.unique())]

    # mobility - keep only 2020
    maps = maps.loc[(maps.ds >= datetime(2020, 1, 1)) & (maps.ds < datetime(2021, 1, 1))]

    # mobility - remove weekends
    maps = maps.loc[maps.ds.dt.dayofweek < 5].reset_index(drop=True)

    return gadm2, maps, epidata


def _weekly_series(maps, epidata):
    """
    Aggregate mobility and cases per municipality for every week in `dates`.

    A (municipality, week) pair contributes only when the municipality has
    exactly 5 mobility rows in that week. Because values are appended week by
    week, a municipality whose lists have ``len(dates)`` entries has data for
    every week, and position ``i`` in its lists corresponds to ``dates[i]``.

    :param maps: weekday-only mobility dataframe (columns ds, polygon_id, ...)
    :param epidata: epidemiological dataframe (columns date, GID_2, new_cases)
    :return: dict {gid_2: {"cases": [...], "movs": [...], "stay": [...]}},
             keyed in order of first appearance
    """
    series = {}
    for week_start in dates:
        # loc data in this week (mobility has no weekends, so 5 days suffice)
        week_maps = maps.loc[(maps.ds >= week_start) & (maps.ds < week_start + timedelta(days=5))]
        week_epi = epidata.loc[(epidata.date >= week_start) & (epidata.date < week_start + timedelta(days=7))]

        # weekly cases per municipality, computed once per week instead of
        # re-filtering the epidemiological data for every municipality
        cases_by_gid = week_epi.groupby("GID_2")["new_cases"].sum().to_dict()

        # iterate over municipalities; sort=False keeps order of first appearance
        for gid_2, gid_maps in week_maps.groupby("polygon_id", sort=False):
            # keep only municip. with full week data
            if gid_maps.shape[0] != 5:
                continue
            entry = series.setdefault(gid_2, {"cases": [], "movs": [], "stay": []})
            entry["movs"].append(-100 * gid_maps.all_day_bing_tiles_visited_relative_change.mean())
            entry["stay"].append(100 * gid_maps.all_day_ratio_single_tile_users.mean())
            # municipalities without epidemiological rows this week count 0 cases
            entry["cases"].append(cases_by_gid.get(gid_2, 0))

    return series


def _covariate_values(gadm2, gid_2, covs):
    """
    Read the covariates of one municipality from gadm2 (first matching row).

    :param gadm2: dataframe with a GID_2 column and one column per covariate
    :param gid_2: identifier of the municipality
    :param covs: name of independent variables
    :return: dict {covariate name: value}
    """
    row = gadm2.loc[gadm2.GID_2 == gid_2]
    return {cov: row[cov].values[0] for cov in covs}


def create_dataset_static(country, covs, popcol):

    """
    This function creates the datasets for the regression at the peak of
    mobility change. Only municipalities with complete data for every week in
    `dates` are kept; each contributes one row taken at the week where the
    respective weekly indicator is largest.

    :param country: name of the country
    :param covs: name of independent variables
    :param popcol: population column
    :return: tuple (df_mov, df_stay, df_mov_std, df_stay_std):
        - df_mov: max weekly mobility reduction ("max_movs"), cases in that week, covariates
        - df_stay: max weekly single-tile-users share ("max_stay"), cases in that week, covariates
        - df_mov_std, df_stay_std: the same dataframes after `preprocess`
    """

    gadm2, maps, epidata = _load_mobility_inputs(country)
    weekly = _weekly_series(maps, epidata)

    data = {name: [] for name in ["max_movs", "cases", *covs]}
    data_stay = {name: [] for name in ["max_stay", "cases", *covs]}

    for gid_2, series in weekly.items():
        # keep only municipalities with full 2020 ("movs", "stay" and "cases"
        # always have the same length, so a single check covers all of them)
        if len(series["movs"]) != len(dates):
            continue

        features = _covariate_values(gadm2, gid_2, covs)

        for target, key, out in (("max_movs", "movs", data), ("max_stay", "stay", data_stay)):
            # week at which this indicator peaks
            idx = np.argmax(series[key])
            out[target].append(series[key][idx])
            out["cases"].append(series["cases"][idx])
            # add other features for this municip.
            for cov in covs:
                out[cov].append(features[cov])

    # standardize and preprocess
    df_mov = pd.DataFrame(data=data)
    df_mov_std = preprocess(df_mov, popcol=popcol)

    df_stay = pd.DataFrame(data=data_stay)
    df_stay_std = preprocess(df_stay, popcol=popcol)

    return df_mov, df_stay, df_mov_std, df_stay_std


def create_dataset_time(country, covs, popcol):

    """
    This function creates the dataset for the weekly regression. Only
    municipalities with complete data for every week in `dates` are kept; each
    contributes one row per week.

    :param country: name of the country
    :param covs: name of independent variables
    :param popcol: population column
    :return: tuple (df, df_std): the raw dataframe (columns "max_movs",
             "max_stay", "cases", "week" and the covariates) and the same
             dataframe after `preprocess` ("week" is left unstandardized)
    """

    gadm2, maps, epidata = _load_mobility_inputs(country)
    weekly = _weekly_series(maps, epidata)

    data = {name: [] for name in ["max_movs", "max_stay", "cases", "week", *covs]}
    n_weeks = len(dates)

    for gid_2, series in weekly.items():
        # keep only municipalities with data for every week
        if len(series["movs"]) != n_weeks:
            continue

        data["max_movs"].extend(series["movs"])
        data["max_stay"].extend(series["stay"])
        data["cases"].extend(series["cases"])
        # week index: position of the week in `dates`
        data["week"].extend(range(n_weeks))

        # add other features for this municip. (constant across weeks)
        features = _covariate_values(gadm2, gid_2, covs)
        for cov in covs:
            data[cov].extend(n_weeks * [features[cov]])

    df = pd.DataFrame(data=data)
    df_std = preprocess(df, popcol=popcol, avoid_cols=['week'])
    return df, df_std


def preprocess(df, popcol, cases_col="cases", avoid_cols=None, tests_col="tests_unique_dev_fixed"):
    """
    This function standardizes the variables and performs some preprocessing.

    Steps, applied to a copy of `df` (the input is not modified):
      1. cases are converted to cases per 100k inhabitants;
      2. if present, the tests column becomes log(100 * tests / population);
      3. the population column is replaced by its natural log;
      4. every column not listed in `avoid_cols` is z-scored
         ((x - mean) / std, with mean and std computed ignoring missing values);
      5. rows containing any missing value are dropped.

    Non-finite values (e.g. log(0) = -inf, or a division by a zero
    population) are treated as missing before standardizing. Otherwise a
    single such value makes the column mean/std non-finite, which turns the
    whole column into NaN and empties the returned dataframe.

    :param df: input dataframe
    :param popcol: population column
    :param cases_col: column with the number of cases
    :param avoid_cols: columns that must not be standardized (default: none)
    :param tests_col: column with the number of tests; skipped when absent
    :return: preprocessed dataframe
    """

    skip = set(avoid_cols) if avoid_cols is not None else set()

    df_r = df.copy()

    # cases per 100k
    df_r[cases_col] = 100000 * df_r[cases_col] / df_r[popcol]

    # numpy warnings for log(0) / log(negative) are silenced on purpose:
    # the resulting non-finite values are handled explicitly below
    with np.errstate(divide="ignore", invalid="ignore"):
        # log of tests per 100 inhabitants (only if the dataset has this column)
        if tests_col in df_r.columns:
            df_r[tests_col] = np.log(100 * df_r[tests_col] / df_r[popcol])

        # log of population
        df_r[popcol] = np.log(df_r[popcol])

    for c in df_r.columns:
        if c in skip:
            continue
        # +/-inf -> NaN so that mean/std stay finite for the remaining rows
        col = df_r[c].replace([np.inf, -np.inf], np.nan)
        df_r[c] = (col - col.mean()) / col.std()

    # keep rows with no nans
    df_r = df_r.loc[~df_r.isnull().any(axis=1)]
    return df_r

- Static: dataset at the peak of mobility change (one row per municipality)

In [5]:
# colombia: build the peak-of-mobility-change datasets and store them as csv
df_col_movs, df_col_stay, df_col_movs_std, df_col_stay_std = create_dataset_static(
    country="colombia",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "gdp_per_capita",
        "popDANE",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
        "internet_pen",
        "labor_formality_index",
        "primary_ratio",
        "secondary_ratio",
        "tertiary_ratio",
    ],
    popcol="popDANE",
)
df_col_movs.to_csv("./input-dfs-static/colombia_movs.csv", index=False)
df_col_stay.to_csv("./input-dfs-static/colombia_stay.csv", index=False)
df_col_movs_std.to_csv("./input-dfs-static/colombia_movs_std.csv", index=False)
df_col_stay_std.to_csv("./input-dfs-static/colombia_stay_std.csv", index=False)

# ecuador
df_ecu_movs, df_ecu_stay, df_ecu_movs_std, df_ecu_stay_std = create_dataset_static(
    country="ecuador",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "pop2020",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
    ],
    popcol="pop2020",
)
df_ecu_movs.to_csv("./input-dfs-static/ecuador_movs.csv", index=False)
df_ecu_stay.to_csv("./input-dfs-static/ecuador_stay.csv", index=False)
df_ecu_movs_std.to_csv("./input-dfs-static/ecuador_movs_std.csv", index=False)
df_ecu_stay_std.to_csv("./input-dfs-static/ecuador_stay_std.csv", index=False)

# el salvador
df_slv_movs, df_slv_stay, df_slv_movs_std, df_slv_stay_std = create_dataset_static(
    country="el-salvador",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "gdp_per_capita",
        "pop2020",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
    ],
    popcol="pop2020",
)
df_slv_movs.to_csv("./input-dfs-static/el-salvador_movs.csv", index=False)
df_slv_stay.to_csv("./input-dfs-static/el-salvador_stay.csv", index=False)
df_slv_movs_std.to_csv("./input-dfs-static/el-salvador_movs_std.csv", index=False)
df_slv_stay_std.to_csv("./input-dfs-static/el-salvador_stay_std.csv", index=False)

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  return warn(
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


- Time: weekly dataset (one row per municipality and week)

In [11]:
# colombia: build the weekly datasets and store them as csv
df_col_t, df_col_t_std = create_dataset_time(
    country="colombia",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "gdp_per_capita",
        "popDANE",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
    ],
    popcol="popDANE",
)
df_col_t.to_csv("./input-dfs-time/colombia_time.csv", index=False)
df_col_t_std.to_csv("./input-dfs-time/colombia_time_std.csv", index=False)

# ecuador
df_ecu_t, df_ecu_t_std = create_dataset_time(
    country="ecuador",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "pop2020",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
    ],
    popcol="pop2020",
)
df_ecu_t.to_csv("./input-dfs-time/ecuador_time.csv", index=False)
df_ecu_t_std.to_csv("./input-dfs-time/ecuador_time_std.csv", index=False)

# el salvador
df_slv_t, df_slv_t_std = create_dataset_time(
    country="el-salvador",
    covs=[
        "download_mbps_fixed",
        "rwi_weight",
        "gdp_per_capita",
        "pop2020",
        "pop_density",
        "pop60plus_ratio",
        'tests_unique_dev_fixed',
    ],
    popcol="pop2020",
)
df_slv_t.to_csv("./input-dfs-time/el-salvador_time.csv", index=False)
df_slv_t_std.to_csv("./input-dfs-time/el-salvador_time_std.csv", index=False)

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
