In [1]:
import os

import numpy as np
import pandas as pd

# Root directory that every data path in this notebook is joined onto.
# Set the COVID19_APP_DIR environment variable to run on another machine;
# when it is unset the original location is used, so existing setups keep working.
app_dir = os.environ.get('COVID19_APP_DIR', '/Users/nikhilvs/repos/nyu/covid-19')

# Countries kept from every source. The spellings here are the ones the
# mobility and stringency frames are filtered on with `.isin(COUNTRIES)`.
COUNTRIES = [
    'United States',
    'India',
    'Italy',
    'Canada',
    'Spain',
    'France',
    'United Kingdom',
    'Iraq',
    'Japan',
    'South Korea',
    'Germany',
    'Brazil'
]

In [2]:
# Load the raw Google mobility report from the raw-data area under app_dir.
google_mobility_raw_df = pd.read_csv(
    os.path.join(
        app_dir,
        'data/raw/google',
        'Global_Mobility_Report.csv'
    )
)


In [3]:
# Sanity check: True if any row of the raw mobility data is missing its date.
google_mobility_raw_df['date'].isna().any()

False

In [4]:
# Columns retained from the raw mobility report: the two identifying
# columns followed by the six percent-change-from-baseline metrics.
MOBILITY_COLUMNS_TO_KEEP = (
    ['country_region', 'date']
    + [
        'retail_and_recreation_percent_change_from_baseline',
        'grocery_and_pharmacy_percent_change_from_baseline',
        'parks_percent_change_from_baseline',
        'transit_stations_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
        'residential_percent_change_from_baseline',
    ]
)

In [5]:
# Keep only the tracked countries and the selected mobility columns.
# A single .loc call replaces the chained df[mask][cols] lookup, and the
# explicit .copy() makes the result an independent frame, so cells that
# later modify countries_mobility_df never write through to (or get
# confused with a view of) google_mobility_raw_df.
countries_mobility_df = google_mobility_raw_df.loc[
    google_mobility_raw_df['country_region'].isin(COUNTRIES),
    MOBILITY_COLUMNS_TO_KEEP
].copy()


In [6]:
# Number of non-missing values in each retained mobility column.
countries_mobility_df.notna().sum()

country_region                                        259077
date                                                  259077
retail_and_recreation_percent_change_from_baseline    202056
grocery_and_pharmacy_percent_change_from_baseline     195007
parks_percent_change_from_baseline                     86859
transit_stations_percent_change_from_baseline         118406
workplaces_percent_change_from_baseline               249088
residential_percent_change_from_baseline              130902
dtype: int64

In [7]:
# Replace missing mobility readings with 0.0 in every percent-change column.
# The previous form, df[col].fillna(0.0, inplace=True), mutates a temporary
# column object: it triggers chained-assignment warnings and does nothing at
# all when pandas copy-on-write is enabled. Building a new frame is reliable
# and makes the cell safe to re-run.
# NOTE(review): zero-filling before the per-country mean taken in a later cell
# pulls those means toward 0 wherever readings are missing - confirm that this
# is intended rather than averaging only the observed values.
mobility_metric_columns = [
    column_name
    for column_name in MOBILITY_COLUMNS_TO_KEEP
    if column_name.endswith('_percent_change_from_baseline')
]
countries_mobility_df = countries_mobility_df.fillna(
    {column_name: 0.0 for column_name in mobility_metric_columns}
)

In [22]:
# Collapse all rows sharing a (country, date) pair into the mean of each metric.
countries_mobility_df_agg = (
    countries_mobility_df
    .groupby(['country_region', 'date'])
    .mean()
)

# The index (country_region, date) is written out so the keys survive in the CSV.
countries_mobility_df_agg.to_csv(
    os.path.join(app_dir, 'data/staging', 'google_mobility.csv')
)

In [9]:
# Load the raw Oxford government-response tracker export.
oxford_stringency_df = pd.read_csv(
    os.path.join(
        app_dir,
        'data/raw/oxford',
        'OxCGRT_latest.csv'
    )
)

In [10]:
# Number of non-missing values per column of the raw stringency data.
oxford_stringency_df.notna().sum()

CountryName                              22563
CountryCode                              22563
Date                                     22563
C1_School closing                        21783
C1_Flag                                   9701
C2_Workplace closing                     21543
C2_Flag                                   8167
C3_Cancel public events                  21453
C3_Flag                                   9758
C4_Restrictions on gatherings            21447
C4_Flag                                   8419
C5_Close public transport                21382
C5_Flag                                   6354
C6_Stay at home requirements             21295
C6_Flag                                   7458
C7_Restrictions on internal movement     21587
C7_Flag                                   7853
C8_International travel controls         21702
E1_Income support                        20828
E1_Flag                                   4112
E2_Debt/contract relief                  20806
E3_Fiscal mea

In [11]:
# Columns retained from the Oxford stringency export.
# Deliberately excluded: every per-indicator *_Flag column (C1-C7, E1, H1),
# 'E3_Fiscal measures', 'E4_International support',
# 'H4_Emergency investment in healthcare' and 'H5_Investment in vaccines'.
STRINGENCY_COLUMNS_TO_KEEP = (
    # Row identifiers.
    ['CountryName', 'Date']
    # C-series indicators.
    + [
        'C1_School closing',
        'C2_Workplace closing',
        'C3_Cancel public events',
        'C4_Restrictions on gatherings',
        'C5_Close public transport',
        'C6_Stay at home requirements',
        'C7_Restrictions on internal movement',
        'C8_International travel controls',
    ]
    # E-series indicators.
    + ['E1_Income support', 'E2_Debt/contract relief']
    # H-series indicators.
    + ['H1_Public information campaigns', 'H2_Testing policy', 'H3_Contact tracing']
    # Composite index.
    + ['StringencyIndex']
)

In [12]:
# Keep only the tracked countries and the selected policy columns.
# .loc plus an explicit .copy() yields an independent frame, so assigning the
# reformatted Date column below is not a write to a slice of
# oxford_stringency_df (the chained df[mask][cols] form raised
# SettingWithCopyWarning here).
countries_stringency_df = oxford_stringency_df.loc[
    oxford_stringency_df['CountryName'].isin(COUNTRIES),
    STRINGENCY_COLUMNS_TO_KEEP
].copy()

# Rewrite Date from YYYYMMDD to YYYY-MM-DD, the form the mobility `date`
# column is joined against later. Parsing with an explicit format is
# vectorised and fails loudly on a malformed value, where the earlier
# character slicing would have silently produced a garbage string.
countries_stringency_df['Date'] = pd.to_datetime(
    countries_stringency_df['Date'].astype(str),
    format = '%Y%m%d'
).dt.strftime('%Y-%m-%d')

# NOTE(review): missing indicator values (including StringencyIndex) are
# recorded as 0.0, which is indistinguishable from "no measures" - confirm.
countries_stringency_df = countries_stringency_df.fillna(0.0)
countries_stringency_df.to_csv(os.path.join(app_dir, 'data/processed', 'oxford_stringency.csv'))

In [13]:
# Number of non-missing values per column after filtering and zero-filling.
countries_stringency_df.notna().sum()

CountryName                             1632
Date                                    1632
C1_School closing                       1632
C2_Workplace closing                    1632
C3_Cancel public events                 1632
C4_Restrictions on gatherings           1632
C5_Close public transport               1632
C6_Stay at home requirements            1632
C7_Restrictions on internal movement    1632
C8_International travel controls        1632
E1_Income support                       1632
E2_Debt/contract relief                 1632
H1_Public information campaigns         1632
H2_Testing policy                       1632
H3_Contact tracing                      1632
StringencyIndex                         1632
dtype: int64

In [16]:

# Load the JHU confirmed-cases time series (wide layout: one column per date,
# one row per Country/Region + Province/State combination).
jhu_cases_raw_df = pd.read_csv(os.path.join(app_dir, 'data/raw/jhu', 'time_series_covid19_confirmed_global.csv'))
countries_metrics_df = jhu_cases_raw_df.drop(columns = ['Province/State', 'Lat', 'Long'])

# Map JHU country labels onto the spellings used in COUNTRIES. Without the
# 'Korea, South' entry, 'South Korea' never matches and is silently skipped.
# NOTE(review): 'Korea, South' is the label used by the JHU CSSE dataset -
# verify against the downloaded file.
countries_metrics_df['Country/Region'] = countries_metrics_df['Country/Region'].replace({
    'US': 'United States',
    'Korea, South': 'South Korea'
})

# One long-format frame per country, concatenated once at the end.
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0.)
country_frames = []

for country in COUNTRIES:
    country_rows_df = countries_metrics_df[countries_metrics_df['Country/Region'] == country]

    # Country absent from this file: nothing to emit.
    if country_rows_df.empty:
        continue

    # A country reported per Province/State occupies several rows. Sum them
    # to get the national series; previously only the first row was kept,
    # i.e. a single province's numbers were reported as the country total.
    country_totals = country_rows_df.drop(columns = ['Country/Region']).sum(axis = 0)

    country_frames.append(pd.DataFrame({
        'country': country,
        # Column labels are the dates; assumes JHU's m/d/yy header format
        # (e.g. 1/22/20) - TODO confirm if the source layout changes.
        'date': pd.to_datetime(country_totals.index, format = '%m/%d/%y'),
        'cases': country_totals.values
    }))

if country_frames:
    all_countries_metrics_df = pd.concat(country_frames, ignore_index = True)
else:
    # Preserve the output schema even when no country matched.
    all_countries_metrics_df = pd.DataFrame(columns = ['country', 'date', 'cases'])

all_countries_metrics_df.to_csv(os.path.join(app_dir, 'data/staging', 'jhu_cases.csv'), index = False)

In [17]:

# Load the JHU deaths time series (wide layout: one column per date,
# one row per Country/Region + Province/State combination).
jhu_deaths_raw_df = pd.read_csv(os.path.join(app_dir, 'data/raw/jhu', 'time_series_covid19_deaths_global.csv'))
countries_metrics_df = jhu_deaths_raw_df.drop(columns = ['Province/State', 'Lat', 'Long'])

# Map JHU country labels onto the spellings used in COUNTRIES. Without the
# 'Korea, South' entry, 'South Korea' never matches and is silently skipped.
# NOTE(review): 'Korea, South' is the label used by the JHU CSSE dataset -
# verify against the downloaded file.
countries_metrics_df['Country/Region'] = countries_metrics_df['Country/Region'].replace({
    'US': 'United States',
    'Korea, South': 'South Korea'
})

# One long-format frame per country, concatenated once at the end.
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0.)
country_frames = []

for country in COUNTRIES:
    country_rows_df = countries_metrics_df[countries_metrics_df['Country/Region'] == country]

    # Country absent from this file: nothing to emit.
    if country_rows_df.empty:
        continue

    # A country reported per Province/State occupies several rows. Sum them
    # to get the national series; previously only the first row was kept,
    # i.e. a single province's numbers were reported as the country total.
    country_totals = country_rows_df.drop(columns = ['Country/Region']).sum(axis = 0)

    country_frames.append(pd.DataFrame({
        'country': country,
        # Column labels are the dates; assumes JHU's m/d/yy header format
        # (e.g. 1/22/20) - TODO confirm if the source layout changes.
        'date': pd.to_datetime(country_totals.index, format = '%m/%d/%y'),
        'deaths': country_totals.values
    }))

if country_frames:
    all_countries_metrics_df = pd.concat(country_frames, ignore_index = True)
else:
    # Preserve the output schema even when no country matched.
    all_countries_metrics_df = pd.DataFrame(columns = ['country', 'date', 'deaths'])

all_countries_metrics_df.to_csv(os.path.join(app_dir, 'data/staging', 'jhu_deaths.csv'), index = False)

In [18]:

# Load the JHU recovered time series (wide layout: one column per date,
# one row per Country/Region + Province/State combination).
jhu_recovered_raw_df = pd.read_csv(os.path.join(app_dir, 'data/raw/jhu', 'time_series_covid19_recovered_global.csv'))
countries_metrics_df = jhu_recovered_raw_df.drop(columns = ['Province/State', 'Lat', 'Long'])

# Map JHU country labels onto the spellings used in COUNTRIES. Without the
# 'Korea, South' entry, 'South Korea' never matches and is silently skipped.
# NOTE(review): 'Korea, South' is the label used by the JHU CSSE dataset -
# verify against the downloaded file.
countries_metrics_df['Country/Region'] = countries_metrics_df['Country/Region'].replace({
    'US': 'United States',
    'Korea, South': 'South Korea'
})

# One long-format frame per country, concatenated once at the end.
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0.)
country_frames = []

for country in COUNTRIES:
    country_rows_df = countries_metrics_df[countries_metrics_df['Country/Region'] == country]

    # Country absent from this file: nothing to emit.
    if country_rows_df.empty:
        continue

    # A country reported per Province/State occupies several rows. Sum them
    # to get the national series; previously only the first row was kept,
    # i.e. a single province's numbers were reported as the country total.
    country_totals = country_rows_df.drop(columns = ['Country/Region']).sum(axis = 0)

    country_frames.append(pd.DataFrame({
        'country': country,
        # Column labels are the dates; assumes JHU's m/d/yy header format
        # (e.g. 1/22/20) - TODO confirm if the source layout changes.
        'date': pd.to_datetime(country_totals.index, format = '%m/%d/%y'),
        'recovered': country_totals.values
    }))

if country_frames:
    all_countries_metrics_df = pd.concat(country_frames, ignore_index = True)
else:
    # Preserve the output schema even when no country matched.
    all_countries_metrics_df = pd.DataFrame(columns = ['country', 'date', 'recovered'])

all_countries_metrics_df.to_csv(os.path.join(app_dir, 'data/staging', 'jhu_recovered.csv'), index = False)

In [19]:
# Reload the intermediate CSVs produced earlier in this notebook.
# Each file is read from the directory its producing cell writes to: the
# mobility and JHU files are saved under data/staging, the stringency file
# under data/processed. Reading the staging files from data/processed (as
# before) fails on a clean checkout or silently picks up stale copies.
# NOTE(review): if a separate step promotes staging files to data/processed,
# point these paths back there and document that step in the notebook.
google_mobility_df = pd.read_csv(os.path.join(app_dir, 'data/staging', 'google_mobility.csv'))
oxford_stringency_df = pd.read_csv(os.path.join(app_dir, 'data/processed', 'oxford_stringency.csv'))
jhu_cases_df = pd.read_csv(os.path.join(app_dir, 'data/staging', 'jhu_cases.csv'))
jhu_deaths_df = pd.read_csv(os.path.join(app_dir, 'data/staging', 'jhu_deaths.csv'))
jhu_recovered_df = pd.read_csv(os.path.join(app_dir, 'data/staging', 'jhu_recovered.csv'))

In [20]:
# Key columns every source is joined on, named as in the mobility frame.
merge_keys = ['country_region', 'date']

# Frames joined onto the mobility data, in order, with their own key
# columns renamed to merge_keys so each join can use a plain `on=`.
frames_to_join = [
    oxford_stringency_df.rename(columns = {'CountryName': 'country_region', 'Date': 'date'}),
    jhu_cases_df.rename(columns = {'country': 'country_region'}),
    jhu_deaths_df.rename(columns = {'country': 'country_region'}),
    jhu_recovered_df.rename(columns = {'country': 'country_region'}),
]

# Inner joins: a (country, date) pair survives only if every source has it.
unified_df = google_mobility_df
for frame_to_join in frames_to_join:
    unified_df = unified_df.merge(frame_to_join, how = 'inner', on = merge_keys)

# Final column order of the unified table; anything else picked up from the
# intermediate files is dropped here.
unified_df = unified_df[[
    'country_region',
    'date',
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'E1_Income support',
    'E2_Debt/contract relief',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
    'StringencyIndex',
    'cases',
    'deaths',
    'recovered'
]]

In [21]:
# Save the merged table; the positional row index is not written.
unified_df.to_csv(
    os.path.join(app_dir, 'data/staging', 'unified.csv'),
    index = False
)