In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
from matplotlib.patches import Rectangle
sns.set()

# Introduction <a id='intro'></a>

This notebook uses a variety of COVID-19 related datasets to explore the behavior
of multiple time series. It also creates new features that attempt to encapsulate the
time-dependent (and time-delayed) nature of the problem; these will be used in the model-creation
project, which builds time-dependent forecasting models.


# Table of contents

## [Function definitions](#generalfunctions)

## [Data](#imports)

## [Exploratory Data Analysis](#EDA)

## [Feature production](#newfeatures)

## Function definitions <a id='generalfunctions'></a>

In [10]:
def append_rolling_values(df, features, roll_widths):
    """Compute per-location rolling means of ``features`` for several window widths.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'location'`` column and every column named in
        ``features``. The rolling window follows the row order within each
        location, so rows should already be in chronological order.
    features : list-like of str
        Names of the columns to average (``pd.Index``, list, tuple, ...).
    roll_widths : iterable of int
        Window sizes, in rows.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window width, indexed by ``(location, original row label)``
        with columns named ``<feature>_rolling_mean_<window>``. Positions without
        a complete window (or whose window contains NaN) are filled with 0.
    """
    feature_names = list(features)
    # Select the feature columns explicitly: the new labels are then guaranteed
    # to line up with the averaged columns even when `df` carries other columns.
    grouped = df.groupby(by='location')[feature_names]
    new_feature_df_list = []
    for window in roll_widths:
        rollmean = grouped.rolling(window).mean().fillna(value=0.)
        rollmean.columns = [name + '_rolling_mean_' + str(window) for name in feature_names]
        new_feature_df_list.append(rollmean)
    return new_feature_df_list

def tsplot(data, roll_width, **kw):
    """Plot the rolling mean of ``data`` with a band of +/- one rolling std.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to smooth. They are flattened with ``ravel()`` and plotted
        against ``range(len(data))``, so a Series or one-column frame is expected.
    roll_width : int
        Rolling window size, in rows.
    **kw
        Extra keyword arguments forwarded to ``Axes.plot`` for the mean line.

    Returns
    -------
    matplotlib.axes.Axes
        The axes of the newly created figure.
    """
    # Use the `data` argument (previously the global `datatmp` was read instead).
    window = data.rolling(roll_width)
    # The first roll_width - 1 positions have no full window; back-fill them
    # from the first valid value.
    rollmean = window.mean().bfill().values.ravel()
    rollstd = window.std().bfill().values.ravel()
    positions = range(len(data))
    fig, ax = plt.subplots()
    ax.fill_between(positions, rollmean - rollstd, rollmean + rollstd, alpha=0.5)
    ax.plot(positions, rollmean, color='k', **kw)
    return ax

## Data <a id='imports'></a>

In [3]:
# Load the pre-processed dataset; the first CSV column becomes the row index.
data = pd.read_csv('data.csv', index_col=0)
# Show five random rows (no random_state is set, so the sample changes between runs).
data.sample(5)

Unnamed: 0,location,date,active,new_deaths,total_deaths_per_million,new_deaths_per_million,total_tests_per_thousand,new_tests_per_thousand,tests_units,c1_school_closing,...,penalty_missing_flag,per100k_missing_flag,testsPer100k_missing_flag,n_cases_missing_flag,new_tests_average_missing_flag,n_deaths_missing_flag,n_recovered_missing_flag,n_tests_missing_flag,days_since_missing_flag,time_index_missing_flag
8497,Mauritania,2020-02-24 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Missing,0.0,...,True,True,True,False,True,False,False,True,False,False
734,Austria,2020-03-04 00:00:00,0.0,0.0,0.0,0.0,0.357,0.052,units unclear,0.0,...,False,False,False,False,False,False,False,False,False,False
13307,Tunisia,2020-02-10 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,Missing,0.0,...,True,True,True,False,True,False,False,True,False,False
4784,Estonia,2020-04-03 00:00:00,901.0,6.0,8.292,4.523,15.227,1.178,units unclear,3.0,...,False,False,False,False,False,False,False,False,False,False
5470,Germany,2020-04-19 00:00:00,52598.0,184.0,51.251,2.196,20.94,0.0,Missing,3.0,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# # key words account for multilevel indices
# data = pd.read_csv('data.csv', index_col=[0,1], header=[0,1])
# data.sample(5)

## Feature production <a id='newfeatures'></a>

The function ```append_rolling_values``` is not working. The rolling averages need to be computed for each
country's time series individually, but the results should be stored in the multi-index DataFrame.

Before interpolation and backfilling, I used to prune countries which did not have cases prior
to responses (i.e. "early responders" were not included).
To make my life easier, I'm only taking data which had cases before all government mandates, so the rates before and after are well defined. We can think of these as being "late responders".

In [9]:
# Column labels containing the substring 'flag' (both the '*_missing_flag'
# indicators and the 'c1_flag'-style columns match).
flag_columns = data.columns[data.columns.str.contains('flag')]

In [6]:
# features = data.iloc[:,2:].columns.difference(flag_columns.tolist() + ['tests_units', 'time_index', 'days_since','population'])
# Every column except the identifier / bookkeeping columns listed below.
# Unlike the disabled variant above, the flag columns are kept as features.
features = data.columns.difference(['date','location','tests_units', 'time_index', 'days_since','population'])
features

Index(['active', 'active_missing_flag', 'c1_flag', 'c1_flag_missing_flag',
       'c1_school_closing', 'c1_school_closing_missing_flag', 'c2_flag',
       'c2_flag_missing_flag', 'c2_workplace_closing',
       'c2_workplace_closing_missing_flag', 'c3_cancel_public_events',
       'c3_cancel_public_events_missing_flag', 'c3_flag',
       'c3_flag_missing_flag', 'c4_flag', 'c4_flag_missing_flag',
       'c4_restrictions_on_gatherings',
       'c4_restrictions_on_gatherings_missing_flag',
       'c5_close_public_transport', 'c5_close_public_transport_missing_flag',
       'c5_flag', 'c5_flag_missing_flag', 'c6_flag', 'c6_flag_missing_flag',
       'c6_stay_at_home_requirements',
       'c6_stay_at_home_requirements_missing_flag', 'c7_flag',
       'c7_flag_missing_flag', 'c7_restrictions_on_internal_movement',
       'c7_restrictions_on_internal_movement_missing_flag',
       'c8_international_travel_controls',
       'c8_international_travel_controls_missing_flag',
       'days_since_mis

In [7]:
# Feature columns plus the 'location' key that append_rolling_values groups on.
features_groupby = features.tolist() + ['location']
# Rolling-window widths, in rows.  # presumably one row per day per location — TODO confirm
roll_widths = [2, 3, 5, 7, 14]
# Restrict the frame to the feature columns and the grouping key.
datatmp = data.loc[:, features_groupby]

In [11]:
# One rolling-mean frame per window width in `roll_widths`.
new_feature_df_list = append_rolling_values(datatmp, features, roll_widths)

In [12]:
# Each rolling frame is indexed by (location, original row label). Drop the
# location level and join on the original row label, so the new columns are
# attached to the right rows regardless of how `data` is ordered (resetting the
# index instead attaches them by position, which is only correct when `data` is
# sorted by location with a 0..N-1 index).
# Assumes the row labels of `data` are unique — TODO confirm.
data_new_features = data.join(
    pd.concat(new_feature_df_list, axis=1).droplevel(0)
)

In [13]:
# Write the augmented frame (original columns + rolling means) to disk.
data_new_features.to_csv('modeling_data.csv')

In [14]:
# Display the combined frame (pandas abbreviates the rendering of large frames).
data_new_features

Unnamed: 0,location,date,active,new_deaths,total_deaths_per_million,new_deaths_per_million,total_tests_per_thousand,new_tests_per_thousand,tests_units,c1_school_closing,...,stringency_index_for_display_missing_flag_rolling_mean_14,stringency_index_missing_flag_rolling_mean_14,testsPer100k_rolling_mean_14,testsPer100k_missing_flag_rolling_mean_14,tests_units_missing_flag_rolling_mean_14,time_index_missing_flag_rolling_mean_14,total_deaths_per_million_rolling_mean_14,total_deaths_per_million_missing_flag_rolling_mean_14,total_tests_per_thousand_rolling_mean_14,total_tests_per_thousand_missing_flag_rolling_mean_14
0,Afghanistan,2019-12-31 00:00:00,0.0,0.0,0.000,0.0,0.0,0.0,Missing,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.0
1,Afghanistan,2020-01-01 00:00:00,0.0,0.0,0.000,0.0,0.0,0.0,Missing,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.0
2,Afghanistan,2020-01-02 00:00:00,0.0,0.0,0.000,0.0,0.0,0.0,Missing,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.0
3,Afghanistan,2020-01-03 00:00:00,0.0,0.0,0.000,0.0,0.0,0.0,Missing,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.0
4,Afghanistan,2020-01-04 00:00:00,0.0,0.0,0.000,0.0,0.0,0.0,Missing,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15003,Zimbabwe,2020-05-08 00:00:00,25.0,0.0,0.202,0.0,0.0,0.0,Missing,3.0,...,0.000000,0.285714,55.507143,0.357143,0.0,0.0,0.202,1.0,0.0,1.0
15004,Zimbabwe,2020-05-09 00:00:00,21.0,0.0,0.202,0.0,0.0,0.0,Missing,3.0,...,0.000000,0.357143,57.057143,0.428571,0.0,0.0,0.202,1.0,0.0,1.0
15005,Zimbabwe,2020-05-10 00:00:00,22.0,0.0,0.202,0.0,0.0,0.0,Missing,3.0,...,0.000000,0.428571,58.450000,0.500000,0.0,0.0,0.202,1.0,0.0,1.0
15006,Zimbabwe,2020-05-11 00:00:00,23.0,0.0,0.202,0.0,0.0,0.0,Missing,3.0,...,0.071429,0.500000,59.842857,0.571429,0.0,0.0,0.202,1.0,0.0,1.0


## Exploratory Data Analysis<a id='EDA'></a>
Ideas for the inclusion or creation of new columns.

Moving averages
fourier
signal
flags for lots of different things

hardest hit countries

days since

extrapolated, actual, interpolated

which dataset it came from

Humans view, interpret and forecast things in ways that are not available to robots.
The aim is a data-driven, time-dependent manner of modeling that really tries to encapsulate the time dependence.

In [None]:
# NOTE(review): this passes the full `data` frame (which also holds non-feature
# columns such as 'date' and 'tests_units'), whereas the feature-production
# section passes `datatmp`. The result is only displayed, not stored — confirm
# whether this cell is still needed.
append_rolling_values(data, features, roll_widths)

### USA COVID data

In [None]:
# NOTE(review): `start_end_df` is only built in a later cell of this notebook and
# `test_multiindex_df` is not defined in the visible part of it, so this cell
# depends on kernel state from elsewhere.
# Only the final expression of the cell is rendered; the intermediate `.head()`
# calls below produce no output.

# Earliest date over all columns of `start_end_df`, per row.
first_response_dates = start_end_df.min(axis=1).sort_index()
first_response_dates.head(10)

# Earliest `Date` per level-0 key of the testing frame.
# NOTE(review): this is the first date present in `test_multiindex_df`, not
# necessarily the first date with a case — confirm that is intended.
first_case_dates = test_multiindex_df.reset_index(level=1).groupby(level=0).Date.min().sort_index()
first_case_dates.head(10)

# Rows that actually have a cumulative test count.
dates_with_test_data = test_multiindex_df.tests_cumulative.dropna()
dates_with_test_data.head()

# NOTE(review): same expression as `first_testing_dates` below.
min_testing_dates = test_multiindex_df.tests_cumulative.dropna().reset_index(level=1).groupby(level=0).Date.min()

# First / last date with a non-missing cumulative test count, per level-0 key.
first_testing_dates = test_multiindex_df.tests_cumulative.dropna().reset_index(level=1).groupby(level=0).Date.min()
last_testing_dates = test_multiindex_df.tests_cumulative.dropna().reset_index(level=1).groupby(level=0).Date.max()

first_testing_dates.reset_index()

# convert entire dataframe to index so it can be used to slice testing data, dataframe
first_tmp =  first_testing_dates.reset_index().set_index(['Country','Date'])
last_tmp =  last_testing_dates.reset_index().set_index(['Country','Date'])
first_tmp.head()

# Rows of the testing frame at each key's first / last test date.
test_min = test_multiindex_df.loc[first_tmp.index, :]
test_max = test_multiindex_df.loc[last_tmp.index, :]

# reset index so we can subtract datetime variables.
test_max_reset = test_max.reset_index(level=1)
test_min_reset = test_min.reset_index(level=1)
# Days between the first and last test date.
time_differential = (test_max_reset.Date - test_min_reset.Date).dt.days
# Log of the ratio last/first cumulative tests; the division by the time
# interval is disabled, so this is not yet a per-day rate.
testing_rates = np.log(test_max_reset.tests_cumulative / test_min_reset.tests_cumulative)# / time_intervals


# NOTE(review): same expression as `time_differential` above.
test_final_test_initial_time_intervals = (test_max_reset.Date - test_min_reset.Date).dt.days

# Days from the first response to the first case date (negative: case came first).
case_response_differential = (first_case_dates-first_response_dates).dt.days

# True where the first case date precedes the first response date.
late_response = case_response_differential < 0
late_response

In [None]:
# NOTE(review): `us_deaths`, `us_cases` and `u` are not defined in the visible
# part of this notebook; this cell depends on kernel state from elsewhere.
states_to_inspect = ['Michigan', 'Georgia', 'New York', 'Texas']

# State-level totals: sum all rows of each selected state.
dead = us_deaths[us_deaths['Province_State'].isin(states_to_inspect)].groupby(by='Province_State').sum()
confirmed = us_cases[us_cases['Province_State'].isin(states_to_inspect)].groupby(by='Province_State').sum()

# Replace zeros with NaN so that log10 and the ratio below yield NaN instead of
# -inf / division by zero.
since_first_case_normalized_u = u.replace(to_replace=[0, 0.], value=np.nan)
time_series_df = since_first_case_normalized_u

# Rows of the selected states for each series type; `.iloc[:, 6:]` skips the
# first six columns.  # presumably metadata columns — TODO confirm
confirmed_counts = since_first_case_normalized_u.loc[(states_to_inspect, 'Confirmed'), :].iloc[:, 6:]
dead_counts = since_first_case_normalized_u.loc[(states_to_inspect, 'Dead'), :].iloc[:, 6:]

# Confirmed counts on a log10 scale, one line per selected row.
confirmed_counts.apply(np.log10).transpose().plot()

# Death rate in percent. The ratio is computed on raw arrays (positionally)
# because the two selections carry different second-level labels
# ('Dead' vs 'Confirmed').
death_rate_df = dead_counts.copy()
death_rate_normalized = 100 * dead_counts.values / confirmed_counts.values
death_rate_df.loc[:, :] = death_rate_normalized


In [None]:
# Histogram of first-case dates. Each date is replaced by its integer category
# code, so the x axis is the rank of the distinct date, not calendar time.
first_case_dates.astype('category').cat.codes.plot.hist(bins=50)

In [None]:
# Stacks the per-window frames row-wise (concat's default axis=0) and sorts the
# columns by label. Each frame has its own window-specific column names, so the
# result is NaN outside each frame's own columns.
# NOTE(review): the feature-production section concatenates with axis=1 —
# confirm which layout is intended here.
pd.concat(new_feature_df_list,ignore_index=False).sort_index(axis=1)

In [None]:
# Create the axes explicitly and pass them to pandas. `DataFrame.plot()` called
# without `ax` opens its own figure, which would leave the 10x10 / 200 dpi
# figure requested here empty and save the plot at default size instead.
fig, ax = plt.subplots(figsize=(10, 10), dpi=200)
death_rate_df.transpose().plot(ax=ax).legend(bbox_to_anchor=(1, 1))
ax.set_xlabel('Date')
ax.set_ylabel('Death Rate (%)')
ax.grid(True, axis='both')
ax.set_title('Death rate by state')
fig.savefig('death_rate_NY_MI_GA.png', bbox_inches='tight')

In [None]:
# Side-by-side comparison of confirmed cases (left) and deaths (right) from
# 2/21/20 onwards, on a shared logarithmic y axis.
fig, (ax, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20, 5), dpi=200)
confirmed.loc[:, '2/21/20':].transpose().plot(ax=ax).legend(bbox_to_anchor=(0.2, 1))
dead.loc[:, '2/21/20':].transpose().plot(ax=ax2).legend(bbox_to_anchor=(0.2, 1))
ax.set_title('Number of confirmed cases vs. time')
# The right panel shows deaths, so the title says "deceased" (was "diseased").
ax2.set_title('Number of deceased vs. time')
for panel in (ax, ax2):
    panel.set_yscale('log')
    panel.grid(True, axis='both')
fig.savefig('cases_vs_dead_comparison_GA_NY_MI.png', bbox_inches='tight')

In [None]:
def top_5_counties(state_df, state_name):
    """Return the time series of the five counties with the largest totals in a state.

    Parameters
    ----------
    state_df : pandas.DataFrame
        County-level frame with a ``Province_State`` column, an ``Admin2``
        (county) column, the metadata columns dropped below, and value columns.
    state_name : str
        Value of ``Province_State`` to select.

    Returns
    -------
    pandas.DataFrame
        Transposed frame: one row per remaining value column, one column per
        county (columns axis named ``'County'``), limited to the five counties
        whose values summed over all columns are largest.
    """
    metadata_columns = ['UID','iso2','iso3','code3','FIPS','Country_Region','Lat','Long_','Combined_Key','Province_State']
    in_state = state_df.Province_State == state_name
    county_values = state_df[in_state].drop(columns=metadata_columns)

    # Rank counties by their grand total over every remaining column.
    county_totals = county_values.groupby(by='Admin2').sum().sum(axis=1)
    leading_counties = county_totals.sort_values(ascending=False)[:5].index.tolist()

    is_leader = county_values.Admin2.isin(leading_counties)
    state_info = county_values[is_leader].set_index('Admin2').transpose()
    state_info.columns.name = 'County'
    return state_info

### Global COVID data

In [None]:
# NOTE(review): `global_recovered`, `global_confirmed` and `global_dead` are not
# defined in the visible part of this notebook.

# Country-level totals over the date columns only ('1/22/20' onwards).
# Assumes the raw date columns are in chronological order — TODO confirm.
global_recovered_dates_only = global_recovered.set_index('Country/Region').loc[:, '1/22/20':].groupby(level=0).sum()
global_confirmed_dates_only = global_confirmed.set_index('Country/Region').loc[:, '1/22/20':].groupby(level=0).sum()
global_dead_dates_only = global_dead.set_index('Country/Region').loc[:, '1/22/20':].groupby(level=0).sum()

global_dead['type'] = 'Dead'
global_confirmed['type'] = 'Confirmed'
global_recovered['type'] = 'Recovered'

# NOTE(review): these slices run to the last column and are taken after 'type'
# was assigned above, so they presumably include the 'type' column — confirm.
dead = global_dead[global_dead['Country/Region'].isin(['Germany', 'Italy', 'US'])].set_index('Country/Region').loc[:, '1/22/20':]
confirmed = global_confirmed[global_confirmed['Country/Region'].isin(['Germany', 'Italy', 'US'])].set_index('Country/Region').loc[:, '1/22/20':]

global_dead = global_dead.sort_index(axis=1)
global_confirmed = global_confirmed.sort_index(axis=1)
global_recovered = global_recovered.sort_index(axis=1)

# The date labels are strings, so the column sort above is lexicographic
# ('4/28/20' sorts before '4/3/20'), not chronological. Slicing the sorted
# frames by date label therefore drops dates and scrambles the time axis.
# Rank and select from the `*_dates_only` totals instead, which keep the
# original column order.
confirmed_by_country = global_confirmed_dates_only.loc[:, '1/22/20':'4/28/20']
top10 = confirmed_by_country.sort_values(by='4/28/20').iloc[-10:, :]
skr = confirmed_by_country.loc['Korea, South']

top10_and_south_korea = pd.concat((top10, skr.to_frame(name='South Korea').transpose()), axis=0).sort_index()

# Confirmed cases since each country's first non-zero count, on a log scale.
fig, ax = plt.subplots(figsize=(10, 10))
for i, country_time_series in enumerate(top10_and_south_korea.replace(to_replace=[0, 0.], value=np.nan).values):
    nan_count = np.sum(np.isnan(country_time_series))
    # Rotate the NaNs (former zeros) to the end so every curve starts at its
    # first non-zero value; this relies on the zeros being leading values.
    days_since_first = np.roll(country_time_series, -nan_count)
    ax.plot(days_since_first, label=top10_and_south_korea.index[i])

ax.legend()
ax.set_yscale('log')
plt.show()

# Global death rate: total deaths over total confirmed cases, per date.
dsum = global_dead_dates_only.sum()
csum = global_confirmed_dates_only.sum()
drsum = 100*dsum/csum
drsum.plot()
_ = plt.xlabel('Date')
_ = plt.ylabel('Death Rate (%)')
_ = plt.title('Average global death rate vs. time')
plt.grid(True, axis='both')
plt.savefig('death_rate_global.png', bbox_inches='tight')

In [None]:
# NOTE(review): `case_df` and `response_df` are not defined in the visible part
# of this notebook.
# Per country: earliest date whose `total_cases` is non-zero (zeros are turned
# into NaN and dropped before taking the minimum).
first_case_dates = case_df.reset_index().set_index(['Country','date']).total_cases.replace(
                           to_replace=0,value=np.nan).dropna().reset_index(level=1).groupby(level=0).date.min()

# Earliest value over all columns of `response_df`, per row.
first_response_dates = response_df.min(axis=1)
tmp = response_df.copy()
# NOTE(review): `dt` is not used anywhere else in this cell.
dt = pd.DataFrame(np.tile(first_case_dates.values.reshape(-1,1),(1, response_df.shape[1])))
# Each response value minus the first-case date, repeated across the columns.
# The subtraction is positional: it assumes `response_df` rows and
# `first_case_dates` list the same countries in the same order — TODO confirm.
diff_df = tmp - np.tile(first_case_dates.values.reshape(-1,1),(1, response_df.shape[1]))
# Per row: number of responses whose offset is not strictly positive (entries
# that are missing also count).
num_miss=diff_df.where(diff_df > pd.Timedelta(days=0)).isna().sum(1).sort_values(ascending=False)
# Rows where every response came strictly after the first case.
countries_with_cases_before_responses = num_miss.where(num_miss==0).dropna().index

Just using the endpoints of each interval is not going to work well, because if the endpoints are outliers they
will not capture the overall trend. Therefore, I will do the following: average the quantity cases / ((1M people) × (100k tests)) over the interval before and over the interval after the quarantine measure, and then compare those averages with the value at the quarantine date. I believe this is fair because it is applied equally to both intervals.


In [None]:

# Response columns to analyse, selected by position from `response_df`.
# Defined before the loop that iterates over it (it used to be assigned only
# after its first use, which fails on a fresh kernel).
# NOTE(review): `response_df` is not defined in the visible part of this notebook.
all_responses = response_df.iloc[:, [0, 1, 2, 3, 5, 6]]

country_list = []
slice_list = []   # first active date per (country, response)
minmax_list = []  # first and last active date per (country, response)

for country, country_df in all_responses.groupby(level=0):
    # Zero entries become NaN so dropna() keeps only the non-zero dates.
    active_dates = country_df.replace(to_replace=0., value=np.nan)
    country_list.append(country)
    for single_response in active_dates.columns:
        effective_range = active_dates[single_response].dropna(axis=0)
        active_days = effective_range.reset_index().Date
        before = active_days.min()
        after = active_days.max()
        slice_list.append(before)
        minmax_list += [before, after]

# One row per country, one column per response: first active date.
enacted_ended_df = pd.DataFrame(np.array(slice_list).reshape(len(country_list), -1), index=country_list, columns=all_responses.columns)

# One row per country, '<response>_start' / '<response>_end' column pairs.
start_end_columns = np.array([[x+'_start', x+'_end'] for x in all_responses.columns.tolist()]).ravel()
start_end_df = pd.DataFrame(np.array(minmax_list).reshape(len(country_list), -1), index=country_list, columns=start_end_columns)

# Ignore public-transport closures, then keep only countries that have a start
# and end date for every remaining response.
start_end_filtered_df = start_end_df.drop(columns=['Close_public_transport_start','Close_public_transport_end']).dropna(axis=0)
filtered_countries = start_end_filtered_df.index
enacted_ended_filtered_df = enacted_ended_df.drop(columns=['Close_public_transport']).loc[filtered_countries, :]

In [None]:
# Join the case and test frames on their index; overlapping column names get
# '_x' / '_y' suffixes, and the columns are then sorted in descending label order.
# NOTE(review): this rebinds `data`, the name earlier cells use for the frame
# loaded from data.csv.
data = case_multiindex_df.join(test_multiindex_df, lsuffix='_x', rsuffix='_y').sort_index(axis=1, ascending=False)

To ensure that total cases is a cumulative variable, replace zeros with np.nan and then interpolate backwards.
Growth rate calculations require values greater than zero, so remove all dates with zero confirmed cases, per country.

In [None]:
# Put every country on the same date grid: reindex onto the full
# (Country x Date) product, leaving NaN where a country has no row for a date.
data = data.reindex(pd.MultiIndex.from_product([data.index.levels[0],
                    data.index.get_level_values(1).unique().sort_values()], names=['Country', 'Date']), fill_value=np.nan)

# Zeros are treated as missing so they can be interpolated below.
data.loc[:, 'total_cases'] = data.loc[:, 'total_cases'].replace(to_replace=[0, 0.], value=np.nan)
# Create the target columns first so they can be filled country by country.
data.loc[:, 'total_cases_interpolated'] = data.loc[:, 'total_cases'].copy()
data.loc[:, 'tests_cumulative_interpolated'] = data.loc[:, 'tests_cumulative'].copy()

for country, country_df in data.groupby(level=0):
    # Interpolate within one country at a time so values never leak across countries.
    data.loc[country, 'total_cases_interpolated'] = country_df.loc[:, 'total_cases'].interpolate(limit_direction='backward').values
    data.loc[country, 'tests_cumulative_interpolated'] = country_df.loc[:, 'tests_cumulative'].interpolate(limit_direction='backward').values
    # .bfill() replaces fillna(method='backfill'), which is deprecated in pandas.
    data.loc[country, 'population'] = country_df.loc[:, 'population'].bfill()

# NOTE(review): the column name says "per 100k tests" but the denominator uses
# the raw cumulative test count — confirm the intended scaling.
data.loc[:, 'cases_per_1M_people_per_100k_tests'] = (data.total_cases_interpolated / ((data.population/1000000.) * (data.tests_cumulative_interpolated))).values
data.loc[:, 'cases_per_1M_people'] = (data.total_cases_interpolated / ((data.population/1000000.))).values

# NOTE(review): this cumsum has no groupby, so it runs over the whole column
# across country boundaries — confirm that is intended. The same column is
# assigned again, with a different formula, at the end of this cell.
data.loc[:, 'cumulative_normalized_case_test_ratio'] = (data.total_cases_interpolated / ((data.population/1000000.) * (data.tests_cumulative_interpolated))).cumsum().apply(np.log)

# NOTE(review): `response_multiindex_df` and
# `multiindex_response_date_to_average_rates` are not defined in the visible
# part of this notebook.
before_minus_after = response_multiindex_df.applymap(multiindex_response_date_to_average_rates).replace(to_replace=0., value=np.nan).sort_index()

# Residual of each entry with respect to its row mean. Broadcasting an (n, 1)
# column of means works for any number of response columns (the column count
# was previously hard-coded as 5).
row_means = before_minus_after.mean(1).values.reshape(-1, 1)
before_minus_after_residual_values = before_minus_after.values - row_means
before_minus_after_residual_df = pd.DataFrame(before_minus_after_residual_values, columns=before_minus_after.columns, index=before_minus_after.index)

# Overwrites the column assigned above: cases/tests ratio, cumulated over the
# whole column, without population scaling or log.
data.loc[:, 'cumulative_normalized_case_test_ratio'] = (data.total_cases_interpolated / data.tests_cumulative_interpolated).cumsum()