In [1]:
import covidcast as cvc
from datetime import date
import numpy as np
import pandas as pd
import seaborn as sns

In [22]:
# Relative scratch directory where each CSV is first written by `to_csv`.
TMP_DATA_DIR = '../../data'
# Destination the CSVs are moved into with `sudo mv` after being written.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = '/home/ubuntu/data/covid'

The date range (2021-03-01 to 2021-08-31) was chosen for feature availability and by eyeballing https://www.google.com/search?q=us+covid+cases

### % of population with Covid-like symptoms in each location daily

In [3]:
# Fetch the fb-survey `smoothed_wcli` signal at state level for the study window.
wcli = cvc.signal(
    'fb-survey',
    'smoothed_wcli',
    start_day=date(2021, 3, 1),
    end_day=date(2021, 8, 31),
    geo_type='state',
)

In [8]:
# (rows, columns) of the raw fetch.
wcli.shape

(9384, 13)

In [9]:
# Preview the first rows to see the raw column layout returned by the API.
wcli.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size,geo_type,data_source
0,ak,smoothed_wcli,2021-03-01,2021-03-17,16,0,0,0,0.724784,0.356963,861.0,state,fb-survey
1,al,smoothed_wcli,2021-03-01,2021-03-17,16,0,0,0,1.138372,0.173658,3902.1086,state,fb-survey
2,ar,smoothed_wcli,2021-03-01,2021-03-17,16,0,0,0,0.821564,0.177104,2677.1034,state,fb-survey
3,az,smoothed_wcli,2021-03-01,2021-03-17,16,0,0,0,0.911744,0.140218,5322.3944,state,fb-survey
4,ca,smoothed_wcli,2021-03-01,2021-03-17,16,0,0,0,0.803879,0.085038,23757.1667,state,fb-survey


In [10]:
# Summary statistics of the numeric columns; used to spot columns that never vary.
wcli.describe()

Unnamed: 0,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size
count,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0
mean,5.358696,0.0,0.0,0.0,0.984274,0.249275,4022.70114
std,1.622494,0.0,0.0,0.0,0.470248,0.151787,3922.892059
min,4.0,0.0,0.0,0.0,0.0,0.057986,200.0
25%,5.0,0.0,0.0,0.0,0.684568,0.146317,1380.041525
50%,5.0,0.0,0.0,0.0,0.892789,0.208331,2933.00555
75%,5.0,0.0,0.0,0.0,1.176518,0.298594,4955.0128
max,16.0,0.0,0.0,0.0,3.858207,1.437439,23996.8272


Columns missing_value, missing_stderr, missing_sample_size do not vary (all observations have non-missing value, stderr, and sample_size), so drop them, along with signal, geo_type, and data_source.

In [12]:
# Remove the constant / bookkeeping columns.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: re-running it is a no-op rather than a KeyError on the
# already-dropped labels.
wcli = wcli.drop(
    columns=['signal', 'missing_value', 'missing_stderr', 'missing_sample_size', 'geo_type', 'data_source'],
    errors='ignore',
)

In [15]:
# Distribution of row counts per location; a std of 0 means every location
# contributes the same number of observations.
wcli['geo_value'].value_counts().describe()

count     51.0
mean     184.0
std        0.0
min      184.0
25%      184.0
50%      184.0
75%      184.0
max      184.0
Name: geo_value, dtype: float64

In [16]:
# Upper-case the location codes (e.g. 'ak' -> 'AK').
wcli['geo_value'] = wcli['geo_value'].str.upper()

In [18]:
# Give the generic `value` column a signal-specific name.
wcli.rename({'value': 'covidlike_pct'}, axis='columns', inplace=True)

In [68]:
# Check the cleaned layout: upper-cased geo_value and the renamed value column.
wcli.head()

Unnamed: 0,geo_value,time_value,issue,lag,covidlike_pct,stderr,sample_size
0,AK,2021-03-01,2021-03-17,16,0.724784,0.356963,861.0
1,AL,2021-03-01,2021-03-17,16,1.138372,0.173658,3902.1086
2,AR,2021-03-01,2021-03-17,16,0.821564,0.177104,2677.1034
3,AZ,2021-03-01,2021-03-17,16,0.911744,0.140218,5322.3944
4,CA,2021-03-01,2021-03-17,16,0.803879,0.085038,23757.1667


In [70]:
wcli.to_csv(f'{TMP_DATA_DIR}/covidlike_pct.csv', index=False, mode='a')
!sudo mv $TMP_DATA_DIR/covidlike_pct.csv $DATA_DIR

### Number of new confirmed COVID-19 cases per 100,000 population, daily (7-day average)

In [25]:
# Fetch the indicator-combination `confirmed_7dav_incidence_prop` signal at
# state level for the study window.
cases = cvc.signal(
    'indicator-combination',
    'confirmed_7dav_incidence_prop',
    start_day=date(2021, 3, 1),
    end_day=date(2021, 8, 31),
    geo_type='state',
)

In [27]:
# `cases` is already bound by the fetch cell above. The former
# `cases = cases_msa` / `del cases_msa` rebinding relied on a variable that is
# not defined anywhere in the visible notebook, so it raised NameError on a
# fresh kernel. Instead, verify the property the stale name put in doubt:
# the frame holds state-level rows.
assert cases['geo_type'].eq('state').all(), 'expected state-level case data'

In [28]:
# (rows, columns) of the raw fetch.
cases.shape

(9568, 13)

In [29]:
# Preview the first rows to see the raw column layout returned by the API.
cases.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size,geo_type,data_source
0,ak,confirmed_7dav_incidence_prop,2021-03-01,2021-09-17,200,0,5,5,17.028539,,,state,indicator-combination
1,al,confirmed_7dav_incidence_prop,2021-03-01,2021-03-11,10,0,5,5,18.445736,,,state,indicator-combination
2,ar,confirmed_7dav_incidence_prop,2021-03-01,2021-03-11,10,0,5,5,32.246722,,,state,indicator-combination
3,az,confirmed_7dav_incidence_prop,2021-03-01,2021-03-11,10,0,5,5,19.33818,,,state,indicator-combination
4,ca,confirmed_7dav_incidence_prop,2021-03-01,2021-09-17,200,0,5,5,8.701467,,,state,indicator-combination


In [30]:
# Summary statistics of the numeric columns; used to spot columns that never vary.
# NOTE(review): the recorded output shows a negative minimum for `value`;
# confirm whether negative incidence should be cleaned before downstream use.
cases.describe()

Unnamed: 0,lag,missing_value,missing_stderr,missing_sample_size,value
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,13.834239,0.0,5.0,5.0,16.043639
std,36.357908,0.0,0.0,0.0,16.536474
min,1.0,0.0,5.0,5.0,-19.254748
25%,3.0,0.0,5.0,5.0,5.731109
50%,3.0,0.0,5.0,5.0,11.583911
75%,4.0,0.0,5.0,5.0,20.164474
max,200.0,0.0,5.0,5.0,145.883091


Columns missing_value, missing_stderr, missing_sample_size do not vary (every observation has a value, but stderr and sample_size are missing for all of them), so drop them, along with signal, stderr, sample_size, geo_type, and data_source.

In [31]:
# Keep only the informative columns.
# .copy() makes the result an independent frame, so the later column
# assignment (`cases['geo_value'] = ...`) does not write into a slice of the
# raw fetch and does not trigger SettingWithCopyWarning.
cases = cases[['geo_value', 'time_value', 'issue', 'lag', 'value']].copy()

In [43]:
# Distribution of row counts per location; a std of 0 means every location
# contributes the same number of observations.
# NOTE(review): the recorded output lists 52 locations here, one more than the
# fb-survey frames — check before joining the datasets.
cases['geo_value'].value_counts().describe()

count     52.0
mean     184.0
std        0.0
min      184.0
25%      184.0
50%      184.0
75%      184.0
max      184.0
Name: geo_value, dtype: float64

In [32]:
# Upper-case the location codes (e.g. 'ak' -> 'AK').
cases['geo_value'] = cases['geo_value'].str.upper()

In [34]:
# Give the generic `value` column a signal-specific name.
cases.rename({'value': 'cases_per_100k'}, axis='columns', inplace=True)

In [71]:
# Check the cleaned layout: upper-cased geo_value and the renamed value column.
cases.head()

Unnamed: 0,geo_value,time_value,issue,lag,cases_per_100k
0,AK,2021-03-01,2021-09-17,200,17.028539
1,AL,2021-03-01,2021-03-11,10,18.445736
2,AR,2021-03-01,2021-03-11,10,32.246722
3,AZ,2021-03-01,2021-03-11,10,19.33818
4,CA,2021-03-01,2021-09-17,200,8.701467


In [72]:
cases.to_csv(f'{TMP_DATA_DIR}/cases_per_100k.csv', index=False, mode='a')
!sudo mv $TMP_DATA_DIR/cases_per_100k.csv $DATA_DIR

### Estimated percentage of people who wore a mask most or all the time in public in the past 7 days

In [37]:
# Fetch the fb-survey `smoothed_wwearing_mask_7d` signal at state level for
# the study window.
mask_pct = cvc.signal(
    'fb-survey',
    'smoothed_wwearing_mask_7d',
    start_day=date(2021, 3, 1),
    end_day=date(2021, 8, 31),
    geo_type='state',
)

In [38]:
# (rows, columns) of the raw fetch.
mask_pct.shape

(9384, 13)

In [39]:
# Preview the first rows to see the raw column layout returned by the API.
mask_pct.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size,geo_type,data_source
0,ak,smoothed_wwearing_mask_7d,2021-03-01,2021-03-17,16,0,0,0,86.797651,1.242732,742.0,state,fb-survey
1,al,smoothed_wwearing_mask_7d,2021-03-01,2021-03-17,16,0,0,0,85.811901,0.616718,3201.0899,state,fb-survey
2,ar,smoothed_wwearing_mask_7d,2021-03-01,2021-03-17,16,0,0,0,86.941514,0.719849,2190.9745,state,fb-survey
3,az,smoothed_wwearing_mask_7d,2021-03-01,2021-03-17,16,0,0,0,90.620926,0.431496,4564.9329,state,fb-survey
4,ca,smoothed_wwearing_mask_7d,2021-03-01,2021-03-17,16,0,0,0,94.937897,0.154485,20137.1609,state,fb-survey


In [40]:
# Summary statistics of the numeric columns; used to spot columns that never vary.
mask_pct.describe()

Unnamed: 0,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size
count,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0
mean,5.358696,0.0,0.0,0.0,63.442985,0.950789,3526.210814
std,1.622494,0.0,0.0,0.0,24.246039,0.541497,3402.599578
min,4.0,0.0,0.0,0.0,9.975303,0.154485,183.0
25%,5.0,0.0,0.0,0.0,42.273835,0.574058,1216.1785
50%,5.0,0.0,0.0,0.0,67.520953,0.819932,2609.06935
75%,5.0,0.0,0.0,0.0,85.708092,1.181569,4387.206675
max,16.0,0.0,0.0,0.0,99.28274,3.658712,20316.8214


Columns missing_value, missing_stderr, missing_sample_size do not vary (all observations have non-missing value, stderr, and sample_size), so drop them, along with signal, geo_type, and data_source.

In [41]:
# Remove the constant / bookkeeping columns.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: re-running it is a no-op rather than a KeyError on the
# already-dropped labels.
mask_pct = mask_pct.drop(
    columns=['signal', 'missing_value', 'missing_stderr', 'missing_sample_size', 'geo_type', 'data_source'],
    errors='ignore',
)

In [42]:
# Distribution of row counts per location; a std of 0 means every location
# contributes the same number of observations.
mask_pct['geo_value'].value_counts().describe()

count     51.0
mean     184.0
std        0.0
min      184.0
25%      184.0
50%      184.0
75%      184.0
max      184.0
Name: geo_value, dtype: float64

In [44]:
# Upper-case the location codes (e.g. 'ak' -> 'AK').
mask_pct['geo_value'] = mask_pct['geo_value'].str.upper()

In [45]:
# Give the generic `value` column a signal-specific name.
mask_pct.rename({'value': 'mask_pct'}, axis='columns', inplace=True)

In [46]:
# Check the cleaned layout: upper-cased geo_value and the renamed value column.
mask_pct.head()

Unnamed: 0,geo_value,time_value,issue,lag,mask_pct,stderr,sample_size
0,AK,2021-03-01,2021-03-17,16,86.797651,1.242732,742.0
1,AL,2021-03-01,2021-03-17,16,85.811901,0.616718,3201.0899
2,AR,2021-03-01,2021-03-17,16,86.941514,0.719849,2190.9745
3,AZ,2021-03-01,2021-03-17,16,90.620926,0.431496,4564.9329
4,CA,2021-03-01,2021-03-17,16,94.937897,0.154485,20137.1609


In [47]:
mask_pct.to_csv(f'{TMP_DATA_DIR}/mask_pct.csv', index=False, mode='a', header=False)
!sudo mv $TMP_DATA_DIR/mask_pct.csv $DATA_DIR

### Estimated percentage of respondents who have already received a vaccine for COVID-19

In [51]:
# Fetch the fb-survey `smoothed_wcovid_vaccinated` signal at state level for
# the study window.
vaxxed_pct = cvc.signal(
    'fb-survey',
    'smoothed_wcovid_vaccinated',
    start_day=date(2021, 3, 1),
    end_day=date(2021, 8, 31),
    geo_type='state',
)

In [62]:
# Number of rows in the raw fetch.
vaxxed_pct.shape[0]

9384

In [53]:
# Preview the first rows to see the raw column layout returned by the API.
vaxxed_pct.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size,geo_type,data_source
0,ak,smoothed_wcovid_vaccinated,2021-03-01,2021-03-17,16,0,0,0,50.664508,1.746987,819.0,state,fb-survey
1,al,smoothed_wcovid_vaccinated,2021-03-01,2021-03-17,16,0,0,0,26.766438,0.73413,3637.0975,state,fb-survey
2,ar,smoothed_wcovid_vaccinated,2021-03-01,2021-03-17,16,0,0,0,29.856103,0.910662,2525.2762,state,fb-survey
3,az,smoothed_wcovid_vaccinated,2021-03-01,2021-03-17,16,0,0,0,33.563791,0.663872,5059.4867,state,fb-survey
4,ca,smoothed_wcovid_vaccinated,2021-03-01,2021-03-17,16,0,0,0,30.726484,0.307949,22445.1667,state,fb-survey


In [54]:
# Summary statistics of the numeric columns; used to spot columns that never vary.
vaxxed_pct.describe()

Unnamed: 0,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size
count,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0,9384.0
mean,5.358696,0.0,0.0,0.0,71.109949,0.93881,3824.472892
std,1.622494,0.0,0.0,0.0,14.733289,0.4599,3717.889997
min,4.0,0.0,0.0,0.0,24.170998,0.248197,186.0
25%,5.0,0.0,0.0,0.0,66.415695,0.607661,1312.0829
50%,5.0,0.0,0.0,0.0,74.870675,0.811923,2795.4292
75%,5.0,0.0,0.0,0.0,81.288461,1.17712,4712.221225
max,16.0,0.0,0.0,0.0,96.388247,2.997974,22750.8272


Columns missing_value, missing_stderr, missing_sample_size do not vary (all observations have non-missing value, stderr, and sample_size), so drop them, along with signal, geo_type, and data_source.

In [55]:
# Remove the constant / bookkeeping columns.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: re-running it is a no-op rather than a KeyError on the
# already-dropped labels.
vaxxed_pct = vaxxed_pct.drop(
    columns=['signal', 'missing_value', 'missing_stderr', 'missing_sample_size', 'geo_type', 'data_source'],
    errors='ignore',
)

In [56]:
# Distribution of row counts per location; a std of 0 means every location
# contributes the same number of observations.
vaxxed_pct['geo_value'].value_counts().describe()

count     51.0
mean     184.0
std        0.0
min      184.0
25%      184.0
50%      184.0
75%      184.0
max      184.0
Name: geo_value, dtype: float64

In [57]:
# Upper-case the location codes (e.g. 'ak' -> 'AK').
vaxxed_pct['geo_value'] = vaxxed_pct['geo_value'].str.upper()

In [58]:
# Give the generic `value` column a signal-specific name.
vaxxed_pct.rename({'value': 'vaxxed_pct'}, axis='columns', inplace=True)

In [66]:
# Check the cleaned layout: upper-cased geo_value and the renamed value column.
vaxxed_pct.head()

Unnamed: 0,geo_value,time_value,issue,lag,vaxxed_pct,stderr,sample_size
0,AK,2021-03-01,2021-03-17,16,50.664508,1.746987,819.0
1,AL,2021-03-01,2021-03-17,16,26.766438,0.73413,3637.0975
2,AR,2021-03-01,2021-03-17,16,29.856103,0.910662,2525.2762
3,AZ,2021-03-01,2021-03-17,16,33.563791,0.663872,5059.4867
4,CA,2021-03-01,2021-03-17,16,30.726484,0.307949,22445.1667


In [67]:
vaxxed_pct.to_csv(f'{TMP_DATA_DIR}/vaxxed_pct.csv', index=False, mode='a', header=False)
!sudo mv $TMP_DATA_DIR/vaxxed_pct.csv $DATA_DIR

### Covid-related Google search volume
Summed volumes of anosmia- and ageusia-related Google searches, in arbitrary units normalized for overall search users, smoothed by a 7-day average.

In [73]:
# Fetch the google-symptoms `sum_anosmia_ageusia_smoothed_search` signal at
# state level for the study window.
covid_rel_search_vols = cvc.signal(
    'google-symptoms',
    'sum_anosmia_ageusia_smoothed_search',
    start_day=date(2021, 3, 1),
    end_day=date(2021, 8, 31),
    geo_type='state',
)



In [76]:
# (rows, columns) of the raw fetch.
covid_rel_search_vols.shape

(6418, 13)

In [77]:
# Preview the first rows to see the raw column layout returned by the API.
covid_rel_search_vols.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,missing_value,missing_stderr,missing_sample_size,value,stderr,sample_size,geo_type,data_source
0,al,sum_anosmia_ageusia_smoothed_search,2021-03-01,2021-03-25,24,0,5,5,0.348571,,,state,google-symptoms
1,ar,sum_anosmia_ageusia_smoothed_search,2021-03-01,2021-03-25,24,0,5,5,0.274286,,,state,google-symptoms
2,az,sum_anosmia_ageusia_smoothed_search,2021-03-01,2021-03-25,24,0,5,5,0.282857,,,state,google-symptoms
3,ca,sum_anosmia_ageusia_smoothed_search,2021-03-01,2021-03-25,24,0,5,5,0.244286,,,state,google-symptoms
4,co,sum_anosmia_ageusia_smoothed_search,2021-03-01,2021-03-25,24,0,5,5,0.28,,,state,google-symptoms


In [78]:
# Summary statistics of the numeric columns; used to spot columns that never vary.
covid_rel_search_vols.describe()

Unnamed: 0,lag,missing_value,missing_stderr,missing_sample_size,value
count,6418.0,6418.0,6418.0,6418.0,6418.0
mean,8.596136,0.0,5.0,5.0,0.303798
std,6.181797,0.0,0.0,0.0,0.134923
min,3.0,0.0,5.0,5.0,0.084286
25%,4.0,0.0,5.0,5.0,0.222857
50%,4.0,0.0,5.0,5.0,0.274286
75%,15.0,0.0,5.0,5.0,0.331429
max,24.0,0.0,5.0,5.0,1.061429


Columns missing_value, missing_stderr, missing_sample_size do not vary (every observation has a value, and none has stderr or sample_size), so drop them, along with signal, stderr, sample_size, geo_type, and data_source.

In [80]:
# Keep only the informative columns.
# .copy() makes the result an independent frame, so the later column
# assignment (`covid_rel_search_vols['geo_value'] = ...`) does not write into
# a slice of the raw fetch and does not trigger SettingWithCopyWarning.
covid_rel_search_vols = covid_rel_search_vols[['geo_value', 'time_value', 'issue', 'lag', 'value']].copy()

In [84]:
# Rows available per state; describe() summarises how uneven the coverage is.
state_obs_counts = covid_rel_search_vols['geo_value'].value_counts()
state_obs_counts.describe()

count     43.000000
mean     149.255814
std       60.485769
min        1.000000
25%      156.000000
50%      181.000000
75%      181.000000
max      181.000000
Name: geo_value, dtype: float64

In [85]:
# How many states have fewer rows than 181, the maximum per-state count
# reported by describe() above.
state_obs_counts.lt(181).sum()

13

In [86]:
# Upper-case the location codes (e.g. 'al' -> 'AL').
covid_rel_search_vols['geo_value'] = covid_rel_search_vols['geo_value'].str.upper()

In [87]:
# Give the generic `value` column a signal-specific name.
covid_rel_search_vols.rename({'value': 'search_vol'}, axis='columns', inplace=True)

In [88]:
# Check the cleaned layout: upper-cased geo_value and the renamed value column.
covid_rel_search_vols.head()

Unnamed: 0,geo_value,time_value,issue,lag,search_vol
0,AL,2021-03-01,2021-03-25,24,0.348571
1,AR,2021-03-01,2021-03-25,24,0.274286
2,AZ,2021-03-01,2021-03-25,24,0.282857
3,CA,2021-03-01,2021-03-25,24,0.244286
4,CO,2021-03-01,2021-03-25,24,0.28


In [89]:
covid_rel_search_vols.to_csv(f'{TMP_DATA_DIR}/covid_rel_search_vols.csv', index=False, mode='a', header=False)
!sudo mv $TMP_DATA_DIR/covid_rel_search_vols.csv $DATA_DIR