In [39]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lubridate)
registerDoMC(4)

In [40]:
# Root directory under which most of the input files live.
fp <- '/pool001/mfzhao/'

# Weather tables.
# NOTE(review): `w2019` is read from the very same 2020 file as `w2020`;
# this looks like a copy-paste slip (a 2019 file was presumably intended).
# Confirm the correct path before relying on the *.2019 weather columns.
w2020 <- read_csv(str_c(fp, 'PROCESSED_DATA/region_weather_2020.csv'))
w2019 <- read_csv(str_c(fp, 'PROCESSED_DATA/region_weather_2020.csv'))

# Policy table, tab-separated SCI table, NYT county case counts, and the
# county population estimates (the latter is read from an absolute path
# that is not built from `fp`).
policy <- read_csv(str_c(fp, 'PROCESSED_DATA/policy.csv'))
sci    <- read_delim(str_c(fp, 'sci/SCI_county.tsv'),
                     delim = "\t",
                     escape_double = FALSE,
                     trim_ws = TRUE)
nyt    <- read_csv(str_c(fp, 'nyt_covid/us-counties.csv'))
us_pop <- read_csv('/pool001/mfzhao/geo_data/cc-est2018-alldata.csv')

# Mobility tables; three columns of the 2020 file are forced to double
# instead of relying on readr's type guessing.
usm2019 <- read_csv(str_c(fp, 'safegraph/us_mobility_2019.csv'))
usm2020 <- read_csv(
    str_c(fp, 'safegraph/us_mobility.csv'),
    col_types = cols(mhdt  = col_double(),
                     mnhdt = col_double(),
                     mdtfh = col_double())
)

Parsed with column specification:
cols(
  key = col_character(),
  DATE = col_date(format = ""),
  PRCP = col_double(),
  TMAX = col_double()
)

Parsed with column specification:
cols(
  key = col_character(),
  DATE = col_date(format = ""),
  PRCP = col_double(),
  TMAX = col_double()
)

Parsed with column specification:
cols(
  key = col_character(),
  date = col_date(format = ""),
  p1sdp = col_double(),
  p2shp = col_double(),
  p3rop = col_double(),
  p1sdpDSS = col_double(),
  p2shpDSS = col_double(),
  p3ropDSS = col_double()
)

Parsed with column specification:
cols(
  user_county = col_character(),
  fr_county = col_character(),
  scaled_sci = col_double()
)

Parsed with column specification:
cols(
  date = col_date(format = ""),
  county = col_character(),
  state = [3

In [41]:
# Reduce the raw population file to one row per county: `key` is STATE and
# COUNTY pasted together, `n` is the summed TOT_POP over the rows kept.
# NOTE(review): YEAR == 11 / AGEGRP == 0 presumably select the latest
# estimate and the all-ages total in the Census layout -- confirm against
# the file's documentation.
us_pop <- us_pop %>%
    unite('key', STATE, COUNTY, sep = '') %>%
    filter(YEAR == 11 & AGEGRP == 0) %>%
    group_by(key) %>%
    summarize(n = sum(TOT_POP))

# Build a complete county x day panel (2020-01-01 .. 2020-06-30) of
# cumulative and daily-new case/death counts from the NYT table.
#
# NOTE(review): `keys` is only created in a later cell of this file, so
# this statement depends on that cell having been run first.
#
# expand.grid() varies `key` fastest, so its rows come out ordered by date
# and then key. Filling and differencing on that raw order would carry
# counts from one county into the next. The panel is therefore sorted by
# (key, date) and grouped by county BEFORE fill()/lag(), so that:
#   * missing days inherit the same county's most recent cumulative count;
#   * days before a county's first report become 0;
#   * newcases/newdeaths are within-county day-over-day differences, with
#     the first day of each county set to 0.
nyt <- expand.grid(key = keys$key,
                   date = seq.Date(as.Date('2020-01-01'), as.Date('2020-06-30'), '1 day'),
                   stringsAsFactors = FALSE) %>%
    left_join(nyt %>% rename(key = fips), by = c('key', 'date')) %>%
    select(-county, -state) %>%
    arrange(key, date) %>%
    group_by(key) %>%
    fill(cases, deaths) %>%
    replace_na(list(cases = 0, deaths = 0)) %>%
    mutate(newcases  = cases - lag(cases),
           newdeaths = deaths - lag(deaths)) %>%
    replace_na(list(newcases = 0, newdeaths = 0)) %>%
    ungroup()

# Rename the county identifier to `key` and keep only observations dated
# in the first half of 2020 (2020-01-01 inclusive to 2020-07-01 exclusive).
usm2020 <- usm2020 %>%
    rename(key = origin_county) %>%
    filter(date >= as.Date('2020-01-01') & date < as.Date('2020-07-01'))

# Restrict the 2020 mobility data to counties that
#   * have a key that sorts (as a string) before '57000',
#   * have as many rows as the best-covered county (n == max(n)), and
#   * have a mean device_count above 500.
# The surviving keys are joined back onto the full table to recover all
# of their rows.
usm2020 <- usm2020 %>%
    filter(key < '57000') %>%
    group_by(key) %>%
    summarize(mdc = mean(device_count),
              n   = n()) %>%
    filter(n == max(n) & mdc > 500) %>%
    select(key) %>%
    left_join(usm2020)

Joining, by = c("key", "date")

Joining, by = "key"



In [42]:
# Counties whose mobility series has the maximal number of rows.
keys_mobility <- usm2020 %>%
    count(key) %>%
    filter(n == max(n)) %>%
    select(key)

# Counties that appear in the policy table.
keys_policy <- policy %>%
    distinct(key)

# Counties whose 2020 weather has no missing PRCP or TMAX value: start
# from every weather key and remove those with at least one NA row.
keys_weather <- w2020 %>%
    distinct(key) %>%
    anti_join(w2020 %>%
                  filter(is.na(PRCP) | is.na(TMAX)) %>%
                  distinct(key))

# Counties that appear as `user_county` in the SCI table.
keys_sci <- sci %>%
    transmute(key = user_county) %>%
    distinct()

# The analysis sample is the intersection of the four key sets.
keys <- Reduce(inner_join,
               list(keys_mobility, keys_policy, keys_weather, keys_sci))

write_csv(keys, str_c(fp, 'PROCESSED_DATA/keys.csv'))

Joining, by = "key"

Joining, by = "key"

Joining, by = "key"

Joining, by = "key"



In [43]:
# Attach the county population `n` to every key in the analysis sample.
population <- left_join(keys, us_pop)

Joining, by = "key"



In [44]:
# Keep only SCI pairs whose two endpoints are both in `keys`, attach the
# population `n` of the friend county, and sort by (user_county, fr_county).
sci <- sci %>%
    inner_join(keys, by = c(user_county = 'key')) %>%
    inner_join(keys, by = c(fr_county = 'key')) %>%
    inner_join(population, by = c(fr_county = 'key')) %>%
    arrange(user_county, fr_county)

write_csv(sci, str_c(fp, 'PROCESSED_DATA/processed_sci.csv'))

In [45]:
# Row-normalised weight matrix over counties in OTHER states.
# For each user_county the raw weight of a friend county is
# scaled_sci * n, forced to 0 when both counties share the same first two
# key characters; weights are then divided by their per-user_county sum.
# The long table is spread to wide form (one row per user_county, one
# column per fr_county) and converted to a matrix.
stateWM <- sci %>%
    mutate(user_state = str_sub(user_county, 1, 2),
           fr_state   = str_sub(fr_county, 1, 2)) %>%
    group_by(user_county) %>%
    mutate(w = ifelse(user_state == fr_state, 0, scaled_sci * n)) %>%
    mutate(w = w/sum(w)) %>%
    select(key = user_county, fr_county, w) %>%
    spread(key = fr_county, value = w) %>%
    ungroup() %>%
    arrange(key) %>%
    select(-key) %>%
    as.matrix()

# NOTE(review): row names come from `keys$key`, which assumes `keys` has
# the same length and sort order as the arranged rows above -- confirm.
rownames(stateWM) <- keys$key

write_rds(stateWM, str_c(fp, 'PROCESSED_DATA/stateWM.RDS'))

In [46]:
# Row-normalised weight matrix over all OTHER counties.
# For each user_county the raw weight of a friend county is
# scaled_sci * n, forced to 0 on the diagonal (user_county == fr_county);
# weights are then divided by their per-user_county sum. The long table is
# spread to wide form (one row per user_county, one column per fr_county)
# and converted to a matrix.
WM <- sci %>%
    group_by(user_county) %>%
    mutate(w = ifelse(user_county == fr_county, 0, scaled_sci * n)) %>%
    mutate(w = w/sum(w)) %>%
    select(key = user_county, fr_county, w) %>%
    spread(key = fr_county, value = w) %>%
    ungroup() %>%
    arrange(key) %>%
    select(-key) %>%
    as.matrix()

# NOTE(review): row names come from `keys$key`, which assumes `keys` has
# the same length and sort order as the arranged rows above -- confirm.
rownames(WM) <- keys$key

write_rds(WM, str_c(fp, 'PROCESSED_DATA/WM.RDS'))

In [47]:
# Descriptive summary (printed, not stored): the population-weighted
# average share of each county's total weight (scaled_sci * n) that falls
# on a different county (dcp) and on a different state (dsp), plus the
# complement scp = 1 - dcp.
sci %>%
    mutate(w            = scaled_sci * n,
           other_state  = as.numeric(str_sub(user_county, 1, 2) != str_sub(fr_county, 1, 2)),
           other_county = as.numeric(user_county != fr_county)) %>%
    group_by(user_county) %>%
    summarize(all         = sum(w),
              diff_county = sum(w * other_county),
              diff_state  = sum(w * other_state)) %>%
    mutate(dcp = diff_county/all,
           dsp = diff_state/all) %>%
    # bring in each user county's own population for the weighting
    left_join(us_pop, by = c('user_county' = 'key')) %>%
    summarize(dcp = sum(dcp * n/sum(n)),
              dsp = sum(dsp * n/sum(n))) %>%
    mutate(scp = 1 - dcp)

dcp,dsp,scp
<dbl>,<dbl>,<dbl>
0.625305,0.3447778,0.374695


In [48]:
# Turn the 2019 mobility table into a baseline lookup: keep five measures
# (suffixed with .2019) and replace the calendar date by its month and
# day-of-month, so it can later be matched to the same month/day of
# another year.
usm2019 <- usm2019 %>%
    transmute(key           = origin_county,
              pchd.2019     = pchd,
              mcbgv.2019    = mcbgv,
              plt1hafh.2019 = plt1hafh,
              ppthgt75.2019 = ppthgt75,
              plt2kmt.2019  = plt2kmt,
              month         = month(date),
              day           = day(date))

In [49]:
# Pair each 2020 weather observation with the observation from `w2019`
# that falls on the same month and day-of-month for the same county.
#
# The join key must be the day-of-month, i.e. lubridate::day(). The
# previous code used lubridate::date(), which returns the full calendar
# date, so the "month/day" join could only ever match rows with the exact
# same date (year included). This also makes the cell consistent with the
# month()/day() keys built for the mobility tables.
#
# Output columns: key, date, PRCP, TMAX, PRCP.2019, TMAX.2019. Rows with
# no month/day counterpart in `w2019` keep NA in the *.2019 columns.
weather <- w2020 %>%
    mutate(month = month(DATE),
           day   = day(DATE)) %>%
    left_join(w2019 %>%
                  mutate(month = month(DATE),
                         day   = day(DATE)) %>%
                  select(key, month, day, PRCP.2019 = PRCP, TMAX.2019 = TMAX),
              by = c('key', 'month', 'day')) %>%
    select(key,
           date = DATE,
           PRCP,
           TMAX,
           PRCP.2019,
           TMAX.2019)

Joining, by = c("key", "month", "day")



In [50]:
# Assemble the main panel `df`, sorted by (date, key).
# Steps: restrict 2020 mobility to the analysis keys, attach the 2019
# baseline for the same month/day, derive complements / differences /
# relative changes, drop the helper and raw columns, then join population,
# case counts, policy and weather.
df <- usm2020 %>%
    inner_join(keys) %>%
    mutate(month = month(date),
           day   = day(date)) %>%
    select(-c(mhdt, mnhdt, mdtfh, device_count)) %>%
    left_join(usm2019) %>%
    # complements of the 2020 shares
    mutate(pnchd    = 1 - pchd,
           pgt1hafh = 1 - plt1hafh,
           ppthlt75 = 1 - ppthgt75,
           pgt2kmt  = 1 - plt2kmt) %>%
    # differences relative to the 2019 baseline (for the shares this is
    # the complement in 2020 minus the complement in 2019)
    mutate(dmcbgv    = mcbgv - mcbgv.2019,
           dpnchd    = pchd.2019 - pchd,
           dpgt1hafh = plt1hafh.2019 - plt1hafh,
           dppthlt75 = ppthgt75.2019 - ppthgt75,
           dpgt2kmt  = plt2kmt.2019 - plt2kmt) %>%
    # differences scaled by the 2019 level
    mutate(pcmcbgv    = dmcbgv/mcbgv.2019,
           pcpnchd    = dpnchd/(1 - pchd.2019),
           pcpgt1hafh = dpgt1hafh/(1 - plt1hafh.2019),
           pcppthlt75 = dppthlt75/(1 - ppthgt75.2019),
           pcpgt2kmt  = dpgt2kmt/(1 - plt2kmt.2019)) %>%
    # first two key characters, stored as `cluster`
    mutate(cluster = str_sub(key, 1, 2)) %>%
    select(-matches('2019'),
           -c(month, day, pchd, plt1hafh, ppthgt75, plt2kmt)) %>%
    left_join(us_pop) %>%
    left_join(nyt) %>%
    inner_join(policy) %>%
    inner_join(weather) %>%
    arrange(date, key)

Joining, by = "key"

Joining, by = c("key", "month", "day")

Joining, by = "key"

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")



In [51]:
# Weighted average of one column over other units ("alters").
#
# Arguments:
#   df   data frame with `date`, `key` and the column named in `...`.
#   wm   weight matrix; the date x key value matrix is multiplied by its
#        transpose, so its columns must line up with the spread key
#        columns of `df`.
#   ...  the column to average. It is forwarded both to select() and as
#        spread()'s `value`, so it is meant to be a single column.
#
# Returns a plain vector of weighted values ordered by date and then by
# key column name, i.e. the row order of a `df` arranged by (date, key).
weightedAlters <- function(df, wm, ...) {
    # dates in rows, one column per key
    wide <- spread(select(df, date, key, ...), key = key, value = ...)
    txn_data <- as.matrix(select(ungroup(wide), -date))

    # sorted distinct dates, used to label the rows of the product
    dates <- arrange(distinct(select(ungroup(df), date)), date)

    # txn_data %*% t(wm): for every date, each key's weighted sum
    outMatrix <- tcrossprod(txn_data, wm)
    colnames(outMatrix) <- colnames(txn_data)

    # back to long form and into (date, key) order
    long <- gather(data.frame(dates, outMatrix),
                   key = 'key', value = 'value', -date)
    arrange(long, date, key)$value
}

In [52]:
# Number of quantile bins used for the weather indicators.
nquantiles <- 10

# Quantile bin of every (date, key) weather observation.
#   pq: 0 when PRCP is zero or missing, otherwise the ntile bin of PRCP
#       computed over the non-zero values only (zeros are set to NA first).
#   tq: ntile bin of TMAX.
weather_quantiles <- df %>%
    ungroup() %>%
    select(date, key, PRCP, TMAX) %>%
    mutate(PRCP = ifelse(PRCP == 0, NA, PRCP)) %>%
    mutate(pq = ifelse(is.na(PRCP), 0, ntile(PRCP, nquantiles)),
           tq = ntile(TMAX, nquantiles))

# Build the two 0/1 indicator columns for one quantile threshold.
#
# thres: quantile bin number. Reads the global `weather_quantiles`.
# Returns a two-column table named prcpNN / tmaxNN (NN = zero-padded
# `thres`) flagging rows whose pq / tq is at least `thres`.
wq_ind <- function(thres) {
    suffix <- str_pad(thres, 2, pad = '0')
    indicators <- transmute(weather_quantiles,
                            tempname1 = as.numeric(pq >= thres),
                            tempname2 = as.numeric(tq >= thres))
    colnames(indicators) <- c(str_c('prcp', suffix), str_c('tmax', suffix))
    indicators
}

# Compute the indicator pairs for every threshold in parallel and bind
# them column-wise.
wq_inds <- foreach(i = seq_len(nquantiles), .combine = cbind) %dopar% wq_ind(i)

# Desired column order: all prcpNN columns first, then all tmaxNN columns.
wq_colnames <- c(
    str_c('prcp', str_pad(seq_len(nquantiles), 2, pad = '0')),
    str_c('tmax', str_pad(seq_len(nquantiles), 2, pad = '0'))
)

# Append the indicators to the panel, leaving out tmax01.
df <- bind_cols(df, select(wq_inds[wq_colnames], -tmax01))

In [53]:
# Interactive check: list the column names of the assembled panel.
colnames(df)

In [54]:
# Columns for which weighted alter averages are computed: everything in
# `df` except the identifiers, population, cluster, raw weather and the
# case/death counts.
cols_to_alterize <- df %>%
    select(-date, -key, -n, -cluster, -PRCP, -TMAX, -cases, -deaths, -newcases, -newdeaths) %>%
    colnames()

# One weighted-average column per variable, computed in parallel and bound
# column-wise. seq_along() is used instead of 1:length(), which would
# iterate over c(1, 0) if the vector were empty.

# Alters weighted by the other-state matrix.
stalters <- foreach(i = seq_along(cols_to_alterize), .combine = cbind) %dopar%
    weightedAlters(df, stateWM, cols_to_alterize[i])

colnames(stalters) <- str_c('stalter_', cols_to_alterize)

# Alters weighted by the other-county matrix.
alters <- foreach(i = seq_along(cols_to_alterize), .combine = cbind) %dopar%
    weightedAlters(df, WM, cols_to_alterize[i])

colnames(alters) <- str_c('alter_', cols_to_alterize)

In [55]:
# Interactive check: print the matrix of alter-weighted columns.
alters

alter_mcbgv,alter_pnchd,alter_pgt1hafh,alter_ppthlt75,alter_pgt2kmt,alter_dmcbgv,alter_dpnchd,alter_dpgt1hafh,alter_dppthlt75,alter_dpgt2kmt,⋯,alter_prcp10,alter_tmax02,alter_tmax03,alter_tmax04,alter_tmax05,alter_tmax06,alter_tmax07,alter_tmax08,alter_tmax09,alter_tmax10
1.998818,0.7047019,0.5485605,0.4141353,0.5630406,0.2423616,0.10923403,0.10120806,0.05274232,0.09212496,⋯,0.0022245137,0.9648748,0.9061953,0.8239653,0.49851349,0.04211042,0.02185548,0.008898822,1.151914e-04,0
2.014065,0.7014929,0.5431404,0.4100810,0.5574901,0.2440773,0.10399833,0.09396900,0.04554050,0.08466238,⋯,0.0023849651,0.9480845,0.8663424,0.7543050,0.47068713,0.05930290,0.03248584,0.013761769,1.437627e-04,0
2.013181,0.7043634,0.5488217,0.4164940,0.5598970,0.2413669,0.10382511,0.09529413,0.04728147,0.08732241,⋯,0.0015241326,0.9738966,0.9194807,0.8620717,0.66319389,0.09229663,0.04831229,0.023829338,2.194432e-04,0
2.027810,0.7121535,0.5599532,0.4199651,0.5671353,0.1983817,0.09432794,0.08831763,0.04461387,0.07773661,⋯,0.0011750046,0.9718485,0.9257636,0.8615122,0.11142473,0.02358802,0.01272191,0.005017031,6.117472e-05,0
2.033342,0.7092890,0.5554161,0.4111104,0.5627014,0.2162743,0.10120651,0.09566798,0.04970057,0.08537299,⋯,0.0012991760,0.9746632,0.9342707,0.8309971,0.10997040,0.03149994,0.01675844,0.006240214,8.276371e-05,0
2.001196,0.7049243,0.5511940,0.4243884,0.5632688,0.2481794,0.10570836,0.09675422,0.04312525,0.09039146,⋯,0.0008828710,0.9713340,0.9249730,0.8741380,0.58540525,0.05481671,0.02952599,0.013313115,3.876399e-05,0
1.956986,0.7060130,0.5532295,0.4241607,0.5603099,0.2320824,0.10541741,0.09875439,0.05490099,0.09495532,⋯,0.0013389936,0.9743532,0.9254166,0.8695817,0.65903147,0.04488536,0.02249674,0.011277561,8.940125e-05,0
2.018302,0.7084706,0.5532078,0.4090100,0.5591623,0.2395901,0.10697019,0.09830839,0.05037326,0.08631047,⋯,0.0017753205,0.9668814,0.9068292,0.7975690,0.15004191,0.04145702,0.02207095,0.008632811,8.157989e-05,0
2.042101,0.7116461,0.5539357,0.4158316,0.5732945,0.2550841,0.10625734,0.09239597,0.03609919,0.08361879,⋯,0.0012809181,0.9792393,0.9380333,0.8897423,0.20012575,0.04125946,0.01609283,0.007366657,9.134664e-05,0
2.027172,0.7085310,0.5505440,0.4062444,0.5594610,0.2346636,0.10786228,0.09634226,0.05254274,0.08373962,⋯,0.0016349438,0.9676176,0.9142492,0.7121781,0.10219243,0.03479562,0.01979396,0.008067497,9.604313e-05,0


In [56]:
# Append the stalter_* and alter_* columns to the panel and add natural
# logs of the five level measures for the county itself, its stalter
# average and its alter average.
df <- bind_cols(df, as.data.frame(stalters), as.data.frame(alters)) %>%
    # own values
    mutate(log_mcbgv    = log(mcbgv),
           log_pnchd    = log(pnchd),
           log_pgt1hafh = log(pgt1hafh),
           log_ppthlt75 = log(ppthlt75),
           log_pgt2kmt  = log(pgt2kmt)) %>%
    # stalter_* averages
    mutate(log_stalter_mcbgv    = log(stalter_mcbgv),
           log_stalter_pnchd    = log(stalter_pnchd),
           log_stalter_pgt1hafh = log(stalter_pgt1hafh),
           log_stalter_ppthlt75 = log(stalter_ppthlt75),
           log_stalter_pgt2kmt  = log(stalter_pgt2kmt)) %>%
    # alter_* averages
    mutate(log_alter_mcbgv    = log(alter_mcbgv),
           log_alter_pnchd    = log(alter_pnchd),
           log_alter_pgt1hafh = log(alter_pgt1hafh),
           log_alter_ppthlt75 = log(alter_ppthlt75),
           log_alter_pgt2kmt  = log(alter_pgt2kmt))

In [57]:
# Persist the assembled panel as an RDS file under PROCESSED_DATA.
write_rds(df, str_c(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))