In [1]:
library(readr)
library(tidyr)
library(stringr)
library(dplyr)
library(lubridate)
library(USAboundaries)
library(sf)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘lubridate’


The following objects are masked from ‘package:dplyr’:

    intersect, setdiff, union


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union


Linking to GEOS 3.5.2, GDAL 2.4.2, PROJ 4.8.0



In [2]:
# Root directory that every input (and the later output file) is resolved
# against; the trailing slash is required because paths are built by plain
# string concatenation.
fp <- '/pool001/mfzhao/'

# Inputs whose column types are left to readr's guessing.
keys   <- read_csv(paste0(fp, 'PROCESSED_DATA/keys.csv'))
policy <- read_csv(paste0(fp, 'PROCESSED_DATA/policy.csv'))
df     <- read_csv(paste0(fp, 'safegraph/dyadic2020.csv'))
us_pop <- read_csv(paste0(fp, 'geo_data/cc-est2018-alldata.csv'))
panel  <- read_rds(paste0(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))

# Mobility file: three columns are pinned to double rather than guessed;
# all remaining columns are still guessed by readr.
usm <- read_csv(
    paste0(fp, 'safegraph/us_mobility.csv'),
    col_types = cols(mhdt  = col_double(),
                     mnhdt = col_double(),
                     mdtfh = col_double())
)

Parsed with column specification:
cols(
  key = col_character()
)

Parsed with column specification:
cols(
  key = col_character(),
  date = col_date(format = ""),
  p1sdp = col_double(),
  p2shp = col_double(),
  p3rop = col_double(),
  p1sdpDSS = col_double(),
  p2shpDSS = col_double(),
  p3ropDSS = col_double()
)

Parsed with column specification:
cols(
  date = col_date(format = ""),
  origin_county = col_double(),
  destination_county = col_double(),
  n = col_double()
)

Parsed with column specification:
cols(
  .default = col_double(),
  SUMLEV = col_character(),
  STATE = col_character(),
  COUNTY = col_character(),
  STNAME = col_character(),
  CTYNAME = col_character()
)

See spec(...) for full column specifications.



In [3]:
# County geometries (sf object) from the USAboundaries package.
county_sf <- us_counties()

# One row per polygon vertex, labelled with the county key
# (state FIPS + county FIPS). `L3` is the row position of each county in
# `county_sf` and is matched against the `L3` column returned by
# st_coordinates(); the key is now spelled out instead of being picked up by
# a natural join.
# NOTE(review): presumably `L3` is the feature index for these geometries --
# confirm against the sf documentation for st_coordinates().
as.data.frame(county_sf) %>%
    select(-geometry) %>%
    mutate(L3 = row_number(),
           key = str_c(statefp, countyfp)) %>%
    left_join(as.data.frame(st_coordinates(county_sf)), by = 'L3') %>%
    select(key, X, Y) -> coords

# Two different counties count as bordering when they share at least one
# vertex with exactly equal X and Y. Vertices are de-duplicated on both sides
# before the self-join: the set of resulting county pairs is unchanged (the
# pairs were already passed through distinct()), but the many-to-many join
# produces far fewer intermediate rows.
distinct(coords) %>%
    inner_join(distinct(coords), by = c('X', 'Y')) %>%
    filter(key.x != key.y) %>%
    select(origin_county = key.x,
           destination_county = key.y) %>%
    distinct() %>%
    mutate(bordering = 1) %>%
    arrange(origin_county, destination_county) -> bordering

# County centroids, sorted by key so rows and columns of the distance matrix
# below line up with `centroids$key`.
# NOTE(review): sf warns here that st_centroid() is not correct for
# longitude/latitude data; the computation is left as-is so existing results
# do not change -- consider projecting before taking centroids.
county_sf %>%
    st_centroid() %>%
    as.data.frame() %>%
    mutate(key = str_c(statefp, countyfp)) %>%
    select(key, geometry) %>%
    arrange(key) -> centroids

# Full centroid-to-centroid distance matrix; columns are named by county key.
distm <- data.frame(st_distance(centroids$geometry, centroids$geometry))
colnames(distm) <- centroids$key

# Reshape to long format (one row per origin/destination pair), divide the
# distance by 1000 (presumably metres -> kilometres; confirm the CRS units),
# and keep only pairs whose origin and destination both appear in `keys`.
distm %>%
    mutate(origin_county = centroids$key) %>%
    gather(key = 'destination_county', value = 'dist', -origin_county) %>%
    mutate(dist = as.numeric(dist)/1000) %>% 
    inner_join(keys, by = c('origin_county' = 'key')) %>%
    inner_join(keys, by = c('destination_county' = 'key')) -> distm

# Collapse the census table to one population figure `n` per county key.
# NOTE(review): YEAR == 11 and AGEGRP == 0 presumably select the latest
# estimate and the all-ages row -- confirm against the census file layout.
us_pop %>%
    unite('key', STATE, COUNTY, sep = '') %>%
    filter(YEAR == 11, 
           AGEGRP==0) %>% 
    group_by(key) %>%
    summarize(n = sum(TOT_POP)) -> us_pop

# Daily device counts for the first half of 2020, restricted to counties
# present in `keys` (join key made explicit).
usm %>%
    rename(key = origin_county) %>%
    filter(date >= as.Date('2020-01-01'), date < as.Date('2020-07-01')) %>%
    inner_join(keys, by = 'key') %>%
    select(date, key, device_count) -> dc

Joining, by = "L3"

“st_centroid assumes attributes are constant over geometries of x”
“st_centroid does not give correct centroids for longitude/latitude data”
Joining, by = "key"



In [4]:
# Restrict the dyadic flows to 2020-01-01 .. 2020-06-30, turn the numeric
# county codes into zero-padded 5-character keys, keep only dyads whose
# origin and destination are both in `keys`, then attach the origin's daily
# device count and the bordering indicator.
df %>%
    filter(date >= as.Date('2020-01-01'), date < as.Date('2020-07-01')) %>%
    mutate(origin_county      = str_pad(origin_county, 5, pad = '0'),
           destination_county = str_pad(destination_county, 5, pad = '0')) %>%
    inner_join(keys, by = c('origin_county' = 'key')) %>%
    inner_join(keys, by = c('destination_county' = 'key')) %>%
    left_join(dc, by = c('date', 'origin_county' = 'key')) %>%
    # Explicit key: a natural join would silently change if `bordering`
    # ever gained another column that `df` also has.
    left_join(bordering, by = c('origin_county', 'destination_county')) %>%
    mutate(pod = n/device_count,                      # flow per origin device
           bordering = replace_na(bordering, 0)) -> df  # unmatched dyads -> 0

# Per-dyad totals: summed flow and the number of rows (days) observed.
# summarize() only drops the last grouping variable, so without ungroup() the
# result stays grouped by origin_county and later whole-table computations
# such as max(days) would silently be evaluated per origin county.
df %>%
    group_by(origin_county, destination_county) %>%
    summarize(n = sum(n), days = n()) %>%
    ungroup() -> dyad_stats

Joining, by = c("origin_county", "destination_county")



In [5]:
# Total flow carried by dyads with a given number of observed days, ordered
# from the largest total down, with the running share of all flow (`ecdf`).
# Only the first six rows are shown.
dyad_stats %>%
    group_by(days) %>%
    summarise(n = sum(n)) %>%
    arrange(-n) %>%
    mutate(ecdf = cumsum(n) / sum(n)) %>%
    head(n = 6L)

days,n,ecdf
<int>,<dbl>,<dbl>
182,6798526481,0.9622046
181,18312672,0.9647964
180,10535607,0.9662875
179,8054580,0.9674275
178,6265824,0.9683143
177,5177413,0.9690471


In [6]:
# Same running-share table as the previous cell, but with self-dyads
# (origin equal to destination) removed first. Only the first six rows are
# shown.
dyad_stats %>%
    filter(origin_county != destination_county) %>%
    group_by(days) %>%
    summarise(n = sum(n)) %>%
    arrange(-n) %>%
    mutate(ecdf = cumsum(n) / sum(n)) %>%
    head(n = 6L)

days,n,ecdf
<int>,<dbl>,<dbl>
182,1182397934,0.8157595
181,18312672,0.8283938
180,10535607,0.8356625
179,8054580,0.8412195
178,6265824,0.8455424
177,5177413,0.8491144


In [7]:
# Daily weather columns per county, used later for origin and destination.
panel %>%
    select(date, key, PRCP, TMAX) -> weather

# Keep only the dyads observed on the maximum number of days, attach the
# three policy indicators for the origin (op*) and the destination (dp*),
# and drop self-dyads.
df %>%
    # ungroup() first: dyad_stats comes out of a grouped summarize(), and on a
    # grouped table max(days) is computed per origin county rather than over
    # the whole table, which would let incompletely observed dyads through.
    inner_join(dyad_stats %>%
               ungroup() %>%
               filter(days == max(days)) %>%
               select(origin_county, destination_county),
               by = c('origin_county', 'destination_county')) %>%
    inner_join(policy %>% 
               select(date,
                      origin_county = key,
                      op1 = p1sdp,
                      op2 = p2shp,
                      op3 = p3rop),
               by = c('date', 'origin_county')) %>%
    inner_join(policy %>% 
               select(date,
                      destination_county = key,
                      dp1 = p1sdp,
                      dp2 = p2shp,
                      dp3 = p3rop),
               by = c('date', 'destination_county')) %>% 
    filter(origin_county != destination_county) -> df

Joining, by = c("origin_county", "destination_county")

Joining, by = c("date", "origin_county")

Joining, by = c("date", "destination_county")



In [10]:
# Display the long-format origin/destination distance table.
distm

origin_county,destination_county,dist
<chr>,<chr>,<dbl>
01001,01001,0.00000
01003,01001,225.19072
01005,01001,139.85889
01007,01001,67.91306
01009,01001,160.26703
01011,01001,100.24078
01013,01001,87.19157
01015,01001,156.86416
01017,01001,124.59170
01019,01001,205.86304


In [12]:
# Assemble the final dyadic frame: rename the flow measures, build a dyad id
# and two cluster ids (first two characters of each county key), attach
# weather for both ends, the origin's population and the centroid distance,
# then keep the modelling columns. Every join key is explicit so that a
# column later added to one of the lookup tables cannot silently change how
# rows are matched.
df %>%
    rename(ndotd = n, pdotd = pod) %>%
    mutate(dyad = str_c(origin_county, '->', destination_county),
           origin_cluster = str_sub(origin_county, 1, 2), 
           destination_cluster = str_sub(destination_county, 1, 2)) %>%
    left_join(weather %>%
              rename(origin_county = key,
                     oPRCP = PRCP,
                     oTMAX = TMAX),
              by = c('date', 'origin_county')) %>%
    left_join(weather %>%
              rename(destination_county = key,
                     dPRCP = PRCP,
                     dTMAX = TMAX),
              by = c('date', 'destination_county')) %>%
    # The flow count was renamed to ndotd above, so the `n` selected below is
    # the column brought in from us_pop.
    left_join(us_pop, by = c('origin_county' = 'key')) %>%
    left_join(distm, by = c('origin_county', 'destination_county')) %>%
    select(date, dyad, origin_cluster, destination_cluster, n, bordering, ndotd, pdotd, dist,
           op1, op2, op3, 
           dp1, dp2, dp3, 
           oPRCP, oTMAX,
           dPRCP, dTMAX) -> df

Joining, by = c("date", "origin_county")

Joining, by = c("date", "destination_county")

Joining, by = c("origin_county", "destination_county")



In [13]:
# Replace the six raw policy columns with the design-matrix columns for
# (op1 + op2 + op3) * (dp1 + dp2 + dp3) without an intercept, i.e. the six
# main effects plus the nine origin-by-destination interactions.
# The formula is one-sided: the design matrix does not depend on a response,
# and naming `n` as one made model.matrix() drop every row where `n` is NA,
# which then breaks bind_cols() on a row-count mismatch.
df %>%
    select(-op1, -op2, -op3, -dp1, -dp2, -dp3) %>%
    bind_cols(as.data.frame(
        model.matrix(~ 0 + (op1 + op2 + op3) * (dp1 + dp2 + dp3), df)
    )) -> df

In [14]:
# Write the assembled frame to CSV under the project root.
data.table::fwrite(
    x    = df,
    file = str_c(fp, 'PROCESSED_DATA/dyadic_mvmt_2020.csv')
)

In [None]:
# Display the current contents of df.
df

In [None]:
# Regression of log(ood) on D_o and D_d with dyad and date fixed effects,
# no instruments, and standard errors clustered by origin and destination
# cluster (felm formula parts: covariates | fixed effects | IV | clusters).
# NOTE(review): ood, D_o and D_d are not created in the visible cells, and
# lfe is only attached further down -- confirm before running.
fit1 <- felm(log(ood) ~ D_o + D_d | dyad + date | 0 | origin_cluster + destination_cluster, df)
# Summarise the model fitted above: the original summarised `fit`, a
# different object, and `$coefs` is not a prefix of `coefficients`, so it
# returned NULL.
summary(fit1)$coefficients

In [None]:
# Same specification as the unweighted D_o + D_d model, with observations
# weighted by df$n.
# NOTE(review): ood, D_o and D_d are not created in the visible cells --
# confirm before running.
fit2 <- felm(log(ood) ~ D_o + D_d | dyad + date | 0 | origin_cluster + destination_cluster, df, weights = df$n)
# Summarise the model fitted above (the original summarised `fit`, which is
# a different object).
summary(fit2)

In [None]:
# log(ood) on D with dyad and date fixed effects, no instruments, and
# two-way clustering on origin and destination cluster.
fit <- felm(
    log(ood) ~ D | dyad + date | 0 | origin_cluster + destination_cluster,
    data = df
)
summary(fit)

In [None]:
# Count per level of D and the share of each count in the total.
# NOTE(review): D is not created in the visible cells; also, depending on the
# dplyr version, tally() may sum an existing `n` column instead of counting
# rows -- confirm which is intended.
df %>%
    group_by(D) %>%
    tally() %>%
    mutate(prop = n/sum(n))

In [None]:
# Same count-and-share table per level of D, restricted to rows flagged as
# bordering.
# NOTE(review): D is not created in the visible cells; depending on the dplyr
# version, tally() may sum an existing `n` column instead of counting rows.
df %>%
    filter(bordering == 1) %>%
    group_by(D) %>%
    tally() %>%
    mutate(prop = n/sum(n))

In [None]:
# log(ood) on D, estimated only on rows flagged as bordering; dyad and date
# fixed effects, no instruments, two-way clustering.
fit <- felm(
    log(ood) ~ D | dyad + date | 0 | origin_cluster + destination_cluster,
    data = filter(df, bordering == 1)
)
summary(fit)

In [None]:
# Heat map of the estimates in the current `fit`: one tile per coefficient,
# placed by two 2-character codes cut out of the coefficient name.
# The coefficient table is read through its full name `coefficients`; the
# original `$coef` only worked through `$` partial matching.
# NOTE(review): positions 3-4 and 7-8 presumably hold the origin and
# destination policy codes inside each coefficient name -- confirm against
# the levels of D. ggplot2 is not attached in the visible cells.
as.data.frame(summary(fit)$coefficients) %>%
    mutate(D = rownames(.), 
           origin_policy = str_sub(D, 3, 4),
           destination_policy = str_sub(D, 7, 8)) %>%
    ggplot(aes(x = origin_policy, y = destination_policy, fill = Estimate)) + 
    geom_tile() +
    scale_fill_distiller()

In [None]:
# pod (in levels) on D with dyad and date fixed effects, no instruments, and
# two-way clustering on origin and destination cluster.
fit <- felm(
    pod ~ D | dyad + date | 0 | origin_cluster + destination_cluster,
    data = df
)
summary(fit)

In [None]:
# pod (in levels) on D, same fixed effects and clustering as the unweighted
# fit, with observations weighted by df$n.
fit <- felm(
    pod ~ D | dyad + date | 0 | origin_cluster + destination_cluster,
    data = df,
    weights = df$n
)
summary(fit)

In [None]:
# log(pod) on D_o and D_d with dyad and date fixed effects, no instruments,
# and two-way clustering on origin and destination cluster.
fit <- felm(
    log(pod) ~ D_o + D_d | dyad + date | 0 | origin_cluster + destination_cluster,
    data = df
)
summary(fit)

In [None]:
# log(pod) on D_o and D_d, same fixed effects and clustering as the
# unweighted fit, with observations weighted by df$n.
fit <- felm(
    log(pod) ~ D_o + D_d | dyad + date | 0 | origin_cluster + destination_cluster,
    data = df,
    weights = df$n
)
summary(fit)

In [None]:
library(lfe)

In [None]:
# All origin x destination combinations present in df. unique() is used
# because the inputs are plain vectors; dplyr::distinct() only accepts data
# frames and fails here.
expand.grid(unique(df$origin_county), unique(df$destination_county))

# Flows on 2020-02-01, with n and pod each divided by their standard
# deviation, set up for an origin-by-destination plot.
# NOTE(review): no geom layer is added, so this draws empty axes only; the
# y aesthetic previously named a misspelled, non-existent column.
df %>%
    filter(date == as.Date('2020-02-01')) %>%
    mutate(ndotd = n/sd(n),
           pdotd = pod/sd(pod)) %>%
    select(date, origin_county, destination_county, ndotd, pdotd) %>%
    ggplot(aes(x = origin_county, y = destination_county))

In [None]:
# Complete origin x destination grid for 2020-02-01: dyads with no recorded
# flow that day get pod = 0 and n = 0, then both measures are divided by
# their standard deviation over the full grid.
expand.grid(origin_county = unique(df$origin_county),
            destination_county = unique(df$destination_county),
            stringsAsFactors = FALSE) %>%   # FALSE, not the reassignable F
    left_join(df %>%
              filter(date == as.Date('2020-02-01')),
              by = c('origin_county', 'destination_county')) %>%
    # Every matched row carries this one date, so set it directly. The
    # previous fill(date) only filled downwards and left NA dates on any
    # unmatched rows that came before the first matched row.
    mutate(date = as.Date('2020-02-01')) %>%
    replace_na(list(pod = 0, n = 0)) %>%
    mutate(ndotd = n/sd(n),
           pdotd = pod/sd(pod)) -> d21

In [None]:
library(urbnmapr)

# County map from urbnmapr, requested as an sf object.
counties_sf <- get_urbn_map(sf = TRUE, map = "counties")

In [None]:
# Choropleth of log(n) for flows leaving county 36061 on 2020-04-15, drawn
# on the urbnmapr county map; counties with no matching row are grey.
# NOTE(review): ggplot2 and scale_fill_viridis() are not attached in the
# visible cells, and df needs origin_county / destination_county / n columns
# -- confirm before running.
options(repr.plot.width=5.28, repr.plot.height=3.21)
counties_sf %>%
    left_join(df %>%
              filter(origin_county == '36061', date == as.Date('2020-4-15')), 
              by = c('county_fips' = 'destination_county')) %>%
    ggplot(aes(fill = log(n))) +
    geom_sf(size = .1) +
    scale_fill_viridis(na.value = 'grey80', limits = c(0, 12)) +  
    xlab('') + 
    ylab('') +
    labs(color = "Outcome") +
    theme_light() +
    theme(text = element_text(size=15),
          strip.text.y = element_blank(),
          # 'none' is the documented spelling; the capitalised 'None' is not
          # a documented value and is rejected by recent ggplot2 releases.
          legend.position = 'none',
          # Strip every panel and axis decoration so only the map remains.
          panel.border=element_blank(),
          panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),
          axis.ticks=element_blank(),
          axis.text=element_blank(),
          axis.title=element_blank(),
          plot.margin= grid::unit(c(0, 0, 0, 0), "in"))

In [None]:
# Choropleth of log(n) for flows arriving in county 36061 on 2020-04-15,
# coloured by origin county on the urbnmapr county map; counties with no
# matching row are grey.
# NOTE(review): ggplot2 and scale_fill_viridis() are not attached in the
# visible cells, and df needs origin_county / destination_county / n columns
# -- confirm before running.
options(repr.plot.width=5.28, repr.plot.height=3.21)
counties_sf %>%
    left_join(df %>%
              filter(destination_county == '36061', date == as.Date('2020-4-15')), 
              by = c('county_fips' = 'origin_county')) %>%
    ggplot(aes(fill = log(n))) +
    geom_sf(size = .1) +
    scale_fill_viridis(na.value = 'grey80', limits = c(0, 12)) +  
    xlab('') + 
    ylab('') +
    labs(color = "Outcome") +
    theme_light() +
    theme(text = element_text(size=15),
          strip.text.y = element_blank(),
          # 'none' is the documented spelling; the capitalised 'None' is not
          # a documented value and is rejected by recent ggplot2 releases.
          legend.position = 'none',
          # Strip every panel and axis decoration so only the map remains.
          panel.border=element_blank(),
          panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),
          axis.ticks=element_blank(),
          axis.text=element_blank(),
          axis.title=element_blank(),
          plot.margin= grid::unit(c(0, 0, 0, 0), "in"))

In [None]:
# Largest n among rows whose origin is county 36061 on 2020-02-29.
df %>%
    filter(origin_county == '36061' & date == as.Date('2020-2-29')) %>%
    summarise(max(n))

In [None]:
# Natural logarithm of 80308.
# NOTE(review): presumably a check of the fill-scale limits used in the maps
# above -- confirm.
log(80308)

In [None]:
# NOTE(review): no object named `dyad` is assigned in the visible cells (the
# name only appears as a column of df), so this presumably fails with
# "object 'dyad' not found" -- confirm or remove.
dyad