In [1]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
registerDoMC(20)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# Root directory that every input and output path below is built from.
fp <- '/pool001/mfzhao/'

# Daily weather per region (columns: key, DATE, PRCP, TMAX).
weather <- read_csv(paste0(fp, 'PROCESSED_DATA/region_weather_2020.csv'))

# Daily mobility per region, EU and US in the same layout
# (columns: key, ds, btvrc, rstu, n).
eu_mobility <- read_csv(paste0(fp, 'PROCESSED_DATA/eu_mobility.csv'))
us_mobility <- read_csv(paste0(fp, 'PROCESSED_DATA/us_mobility.csv'))

# Daily policy indicators per region
# (columns: key, date, cluster, reopening, sdPolicy, stayHome).
policy <- read_csv(paste0(fp, 'PROCESSED_DATA/policy.csv'))

# Pairwise SCI table keyed by user_loc / fr_loc; tab-separated.
sci <- read_delim(
    paste0(fp, 'sci/SCI_NUTS3.tsv'),
    delim = "\t",
    escape_double = FALSE,
    trim_ws = TRUE
)

Parsed with column specification:
cols(
  key = col_character(),
  DATE = col_date(format = ""),
  PRCP = col_double(),
  TMAX = col_double()
)

Parsed with column specification:
cols(
  key = col_character(),
  ds = col_date(format = ""),
  btvrc = col_double(),
  rstu = col_double(),
  n = col_double()
)

Parsed with column specification:
cols(
  key = col_character(),
  ds = col_date(format = ""),
  btvrc = col_double(),
  rstu = col_double(),
  n = col_double()
)

Parsed with column specification:
cols(
  key = col_character(),
  date = col_date(format = ""),
  cluster = col_character(),
  reopening = col_double(),
  sdPolicy = col_double(),
  stayHome = col_double()
)

Parsed with column specification:
cols(
  user_loc = col_character(),
  fr_loc = col_character(),

In [3]:
# Regions whose mobility series has the maximum number of rows inside the
# window [2020-03-01, 2020-06-01), i.e. the regions with the most complete
# coverage.
#
# The rows are counted with an explicit n() instead of tally(): the mobility
# tables already carry a column named `n`, and depending on the dplyr version
# tally() may treat an existing `n` column as a weight (summing it) rather
# than counting rows.
keys_mobility <- us_mobility %>%
    bind_rows(eu_mobility) %>%
    rename(date = ds) %>%
    filter(date >= as.Date('2020-03-01'), date < as.Date('2020-06-01')) %>%
    group_by(key) %>%
    summarize(n_obs = n()) %>%
    ungroup() %>%
    filter(n_obs == max(n_obs)) %>%
    select(key)

# Regions that appear in the policy table at all.
keys_policy <- policy %>%
    select(key) %>%
    distinct()

# Regions whose weather series has no missing PRCP or TMAX on any row:
# start from every weather key and drop those with at least one NA.
keys_weather_incomplete <- weather %>%
    filter(is.na(PRCP) | is.na(TMAX)) %>%
    select(key) %>%
    distinct()

keys_weather <- weather %>%
    select(key) %>%
    distinct() %>%
    anti_join(keys_weather_incomplete, by = 'key')

Joining, by = "key"



In [4]:
# Final region set: keys present in the mobility, policy and weather key
# tables simultaneously (successive inner joins on the shared `key` column).
keys <- Reduce(inner_join, list(keys_mobility, keys_policy, keys_weather))

# Distinct (key, n) pairs from the stacked mobility tables, restricted to the
# final region set.
# NOTE(review): if `n` varies within a key this yields several rows per key,
# which would fan out the later joins on `key` -- confirm `n` is constant
# per region.
population <- bind_rows(us_mobility, eu_mobility) %>%
    distinct(key, n) %>%
    inner_join(keys)

Joining, by = "key"

Joining, by = "key"

Joining, by = "key"



In [5]:
# Harmonise the location codes on both sides of each SCI pair, then keep only
# pairs whose two endpoints are in `keys` and attach `n` for the friend side.
#
# Code clean-up applied identically to user_loc and fr_loc:
#   * remove the first 'USA' substring from the code;
#   * map the UKN0A..UKN0G codes onto UKN10..UKN16.
sci <- sci %>%
    mutate_at(
        vars(user_loc, fr_loc),
        function(loc) {
            recode(
                str_replace(loc, 'USA', ''),
                UKN0A = 'UKN10',
                UKN0B = 'UKN11',
                UKN0C = 'UKN12',
                UKN0D = 'UKN13',
                UKN0E = 'UKN14',
                UKN0F = 'UKN15',
                UKN0G = 'UKN16'
            )
        }
    ) %>%
    inner_join(keys, by = c('user_loc' = 'key')) %>%
    inner_join(keys, by = c('fr_loc' = 'key')) %>%
    inner_join(population, by = c('fr_loc' = 'key')) %>%
    arrange(user_loc, fr_loc)

# Persist the cleaned pair table.
write_csv(sci, paste0(fp, 'PROCESSED_DATA/processed_sci.csv'))

In [6]:
# Build the row-normalised weight matrix WM (rows: user_loc, columns: fr_loc).
#
# For each user_loc the raw weight of a friend region is scaled_sci * n, with
# the self-pair forced to 0; weights are then divided by their within-user_loc
# sum so every row sums to 1.
wm_wide <- sci %>%
    group_by(user_loc) %>%
    mutate(w = ifelse(user_loc == fr_loc, 0, scaled_sci * n),
           w = w / sum(w)) %>%
    ungroup() %>%
    select(key = user_loc, fr_loc, w) %>%
    spread(key = fr_loc, value = w) %>%
    arrange(key)

WM <- wm_wide %>%
    select(-key) %>%
    as.matrix()

# Label the rows from the matrix's own key column rather than from the
# external `keys` table: this keeps labels correct even if `keys` is ordered
# differently from arrange(key) or contains regions absent from `sci`.
rownames(WM) <- wm_wide$key

write_rds(WM, str_c(fp, 'PROCESSED_DATA/WM.RDS'))

In [7]:
# Assemble the region-by-day panel: stacked mobility rows restricted to the
# final key set, joined to policy (on the shared key/date columns) and to
# weather (key, date == DATE), sorted by date and then key.
df <- us_mobility %>%
    bind_rows(eu_mobility) %>%
    rename(date = ds) %>%
    inner_join(keys) %>%
    inner_join(policy) %>%
    inner_join(weather, by = c('key', 'date' = 'DATE')) %>%
    arrange(date, key)

Joining, by = "key"

Joining, by = c("key", "date")



In [8]:
# Weighted average of other regions' values for one panel column.
#
# Arguments:
#   df   panel with `date` and `key` columns plus the column selected by `...`
#   wm   weight matrix; its columns must line up with the region columns
#        produced by spreading `df` on `key`
#        (assumes rows of `wm` are in that same region order -- TODO confirm)
#   ...  the single value column to transform (forwarded to select()/spread())
#
# Returns a plain vector of weighted values ordered by (date, key), so that it
# can be column-bound to a panel sorted with arrange(date, key).
weightedAlters <- function(df, wm, ...) {
    # dates x regions matrix of the selected column
    df %>%
        select(date, key, ...) %>%
        spread(key = key, value = ...) %>%
        ungroup() %>%
        select(-date) %>%
        as.matrix() -> txn_data

    # One row per date, in ascending order
    df %>%
        ungroup() %>%
        select(date) %>%
        distinct() %>%
        arrange(date) -> dates

    # outMatrix[d, i] = sum_j txn_data[d, j] * wm[i, j]
    outMatrix <- tcrossprod(txn_data, wm)
    colnames(outMatrix) <- colnames(txn_data)

    # check.names = FALSE keeps the region codes intact. With the default,
    # data.frame() rewrites non-syntactic names (e.g. "01001" -> "X01001"),
    # which changes the sort order below and misaligns the result with the
    # caller's (date, key) ordering.
    data.frame(dates, outMatrix, check.names = FALSE) %>%
        gather(key = 'key', value = 'value', -date) %>%
        arrange(date, key) -> out_df

    out_df$value
}

In [9]:
# Number of quantile bins used for both precipitation and temperature.
nquantiles <- 20

# Per-row quantile bin of the weather variables.
#   pq: 0 when PRCP is zero, otherwise the ntile of PRCP among non-zero values
#       (zeros are first turned into NA so ntile() leaves them out)
#   tq: ntile of TMAX
weather_quantiles <- df %>%
    ungroup() %>%
    select(date, key, PRCP, TMAX) %>%
    mutate(PRCP = ifelse(PRCP == 0, NA, PRCP)) %>%
    mutate(pq = ifelse(is.na(PRCP), 0, ntile(PRCP, nquantiles)),
           tq = ntile(TMAX, nquantiles))

# Threshold indicators for one quantile level.
#
# Reads the notebook-level `weather_quantiles` table and returns a two-column
# table of 0/1 values: whether pq >= thres and whether tq >= thres. Columns
# are named 'prcpNN' and 'tmaxNN', where NN is `thres` left-padded with '0'
# to width 2.
wq_ind <- function(thres) {
    suffix <- str_pad(thres, 2, pad = '0')
    indicators <- transmute(weather_quantiles,
                            tempname1 = as.numeric(pq >= thres),
                            tempname2 = as.numeric(tq >= thres))
    colnames(indicators) <- str_c(c('prcp', 'tmax'), suffix)
    indicators
}

# Column order wanted in the panel: all prcp indicators, then all tmax ones.
wq_colnames <- c(str_c('prcp', str_pad(seq_len(nquantiles), 2, pad = '0')),
                 str_c('tmax', str_pad(seq_len(nquantiles), 2, pad = '0')))

# Indicator columns for every quantile level, computed in parallel and
# column-bound together.
wq_inds <- foreach(i = seq_len(nquantiles), .combine = cbind) %dopar% wq_ind(i)

# Append the indicators to the panel in the order above, leaving out tmax01.
df <- bind_cols(df, select(wq_inds[wq_colnames], -tmax01))

In [10]:
# Interact the mobility outcomes and weather indicators with each policy
# variable; new columns get the suffixes _sdp, _shp and _rop.
# NOTE(review): contains() ignores case by default, so 'prcp'/'tmax' also
# match the raw PRCP/TMAX columns and PRCP_sdp, TMAX_sdp, ... are created
# (and later alterized). Confirm whether that is intended before tightening
# the selection.
df <- df %>%
    mutate_at(vars(btvrc, rstu, contains('prcp'), contains('tmax')),
              .funs = list(sdp = function(x) x * df$sdPolicy,
                           shp = function(x) x * df$stayHome,
                           rop = function(x) x * df$reopening))

# Every column except identifiers, n, cluster and the raw weather values
# receives a weighted-alter counterpart.
cols_to_alterize <- df %>%
    select(-date, -key, -n, -cluster, -PRCP, -TMAX) %>%
    colnames()

# seq_along() instead of 1:length(): the latter iterates over c(1, 0) when
# the vector is empty.
alters <- foreach(i = seq_along(cols_to_alterize), .combine = cbind) %dopar%
    weightedAlters(df, WM, cols_to_alterize[i])

colnames(alters) <- str_c('alter_', cols_to_alterize)

In [11]:
# Attach the alter_* columns to the panel and interact each of them with the
# region's own policy variables (suffixes _Xego_sdp, _Xego_shp, _Xego_rop).
# The multipliers are read from `df` as it stands before this assignment.
df <- bind_cols(df, as.data.frame(alters)) %>%
    mutate_at(
        .vars = vars(matches('alter')),
        .funs = list(
            Xego_sdp = function(alter) alter * df$sdPolicy,
            Xego_shp = function(alter) alter * df$stayHome,
            Xego_rop = function(alter) alter * df$reopening
        )
    )

In [12]:
# Persist the assembled panel.
write_rds(df, paste0(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))