In [4]:
# Tidyverse helpers: data-frame verbs, string utilities, RDS I/O
library(dplyr)
library(stringr)
library(readr)
library(tidyr)
# felm() for the fixed-effects residualisation, cv.glmnet() for the lasso
library(lfe)
library(glmnet)
# Multicore backend for the foreach / %dopar% loops below.
# NOTE(review): foreach is never attached explicitly; presumably it is pulled
# in as a doMC dependency -- confirm.
library(doMC)
# NOTE(review): the worker count is hard-coded to 28; confirm it suits the host.
registerDoMC(28)

In [5]:
# Root directory under which all inputs and outputs of this script live.
fp <- '/pool001/mfzhao/'

# Main panel: supplies the instrument columns, outcomes, weights (n) and
# fold ids used further down.
df.r <- str_c(fp, 'PROCESSED_DATA/panel_xgr.RDS') %>%
    read_rds()

# NOTE(review): `folds` and `residualizer_df` are loaded here but are not
# referenced again in the visible part of this script -- confirm whether they
# are still needed.
folds <- str_c(fp, 'PROCESSED_DATA/folds.RDS') %>%
    read_rds()
residualizer_df <- str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS') %>%
    read_rds()

In [6]:
# Collect every df.r column whose name matches `pattern` and return them as a
# single parenthesised formula term: "(col1 + col2 + ...)".
collect_instruments <- function(pattern) {
    panel_cols <- colnames(df.r)
    matched    <- panel_cols[str_detect(panel_cols, pattern)]
    str_c('(', str_c(matched, collapse = ' + '), ')', collapse = '')
}

# One term per weather variable (prcp / tmax) and per column family:
# the plain columns plus the _sdp, _shp and _rop variants.
instruments_prcp     <- collect_instruments('alter_prcp..\\.r')
instruments_tmax     <- collect_instruments('alter_tmax..\\.r')
instruments_prcp_sdp <- collect_instruments('alter_prcp.._sdp\\.r')
instruments_tmax_sdp <- collect_instruments('alter_tmax.._sdp\\.r')
instruments_prcp_shp <- collect_instruments('alter_prcp.._shp\\.r')
instruments_tmax_shp <- collect_instruments('alter_tmax.._shp\\.r')
instruments_prcp_rop <- collect_instruments('alter_prcp.._rop\\.r')
instruments_tmax_rop <- collect_instruments('alter_tmax.._rop\\.r')

# Within each family, cross prcp with tmax (main effects and all pairwise
# interactions); the four families are then added together.
instruments <- str_c(
    str_c(instruments_prcp,     instruments_tmax,     sep = ' * '),
    str_c(instruments_prcp_sdp, instruments_tmax_sdp, sep = ' * '),
    str_c(instruments_prcp_shp, instruments_tmax_shp, sep = ' * '),
    str_c(instruments_prcp_rop, instruments_tmax_rop, sep = ' * '),
    sep = ' + '
)

In [7]:
# Design matrix of all candidate instruments (no intercept column).
X.r <- model.matrix(
    as.formula(str_c('btvrc.r ~ 0 + ', instruments)),
    data = df.r
)

# Columns carried into the residualisation step: the key and date identifiers,
# the three policy columns and the weight column n.
residualizer_df2 <- select(df.r, key, date, sdPolicy.r, stayHome.r, reopening.r, n)

# Residualise a single variable on key and date fixed effects.
#
# Fits a felm() model of Y on the key + date fixed effects only (no other
# regressors), weighted by column n of the global `residualizer_df2`, and
# returns the residuals.
#
# Args:
#   Y:       vector with one value per row of `residualizer_df2`.
#   colname: name to give the single output column.
#
# Returns:
#   A one-column data.frame of residuals named `colname`.
exogVar_residualizer <- function(Y, colname) {
    # Fail fast instead of letting mutate() silently recycle a length-one input
    stopifnot(length(Y) == nrow(residualizer_df2))

    temp_df <- mutate(residualizer_df2, Y = Y)

    fit <- felm(Y ~ 0 | key + date, temp_df, weights = temp_df$n)

    # Use the full element name: `fit$resid` only worked through `$` partial
    # matching on the felm object's `residuals` element.
    out <- data.frame(fit$residuals)
    colnames(out)[1] <- colname
    out
}
# Residualise every instrument column in parallel and bind the results
# column-wise. seq_len() keeps the loop empty (instead of iterating over
# c(1, 0)) should X.r ever have zero columns.
X.r2 <- foreach(i = seq_len(ncol(X.r)), .combine = cbind) %dopar% {
    exogVar_residualizer(X.r[, i], colnames(X.r)[i])
}

In [8]:
# The two outcome columns that the instruments have to predict.
DVs <- select(df.r, alter_rstu.r, alter_btvrc.r)

# Residualise each outcome exactly as the instruments were residualised and
# bind the results column-wise.
Y <- foreach(dv = colnames(DVs), .combine = cbind) %dopar% {
    exogVar_residualizer(DVs[[dv]], dv)
}

In [9]:
# Running cross-validated lasso, to select optimal lambda (based on CV prediction performance)
# cv.glmnet() takes numeric matrices, so convert the residualised data frames first.
X.r2 <- as.matrix(X.r2)
Y    <- as.matrix(Y)


# Multi-response Gaussian model (family = 'mgaussian') with a pure lasso
# penalty (alpha = 1), no intercept and no internal standardisation.
# Observations are weighted by df.r$n; CV folds are taken from df.r$fold, and
# parallel = TRUE runs the folds on the registered foreach backend.
# NOTE(review): the `folds` object loaded earlier is not used here. Confirm
# that df.r really has a `fold` column -- if `df.r$fold` evaluated to NULL,
# cv.glmnet would fall back to its own random fold assignment.
cvlasso <- cv.glmnet(X.r2, Y, 
                     intercept = FALSE,
                     family = 'mgaussian',
                     alpha = 1,
                     weights = df.r$n,
                     foldid = df.r$fold,
                     standardize = FALSE,
                     parallel = TRUE,
                     nlambda = 100)

# Extracting selected instruments
# Coefficients at lambda.1se; [[1]] keeps the coefficient matrix of the first
# response only.
# NOTE(review): presumably the non-zero pattern is shared across responses
# under 'mgaussian', which would make the first response sufficient -- confirm.
fs.lasso.coefs <- coef(cvlasso, s = cvlasso$lambda.1se)[[1]]
# Row positions and row names of the non-zero coefficients.
selected.cols  <- which(fs.lasso.coefs != 0)
selected.names <- rownames(fs.lasso.coefs)[selected.cols]

In [10]:
# Persist the fitted cross-validation object under MODELS/.
cvlasso %>%
    write_rds(str_c(fp, 'MODELS/cvlasso.RDS'))

In [11]:
# Rebuild the lasso-selected instruments for one suffix variant of the
# weather columns.
#
# The instrument formula is reconstructed from the df.r columns carrying
# `suffix` just before the trailing ".r". The columns corresponding to the
# global `selected.names` are then extracted from the resulting model matrix
# and renamed iv001, iv002, ... followed by the suffix with "_Xego" removed.
#
# For lag suffixes (containing "_l1" .. "_l7") the model matrix has fewer rows
# than df.r, so the rows are re-expanded to the full key x date grid and the
# missing rows are filled with NA.
# NOTE(review): this re-expansion assumes df.r is a balanced panel ordered by
# date and, within date, by key in unique(df.r$key) order, and that the rows
# missing from the model matrix are exactly the first `lag` days -- confirm.
#
# Relies on the globals `df.r` and `selected.names`.
#
# Args:
#   suffix:      column-name suffix, e.g. "", "_Xego_sdp", "_l3", "_Xego_rop_l2".
#   panel_start: first date of the panel (default 2020-03-01).
#   panel_end:   last date of the panel (default 2020-05-31).
#
# Returns:
#   A data.frame with one column per selected instrument.
gen_instruments_with_suffix <- function(suffix,
                                        panel_start = as.Date('2020-03-01'),
                                        panel_end = as.Date('2020-05-31')) {
    all_cols <- colnames(df.r)

    # "(col1 + col2 + ...)" for every column matching <stem><suffix>.r
    column_group <- function(stem) {
        matched <- all_cols[str_detect(all_cols, str_c(stem, suffix, "\\.r"))]
        str_c("(", str_c(matched, collapse = " + "), ")", collapse = "")
    }

    instruments <- str_c(column_group("alter_prcp.."),     " * ", column_group("alter_tmax.."),     " + ",
                         column_group("alter_prcp.._sdp"), " * ", column_group("alter_tmax.._sdp"), " + ",
                         column_group("alter_prcp.._shp"), " * ", column_group("alter_tmax.._shp"), " + ",
                         column_group("alter_prcp.._rop"), " * ", column_group("alter_tmax.._rop"))

    X.r <- model.matrix(as.formula(str_c("btvrc.r ~ 0 + ", instruments)), data = df.r)

    # Map the selected base names onto their suffixed counterparts
    selected.suffix <- str_replace_all(selected.names, "\\.r", str_c(suffix, "\\.r"))
    iv <- as.data.frame(X.r) %>%
        select(all_of(selected.suffix))
    # seq_along() keeps this valid when the lasso selected nothing
    colnames(iv) <- str_c("iv", str_pad(seq_along(selected.names), 3, pad = "0"), str_replace(suffix, "_Xego", ""))

    if (!str_detect(suffix, '_l[1-7]')) {
        return(iv)
    }

    keys     <- unique(df.r$key)
    # The lag length is the final character of the suffix
    lag_days <- as.numeric(str_sub(suffix, -1, -1))

    # Identifiers for the rows present in the model matrix (dates from
    # panel_start + lag onwards) ...
    observed <- data.frame(key = keys,
                           date = rep(seq.Date(panel_start + lag_days, panel_end, 'day'), each = length(keys)),
                           stringsAsFactors = FALSE)
    # ... and for the complete panel.
    full_grid <- data.frame(key = keys,
                            date = rep(seq.Date(panel_start, panel_end, 'day'), each = length(keys)),
                            stringsAsFactors = FALSE)

    # Explicit join keys: same join as before, without the "Joining, by" message
    full_grid %>%
        left_join(bind_cols(iv, observed), by = c("key", "date")) %>%
        select(-key, -date)
}

In [12]:
# Every combination of column family ('' or one of the _Xego_* variants) and
# lag ('' or _l1 .. _l7), pasted together into a single `suffix` column.
suffixes <- expand.grid(
    X1 = c('', '_Xego_sdp', '_Xego_shp', '_Xego_rop'),
    X2 = c('', str_c('_l', 1:7)),
    stringsAsFactors = FALSE
) %>%
    transmute(suffix = str_c(X1, X2))

In [13]:
# Build the selected instruments for each suffix in turn (sequentially) and
# bind the per-suffix results column-wise.
iv <- foreach(suffix = suffixes$suffix, .combine = cbind) %do% {
    gen_instruments_with_suffix(suffix)
}

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")

Joining, by = c("key", "date")



In [18]:
# Drop the raw alter_prcp.. / alter_tmax.. style columns (anything matching the
# pattern below) and append the generated iv columns in their place.
panel <- bind_cols(
    select(df.r, -matches('alter_[pt][rm][ca][px]..')),
    iv
)

In [21]:
# Persist the panel with the generated instrument columns under PROCESSED_DATA/.
panel %>%
    write_rds(str_c(fp, 'PROCESSED_DATA/panel_xgr_with_instruments.RDS'))