In [8]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(20)

In [9]:
# Root directory of the project data; it ends with a slash because the file
# paths below are built by plain string concatenation.
fp <- '/pool001/mfzhao/'

# Load the panel that the rest of this notebook transforms.
df <- fp %>%
    str_c('PROCESSED_DATA/panel_pre_xgr.RDS') %>%
    read_rds()

In [10]:
# For each of the six policy columns, add ten lagged copies (suffix _l01.._l10)
# and ten lead copies (suffix _m01.._m10), computed within `key` after sorting
# by date. The shift functions are generated instead of being written out one
# by one; mutate_at() names the new columns <column>_<list name>.
#
# Shifting leaves NAs at the edges of every key's series: leads are NA at the
# end and are filled downward, lags are NA at the start and are filled upward.
# `df` is still grouped by `key` at that point, so fill() works within a key.
df <- df %>%
    arrange(key, date) %>%
    group_by(key) %>%
    mutate_at(
        vars(p1sdp, p2shp, p3rop, stalter_p1sdp, stalter_p2shp, stalter_p3rop),
        .funs = c(
            setNames(
                lapply(1:10, function(k) {
                    force(k)
                    function(x) lag(x, k)
                }),
                sprintf('l%02d', 1:10)
            ),
            setNames(
                lapply(1:10, function(k) {
                    force(k)
                    function(x) lead(x, k)
                }),
                sprintf('m%02d', 1:10)
            )
        )
    ) %>%
    fill(matches("m[01][0-9]"), .direction = 'down') %>%
    fill(matches("l[01][0-9]"), .direction = 'up')

In [11]:
# Fix the RNG seed so the key-to-fold assignment is reproducible.
set.seed(2345)

# Randomly assign every distinct `key` to one of three folds: each key gets a
# random permutation rank `i`, and `i %% 3 + 1` maps the ranks onto folds 1..3.
# Every row of a given key therefore ends up in the same fold.
folds <- df %>%
    ungroup() %>%
    select(key) %>%
    distinct() %>%
    mutate(i = sample(seq_len(n()), n(), replace = FALSE),
           fold = i %% 3 + 1) %>%
    select(-i)

# Attach the fold label to every panel row and put the panel in (date, key)
# order. The join key is spelled out so the join cannot silently pick up any
# other column name the two tables might share (and no "Joining, by" message
# is emitted).
df <- df %>%
    inner_join(folds, by = "key") %>%
    arrange(date, key) %>%
    ungroup()

# Residualize each control on key and date fixed effects (felm with only the
# `| key + date` part, weighted by `n`) and keep the residuals together with
# the identifiers, the fold label and the weight.
# NOTE(review): the two `*_state` columns below are computed from `newcases` /
# `newdeaths`, so they duplicate newcases.r_fe / newdeaths.r_fe. This looks
# like a copy-paste slip; if state-level series exist in `df` they should
# presumably be used here -- confirm the column names before changing it.
residualizer_df <- df %>%
    ungroup() %>%
    mutate(PRCP.r_fe      = felm(PRCP ~ 0 | key + date, ., weights = df$n)$residuals,
           TMAX.r_fe      = felm(TMAX ~ 0 | key + date, ., weights = df$n)$residuals,
           newcases.r_fe  = felm(newcases ~ 0 | key + date, ., weights = df$n)$residuals,
           newdeaths.r_fe = felm(newdeaths ~ 0 | key + date, ., weights = df$n)$residuals,
           newcases_state.r_fe  = felm(newcases ~ 0 | key + date, ., weights = df$n)$residuals,
           newdeaths_state.r_fe = felm(newdeaths ~ 0 | key + date, ., weights = df$n)$residuals) %>%
    select(date,
           key,
           PRCP.r_fe,
           TMAX.r_fe,
           newcases.r_fe,
           newdeaths.r_fe,
           newcases_state.r_fe,
           newdeaths_state.r_fe,
           fold,
           n)

# Turn the fold table into a list of index vectors, one per fold.
# NOTE(review): these indices are positions in the one-row-per-key fold table,
# not row numbers of `df`; confirm that is what readers of folds.RDS expect.
folds <- list(which(folds$fold %in% 1),
              which(folds$fold %in% 2),
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/folds.RDS'))
write_rds(residualizer_df, str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS'))

Joining, by = "key"



In [15]:
# XGresidualizer: cross-fitted XGBoost residualization of one outcome column.
#
# Steps:
#   1. Remove key and date fixed effects from `Y` (felm, weighted by `n`).
#   2. Choose the number of boosting rounds with xgb.cv on the pre-assigned
#      folds (at most 100 rounds, early stopping after 3 rounds).
#   3. For every fold, train on the remaining folds and predict the held-out
#      rows.
#   4. Return the fixed-effect residual of `Y` minus its out-of-fold prediction.
#
# Arguments:
#   Y        outcome values, one per row of the global `residualizer_df` and in
#            the same row order.
#   colname  name of the outcome; printed as a progress marker and used to
#            name the returned column `<colname>.r`.
#   nthread  number of threads handed to xgboost (default 20, the value that
#            used to be hard-coded).
#
# Returns a one-column tibble named `<colname>.r` with one row per row of
# `residualizer_df`, in that same row order. The column is a plain numeric
# vector (it used to be a one-column matrix inherited from felm's residuals).
#
# Reads the global `residualizer_df` (columns key, date, fold, n and the six
# `*.r_fe` features).
#
# Changes relative to the previous version:
#   * Observation weights are attached to the xgb.DMatrix objects. They used to
#     be passed as `weight =` to xgb.cv() / xgb.train(), where extra arguments
#     are folded into the booster parameter list rather than used as
#     per-row weights.
#   * Out-of-fold predictions are written back by row position instead of being
#     re-sorted by (date, key) and assumed to line up with the input rows.
XGresidualizer <- function(Y, colname, nthread = 20) {
    print(colname)
    stopifnot(NROW(Y) == nrow(residualizer_df))

    # Partial out key and date fixed effects from the outcome.
    temp_df <- residualizer_df %>%
        ungroup() %>%
        mutate(Y = Y,
               Y.r = felm(Y ~ 0 | key + date, ., weights = residualizer_df$n)$residuals)

    y_resid <- as.numeric(temp_df$Y.r)
    obs_wt  <- temp_df$n

    # Row indices of each fold, taken from the fold labels present in the data.
    fold_ids <- sort(unique(temp_df$fold))
    stopifnot(length(fold_ids) >= 2)
    folds <- lapply(fold_ids, function(f) which(temp_df$fold %in% f))

    X <- model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe + newcases.r_fe + newdeaths.r_fe +
                        newcases_state.r_fe + newdeaths_state.r_fe, temp_df)
    # model.matrix() drops incomplete rows; everything below indexes X by the
    # row positions of temp_df, so the two must agree.
    stopifnot(nrow(X) == nrow(temp_df))

    dm    <- xgb.DMatrix(data = X, label = y_resid, weight = obs_wt)
    # NOTE(review): `silent` and 'reg:linear' are the spellings this notebook
    # was written with; newer xgboost releases may warn about them -- confirm
    # against the installed version before renaming.
    param <- list(max_depth = 2, eta = .5, silent = 1, objective = 'reg:linear')

    cv_fit <- xgb.cv(params = param,
                     data = dm,
                     folds = folds,
                     nrounds = 100,
                     early_stopping_rounds = 3,
                     nthread = nthread)
    best_n <- cv_fit$best_iteration

    # Cross-fitting: each row is predicted by a model that never saw its fold.
    pred <- rep(NA_real_, nrow(temp_df))
    for (f in fold_ids) {
        held_out <- temp_df$fold %in% f

        train_dm <- xgb.DMatrix(data = X[!held_out, , drop = FALSE],
                                label = y_resid[!held_out],
                                weight = obs_wt[!held_out])
        fold_fit <- xgb.train(params = param,
                              data = train_dm,
                              nrounds = best_n,
                              nthread = nthread)

        test_dm <- xgb.DMatrix(data = X[held_out, , drop = FALSE],
                               label = y_resid[held_out])
        pred[held_out] <- predict(fold_fit, newdata = test_dm)
    }

    out <- tibble(resid = y_resid - pred)
    colnames(out) <- str_c(colname, '.r')
    return(out)
}

In [16]:
# Names of the columns that will be residualized: every column of `df` that
# survives the exclusions below (raw case/death counts, the listed name
# patterns, identifiers, weight, fold and the weather columns).
cols_to_xgr <- df %>%
    select(
        -cases,
        -newcases,
        -deaths,
        -newdeaths,
        -matches('^d'),
        -matches('^pc'),
        -matches('alter_d'),
        -matches('alter_pc'),
        -matches('log_d'),
        -matches('log_pc'),
        -date,
        -key,
        -cluster,
        -n,
        -fold,
        -matches('PRCP', ignore.case = FALSE),
        -matches('TMAX', ignore.case = FALSE),
        -matches('^prcp..'),
        -matches('^tmax..'),
        -matches('alter_.*_p[123]...$')
    ) %>%
    colnames()

# Show the selection so it can be checked by eye.
cols_to_xgr

In [None]:
# Residualize every selected column in turn and bind the resulting one-column
# tables side by side. Iterating over the names themselves (rather than
# 1:length(...)) keeps the loop well-defined when the selection is empty.
# %do% runs sequentially; each XGresidualizer call hands its own thread count
# to xgboost.
xg.residuals <- foreach(xgr_col = cols_to_xgr, .combine = cbind) %do% {
    XGresidualizer(df[[xgr_col]], xgr_col)
}

In [None]:
# Put the identifier, weight, cluster and fold columns next to the residualized
# columns. The two tables are matched by row position, so `xg.residuals` must
# have the same row order as `df`.
df.r <- bind_cols(
    select(df, key, date, n, cluster, fold),
    xg.residuals
)

In [None]:
# Save the residualized panel under the project data directory.
write_rds(
    df.r,
    str_c(fp, 'PROCESSED_DATA/panel_xgr_v2.RDS')
)