In [1]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(24)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: foreach

Loading required package: iterators

Loading required package: parallel

Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack



Attaching package: ‘xgboost’


The following object is masked from ‘package:dplyr’:

    slice




In [2]:
fp <- '/pool001/mfzhao/'
df <- read_rds(str_c(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))

In [3]:
df %>%
    arrange(key, date) %>%
    group_by(key) %>%
    mutate_at(vars(p1sdp, p2shp, p3rop, stalter_p1sdp, stalter_p2shp, stalter_p3rop),
              .funs = list(l01 = function(x) lag(x),
                           l02 = function(x) lag(x, 2),
                           l03 = function(x) lag(x, 3),
                           l04 = function(x) lag(x, 4),
                           l05 = function(x) lag(x, 5),
                           l06 = function(x) lag(x, 6),
                           l07 = function(x) lag(x, 7),
                           l08 = function(x) lag(x, 8),
                           l09 = function(x) lag(x, 9),
                           l10 = function(x) lag(x, 10),
                           m01 = function(x) lead(x),
                           m02 = function(x) lead(x, 2),
                           m03 = function(x) lead(x, 3),
                           m04 = function(x) lead(x, 4),
                           m05 = function(x) lead(x, 5),
                           m06 = function(x) lead(x, 6),
                           m07 = function(x) lead(x, 7),
                           m08 = function(x) lead(x, 8),
                           m09 = function(x) lead(x, 9),
                           m10 = function(x) lead(x, 10))) -> df

df %>%
    fill(matches("m[01][0-9]"), .direction = 'down') %>%
    fill(matches("l[01][0-9]"), .direction = 'up') -> df

In [4]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(key) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, key) %>%
    ungroup() -> df

df %>%
    ungroup() %>%
    mutate(PRCP.r_fe      = felm(PRCP ~ 0 | key + date, ., weights = df$n)$resid,
           TMAX.r_fe      = felm(TMAX ~ 0 | key + date, ., weights = df$n)$resid,
           newcases.r_fe  = felm(newcases ~ 0 | key + date, ., weights = df$n)$resid,
           newdeaths.r_fe = felm(newdeaths ~ 0 | key + date, ., weights = df$n)$resid) %>%
    select(date, key, PRCP.r_fe, TMAX.r_fe, newcases.r_fe, newdeaths.r_fe, fold, n) -> residualizer_df

folds <- list(which(folds$fold %in% 1), 
              which(folds$fold %in% 2), 
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/folds.RDS'))
write_rds(residualizer_df, str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS'))

Joining, by = "key"



In [5]:
XGresidualizer <- function(Y, colname) {
    print(colname)
    residualizer_df %>%
        ungroup() %>% 
        mutate(Y = Y,
               Y.r = felm(Y ~ 0 | key + date, ., weights = residualizer_df$n)$resid) -> temp_df
  
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))
    
    X     <- model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe + newcases.r_fe + newdeaths.r_fe, temp_df)
    dm    <- xgb.DMatrix(data = X, label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    fit   <- xgb.cv(params = param, 
                    data = dm, 
                    folds = folds,
                    nrounds = 100, 
                    early_stopping_rounds = 3, 
                    weight = temp_df$n)
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe + newcases.r_fe + newdeaths.r_fe, tr), label = tr$Y.r)
        fit <- xgb.train(params = param, 
                         data = trm, 
                         nrounds = best_n, 
                         weight = tr$n)
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe + newcases.r_fe + newdeaths.r_fe, te), label = te$Y.r)
        te %>%
            select(date, key) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        assign(str_c('temp',i), pred_df) %>%
        select(-pred, -date, -key)
    }
    out <- bind_rows(temp1, temp2, temp3) %>%
        arrange(date, key) %>%
        mutate(tempname = temp_df$Y.r - pred) %>%
        select(-pred, -date, -key)
    
    colnames(out) <- str_c(colname, '.r')
    return(out)
}

In [6]:
cols_to_xgr <- colnames(
    df %>%
        select(
            -cases,
            -newcases,
            -deaths,
            -newdeaths,
            -matches('^d'),
            -matches('^pc'),
            -matches('alter_d'),
            -matches('alter_pc'),
            -matches('log_d'),
            -matches('log_pc'),
            -date,
            -key,
            -cluster,
            -n,
            -fold,
            -matches('PRCP', ignore.case = F),
            -matches('TMAX', ignore.case = F),
            -matches('^prcp..'),
            -matches('^tmax..')
         )
)
cols_to_xgr

In [7]:
xg.residuals <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

[1] "mcbgv"
[1]	train-rmse:0.272738+0.000164	test-rmse:0.272762+0.000216 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.161485+0.000273	test-rmse:0.161605+0.000155 
[3]	train-rmse:0.117707+0.000456	test-rmse:0.117877+0.000199 
[4]	train-rmse:0.103434+0.000494	test-rmse:0.103681+0.000427 
[5]	train-rmse:0.099211+0.000449	test-rmse:0.099481+0.000659 
[6]	train-rmse:0.097933+0.000446	test-rmse:0.098217+0.000739 
[7]	train-rmse:0.097403+0.000417	test-rmse:0.097705+0.000813 
[8]	train-rmse:0.097120+0.000397	test-rmse:0.097439+0.000861 
[9]	train-rmse:0.096925+0.000366	test-rmse:0.097249+0.000910 
[10]	train-rmse:0.096795+0.000380	test-rmse:0.097120+0.000898 
[11]	train-rmse:0.096657+0.000395	test-rmse:0.097018+0.000861 
[12]	train-rmse:0.096559+0.000424	test-rmse:0.096966+0.000845 
[13]	train-rmse:0.096486+0.000432	test-rmse:0.096890+0.000850 
[14]	train-rmse:0.096423+0.000440	test-rmse:0.0

In [8]:
df %>%
    select(key, date, n, cluster, fold) %>%
    bind_cols(xg.residuals) -> df.r

In [9]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/panel_xgr_v2.RDS'))