In [44]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(24)

In [45]:
fp <- '/pool001/mfzhao/'
df <- read_rds(str_c(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))

In [46]:
df %>%
    arrange(key, date) %>%
    group_by(key) %>%
    mutate_at(vars(p1sdp, p2shp, p3rop, stalter_p1sdp, stalter_p2shp, stalter_p3rop),
              .funs = list(l01 = function(x) lag(x),
                           l02 = function(x) lag(x, 2),
                           l03 = function(x) lag(x, 3),
                           l04 = function(x) lag(x, 4),
                           l05 = function(x) lag(x, 5),
                           l06 = function(x) lag(x, 6),
                           l07 = function(x) lag(x, 7),
                           l08 = function(x) lag(x, 8),
                           l09 = function(x) lag(x, 9),
                           l10 = function(x) lag(x, 10),
                           m01 = function(x) lead(x),
                           m02 = function(x) lead(x, 2),
                           m03 = function(x) lead(x, 3),
                           m04 = function(x) lead(x, 4),
                           m05 = function(x) lead(x, 5),
                           m06 = function(x) lead(x, 6),
                           m07 = function(x) lead(x, 7),
                           m08 = function(x) lead(x, 8),
                           m09 = function(x) lead(x, 9),
                           m10 = function(x) lead(x, 10))) -> df

df %>%
    fill(matches("m[01][0-9]"), .direction = 'down') %>%
    fill(matches("l[01][0-9]"), .direction = 'up') -> df

In [47]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(key) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, key) %>%
    ungroup() -> df

df %>%
    ungroup() %>%
    mutate(PRCP.r_fe      = felm(PRCP ~ 0 | key + date, ., weights = df$n)$resid,
           TMAX.r_fe      = felm(TMAX ~ 0 | key + date, ., weights = df$n)$resid,
           newcases.r_fe  = felm(newcases ~ 0 | ky + date, ., weights = df$n),
           newdeaths.r_fe = felm(newcases ~ 0 | ky + date, ., weights = df$n)) %>%
    select(date, key, PRCP.r_fe, TMAX.r_fe, newcases.r_fe, newdeaths.r_fe, fold, n) -> residualizer_df

folds <- list(which(folds$fold %in% 1), 
              which(folds$fold %in% 2), 
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/folds.RDS'))
write_rds(residualizer_df, str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS'))

Joining, by = "key"



ERROR: Error in eval(predvars, data, env): object 'ky' not found


In [38]:
XGresidualizer <- function(Y, colname) {
    print(colname)
    residualizer_df %>%
        ungroup() %>% 
        mutate(Y = Y,
               Y.r = felm(Y ~ 0 | key + date, ., weights = residualizer_df$n)$resid) -> temp_df
  
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))

    dm    <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, temp_df), label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    fit   <- xgb.cv(params = param, 
                    data = dm, 
                    folds = folds,
                    nrounds = 100, 
                    early_stopping_rounds = 3, 
                    weight = temp_df$n)
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, tr), label = tr$Y.r)
        fit <- xgb.train(params = param, 
                         data = trm, 
                         nrounds = best_n, 
                         weight = tr$n)
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, te), label = te$Y.r)
        te %>%
            select(date, key) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        assign(str_c('temp',i), pred_df) %>%
        select(-pred, -date, -key)
    }
    out <- bind_rows(temp1, temp2, temp3) %>%
        arrange(date, key) %>%
        mutate(tempname = temp_df$Y.r - pred) %>%
        select(-pred, -date, -key)
    
    colnames(out) <- str_c(colname, '.r')
    return(out)
}

In [39]:
cols_to_xgr <- colnames(
    df %>%
        select(
            -matches('^d'),
            -matches('^pc'),
            -matches('alter_d'),
            -matches('alter_pc'),
            -matches('log_d'),
            -matches('log_pc'),
            -date,
            -key,
            -cluster,
            -n,
            -fold,
            -matches('PRCP', ignore.case = F),
            -matches('TMAX', ignore.case = F),
            -matches('^prcp..'),
            -matches('^tmax..')
         )
)
cols_to_xgr

In [40]:
xg.residuals <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

[1] "mcbgv"
[1]	train-rmse:0.280492+0.000155	test-rmse:0.280507+0.000202 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.177547+0.000240	test-rmse:0.177582+0.000353 
[3]	train-rmse:0.140205+0.000255	test-rmse:0.140267+0.000533 
[4]	train-rmse:0.129028+0.000260	test-rmse:0.129098+0.000643 
[5]	train-rmse:0.125999+0.000285	test-rmse:0.126084+0.000667 
[6]	train-rmse:0.125160+0.000308	test-rmse:0.125249+0.000660 
[7]	train-rmse:0.124885+0.000329	test-rmse:0.124979+0.000649 
[8]	train-rmse:0.124781+0.000325	test-rmse:0.124883+0.000656 
[9]	train-rmse:0.124718+0.000311	test-rmse:0.124823+0.000674 
[10]	train-rmse:0.124685+0.000300	test-rmse:0.124786+0.000693 
[11]	train-rmse:0.124651+0.000304	test-rmse:0.124758+0.000689 
[12]	train-rmse:0.124632+0.000302	test-rmse:0.124740+0.000693 
[13]	train-rmse:0.124593+0.000315	test-rmse:0.124707+0.000684 
[14]	train-rmse:0.124573+0.000323	test-rmse:0.1

In [41]:
df %>%
    select(key, date, n, cluster, fold) %>%
    bind_cols(xg.residuals) -> df.r

In [42]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/panel_xgr.RDS'))

In [43]:
df.r

key,date,n,cluster,fold,mcbgv.r,pnchd.r,pgt1hafh.r,ppthlt75.r,pgt2kmt.r,⋯,p3rop_m09.r,stalter_p1sdp_m09.r,stalter_p2shp_m09.r,stalter_p3rop_m09.r,p1sdp_m10.r,p2shp_m10.r,p3rop_m10.r,stalter_p1sdp_m10.r,stalter_p2shp_m10.r,stalter_p3rop_m10.r
<chr>,<date>,<dbl>,<chr>,<dbl>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",⋯,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>"
01001,2020-01-01,55601,01,3,-0.3219299,-0.05528435,-0.06805887,-0.035415007,-0.08127411,⋯,-0.06568981,0.0032679460,2.702140e-03,-0.015784933,0.01542153,0.0022643924,-0.07004554,0.0027204177,0.0025632775,-0.014873906
01003,2020-01-01,218022,01,1,-0.1459469,-0.03971310,-0.04311287,-0.031280426,-0.05519608,⋯,-0.06363196,0.0023357240,3.177202e-03,-0.014092173,0.01518502,0.0150769174,-0.06404991,0.0026867358,0.0025625134,-0.014301058
01005,2020-01-01,24881,01,1,-0.2709878,-0.04286810,-0.03624190,-0.031056208,-0.06804965,⋯,-0.07474429,0.0025496241,1.001933e-03,-0.018935346,0.01383247,0.0215068281,-0.07616363,0.0029742476,0.0009009172,-0.018240595
01007,2020-01-01,22400,01,1,-0.3182871,-0.05898623,-0.06364087,-0.051262030,-0.08881721,⋯,-0.06321682,0.0037039339,-4.630645e-03,-0.016055086,0.01648744,0.0023563624,-0.06444479,0.0031552135,-0.0039472095,-0.016701350
01009,2020-01-01,57840,01,1,-0.3838088,-0.06860463,-0.07644852,-0.058009929,-0.10039315,⋯,-0.07237385,0.0047257625,4.525518e-03,-0.020154641,0.01794304,0.0191047609,-0.07459889,0.0045289479,0.0046301362,-0.019978628
01011,2020-01-01,10138,01,1,-0.2736475,-0.07558680,-0.06856821,-0.054753559,-0.07317672,⋯,-0.07237385,0.0030830674,-6.343892e-03,-0.013703307,0.01631971,0.0157105744,-0.07459889,0.0031943492,-0.0062477980,-0.012943020
01013,2020-01-01,19680,01,1,-0.3132411,-0.04708344,-0.03545699,-0.009831665,-0.08001405,⋯,-0.07237385,0.0022668148,-5.800554e-03,-0.012378501,0.01631971,0.0157105744,-0.07459889,0.0020139719,-0.0054604477,-0.011631386
01015,2020-01-01,114277,01,3,-0.2818317,-0.05615520,-0.06858275,-0.042955938,-0.07746415,⋯,-0.06875123,0.0052681795,-2.720582e-03,-0.020754638,0.01715841,0.0022643924,-0.06987215,0.0046535364,-0.0018685951,-0.020497265
01017,2020-01-01,33615,01,2,-0.3442712,-0.05205807,-0.04413948,-0.047601906,-0.06517553,⋯,-0.07237540,0.0069037077,-7.761378e-04,-0.038008299,0.01786204,0.0148382008,-0.07156889,0.0067799089,-0.0008972345,-0.037573960
01019,2020-01-01,26032,01,3,-0.2742130,-0.08480877,-0.07735050,-0.053209336,-0.09440203,⋯,-0.06821243,0.0071092028,-2.584812e-03,-0.032603093,0.01657255,0.0022643924,-0.07004554,0.0060242790,-0.0028624786,-0.032670119
