In [3]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(20)

In [39]:
fp <- '/pool001/mfzhao/'
df <- read_rds(str_c(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))

In [40]:
df %>%
    select(key,
           date, 
           cluster,
           n,
           log_mcbgv,
           log_pnchd,
           log_pgt1hafh, 
           log_ppthlt75,
           log_pgt2kmt,
           matches('unemp'),
           matches('^[PT][RM][CA][PX]$'),
           matches('^p[123]...$'),
           matches('stalter_p[123]...$'),
           matches('^as_(st)?new')) -> df

colnames(df)

In [41]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(key) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, key) %>%
    ungroup() -> df

df %>%
    ungroup() %>%
    mutate(PRCP.r_fe           = felm(PRCP ~ 0 | key + date, ., weights = df$n)$resid,
           TMAX.r_fe           = felm(TMAX ~ 0 | key + date, ., weights = df$n)$resid,
           as_newcases.r_fe    = felm(as_newcases ~ 0 | key + date, ., weights = df$n)$resid,
           as_newdeaths.r_fe   = felm(as_newdeaths ~ 0 | key + date, ., weights = df$n)$resid,
           as_stnewcases.r_fe  = felm(as_stnewcases ~ 0 | key + date, ., weights = df$n)$resid,
           as_stnewdeaths.r_fe = felm(as_stnewdeaths ~ 0 | key + date, ., weights = df$n)$resid,
           ct_unemp_rate.r_fe  = felm(ct_unemp_rate ~ 0 | key + date, ., weights = df$n)$resid,
           st_unemp_rate.r_fe  = felm(st_unemp_rate ~ 0 | key + date, ., weights = df$n)$resid) %>%
    select(date, 
           key, 
           PRCP.r_fe, 
           TMAX.r_fe, 
           as_newcases.r_fe, 
           as_newdeaths.r_fe, 
           as_stnewcases.r_fe, 
           as_stnewdeaths.r_fe, 
           ct_unemp_rate.r_fe,
           st_unemp_rate.r_fe,
           fold, 
           n) -> residualizer_df

folds <- list(which(folds$fold %in% 1), 
              which(folds$fold %in% 2), 
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/folds.RDS'))
write_rds(residualizer_df, str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS'))

Joining, by = "key"



In [51]:
XGresidualizer <- function(Y, colname) {
    print(colname)
    residualizer_df %>%
        ungroup() %>% 
        mutate(Y = Y,
               Y.r = felm(Y ~ 0 | key + date, ., weights = residualizer_df$n)$resid) -> temp_df
  
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))
    
    form <- ~ 0 + PRCP.r_fe + TMAX.r_fe + as_newcases.r_fe + as_newdeaths.r_fe + as_stnewcases.r_fe + as_stnewdeaths.r_fe + ct_unemp_rate.r_fe + st_unemp_rate.r_fe
    
    X     <- model.matrix(form, temp_df)
    dm    <- xgb.DMatrix(data = X, label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    fit   <- xgb.cv(params = param, 
                    data = dm, 
                    folds = folds,
                    nrounds = 100, 
                    early_stopping_rounds = 3, 
                    weight = temp_df$n,
                    nthread = 20)
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(form, tr), label = tr$Y.r)
        fit <- xgb.train(params = param, 
                         data = trm, 
                         nrounds = best_n, 
                         weight = tr$n,
                         nthread = 20)
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(form, te), label = te$Y.r)
        te %>%
            select(date, key) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        assign(str_c('temp',i), pred_df) %>%
        select(-pred, -date, -key)
    }
    out <- bind_rows(temp1, temp2, temp3) %>%
        arrange(date, key) %>%
        mutate(tempname = temp_df$Y.r - pred) %>%
        select(-pred, -date, -key)
    
    colnames(out) <- str_c(colname, '.r')
    return(out)
}

In [52]:
cols_to_xgr <- colnames(
    df %>%
        select(
            matches('^log_'),
            matches('p[123]...')
         )
)
cols_to_xgr

In [53]:
foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% 
    XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i]) -> xg.residuals

[1] "log_mcbgv"
[1]	train-rmse:0.255595+0.000042	test-rmse:0.255721+0.000032 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.134506+0.000104	test-rmse:0.134579+0.000061 
[3]	train-rmse:0.078918+0.000183	test-rmse:0.079015+0.000197 
[4]	train-rmse:0.056801+0.000265	test-rmse:0.056940+0.000351 
[5]	train-rmse:0.049477+0.000286	test-rmse:0.049626+0.000464 
[6]	train-rmse:0.047288+0.000314	test-rmse:0.047439+0.000485 
[7]	train-rmse:0.046568+0.000325	test-rmse:0.046727+0.000500 
[8]	train-rmse:0.046240+0.000292	test-rmse:0.046403+0.000546 
[9]	train-rmse:0.046087+0.000288	test-rmse:0.046253+0.000552 
[10]	train-rmse:0.045963+0.000288	test-rmse:0.046133+0.000576 
[11]	train-rmse:0.045860+0.000306	test-rmse:0.046042+0.000560 
[12]	train-rmse:0.045783+0.000305	test-rmse:0.045970+0.000552 
[13]	train-rmse:0.045692+0.000299	test-rmse:0.045881+0.000563 
[14]	train-rmse:0.045633+0.000297	test-rmse

In [54]:
df %>%
    select(key, date, n, cluster, fold) %>%
    bind_cols(xg.residuals) -> df.r

In [55]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/panel_xgr_v3.RDS'))