In [17]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(24)

In [18]:
fp <- '/pool001/mfzhao/'
df <- read_rds(str_c(fp, 'PROCESSED_DATA/panel_pre_xgr.RDS'))

In [19]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(key) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, key) %>%
    ungroup() -> df

df %>%
    ungroup() %>%
    mutate(PRCP.r_fe = felm(PRCP ~ 0 | key + date, ., weights = df$n)$resid,
           TMAX.r_fe = felm(TMAX ~ 0 | key + date, ., weights = df$n)$resid) %>%
    select(date, key, PRCP.r_fe, TMAX.r_fe, fold, n) -> residualizer_df

folds <- list(which(folds$fold %in% 1), 
              which(folds$fold %in% 2), 
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/folds.RDS'))
write_rds(residualizer_df, str_c(fp, 'PROCESSED_DATA/residualizer_data.RDS'))

Joining, by = "key"



In [20]:
XGresidualizer <- function(Y, colname) {
    print(colname)
    if(str_detect(colname, 'l[1-7]')) {
        residualizer_df %>%
            mutate(Y = Y) %>%
            group_by(key) %>%
            arrange(key, date) %>%
            filter(row_number() > as.numeric(str_sub(colname, -1, -1))) %>%
            arrange(date, key) %>% 
            ungroup() -> temp_df

        temp_df %>%
            ungroup() %>%
            arrange(date, key) %>%
            mutate(Y.r = felm(Y ~ 0 | key + date, ., weights = temp_df$n)$resid) %>%
            arrange(date, key) -> temp_df
        
    } else {
        residualizer_df %>%
            ungroup() %>% 
            mutate(Y = Y,
                   Y.r = felm(Y ~ 0 | key + date, ., weights = residualizer_df$n)$resid) -> temp_df
    }
    
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))

    dm    <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, temp_df), label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    fit   <- xgb.cv(params = param, 
                    data = dm, 
                    folds = folds,
                    nrounds = 100, 
                    early_stopping_rounds = 3, 
                    weight = temp_df$n)
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, tr), label = tr$Y.r)
        fit <- xgb.train(params = param, 
                         data = trm, 
                         nrounds = best_n, 
                         weight = tr$n)
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(~ 0 + PRCP.r_fe + TMAX.r_fe, te), label = te$Y.r)
        te %>%
            select(date, key) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        assign(str_c('temp',i), pred_df) %>%
        select(-pred, -date, -key)
    }
    if(str_detect(colname, 'l[1-7]')) {
        out <- residualizer_df %>%
            select(date, key) %>%
            left_join(bind_rows(temp1, temp2, temp3) %>%
                      mutate(tempname = temp_df$Y.r - pred)) %>% 
            arrange(date, key) %>%
            select(-pred, -date, -key)
    } else {
        out <- bind_rows(temp1, temp2, temp3) %>%
            arrange(date, key) %>%
            mutate(tempname = temp_df$Y.r - pred) %>%
            select(-pred, -date, -key)
    }
    colnames(out) <- str_c(colname, '.r')
    return(out)
}

In [21]:
cols_to_xgr <- colnames(
    df %>%
        select(
            -matches('^d'),
            -matches('^pc'),
            -matches('alter_d'),
            -matches('alter_pc'),
            -matches('log_d'),
            -matches('log_pc'),
            -date,
            -key,
            -cluster,
            -n,
            -fold,
            -matches('PRCP', ignore.case = F),
            -matches('TMAX', ignore.case = F),
            -matches('^prcp..'),
            -matches('^tmax..')
         )
)
cols_to_xgr

In [22]:
xg.residuals <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

[1] "mcbgv"
[1]	train-rmse:0.269712+0.000259	test-rmse:0.269742+0.000523 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.160269+0.000437	test-rmse:0.160316+0.000900 
[3]	train-rmse:0.117765+0.000579	test-rmse:0.117874+0.001196 
[4]	train-rmse:0.104349+0.000652	test-rmse:0.104466+0.001321 
[5]	train-rmse:0.100589+0.000678	test-rmse:0.100744+0.001375 
[6]	train-rmse:0.099537+0.000695	test-rmse:0.099707+0.001389 
[7]	train-rmse:0.099213+0.000681	test-rmse:0.099396+0.001400 
[8]	train-rmse:0.099096+0.000696	test-rmse:0.099274+0.001375 
[9]	train-rmse:0.099031+0.000692	test-rmse:0.099221+0.001377 
[10]	train-rmse:0.098983+0.000687	test-rmse:0.099185+0.001375 
[11]	train-rmse:0.098957+0.000694	test-rmse:0.099172+0.001379 
[12]	train-rmse:0.098932+0.000697	test-rmse:0.099162+0.001377 
[13]	train-rmse:0.098902+0.000699	test-rmse:0.099129+0.001376 
[14]	train-rmse:0.098890+0.000700	test-rmse:0.0

In [23]:
df %>%
    select(key, date, n, cluster, fold) %>%
    bind_cols(xg.residuals) -> df.r

In [24]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/panel_xgr.RDS'))