In [6]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(xgboost)
registerDoMC(24)

In [7]:
fp    <- '/pool001/mfzhao/'
df    <- data.table::fread('/pool001/mfzhao/PROCESSED_DATA/dyadic_mvmt_2020.csv')

In [32]:
df %>%
    mutate(log_ndotd = log(ndotd),
           log_pdotd = log(pdotd)) -> df

In [13]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(dyad) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, dyad) %>%
    ungroup() -> df


Joining, by = "dyad"



In [19]:
df %>%
    ungroup() %>%
    mutate(oPRCP.r_fe  = felm(oPRCP ~ 0 | dyad + date, .)$resid,
           oTMAX.r_fe  = felm(oTMAX ~ 0 | dyad + date, .)$resid,
           dPRCP.r_fe  = felm(dPRCP ~ 0 | dyad + date, .)$resid,
           dTMAX.r_fe  = felm(dTMAX ~ 0 | dyad + date, .)$resid) %>%
    select(date, dyad, oPRCP.r_fe, oTMAX.r_fe, dPRCP.r_fe, dTMAX.r_fe, fold, n) -> rdf_uw

In [20]:
df %>%
    ungroup() %>%
    mutate(oPRCP.r_fe = felm(oPRCP ~ 0 | dyad + date, ., weights = df$n)$resid,
           oTMAX.r_fe = felm(oTMAX ~ 0 | dyad + date, ., weights = df$n)$resid,
           dPRCP.r_fe = felm(dPRCP ~ 0 | dyad + date, ., weights = df$n)$resid,
           dTMAX.r_fe = felm(dTMAX ~ 0 | dyad + date, ., weights = df$n)$resid) %>%
    select(date, dyad, oPRCP.r_fe, oTMAX.r_fe, dPRCP.r_fe, dTMAX.r_fe, fold, n) -> rdf_w

In [None]:


df %>%
    ungroup() %>%
    mutate(PRCP.r_fe = felm(PRCP ~ 0 | key + date, ., weights = df$n)$resid,
           TMAX.r_fe = felm(TMAX ~ 0 | key + date, ., weights = df$n)$resid) %>%
    select(date, key, PRCP.r_fe, TMAX.r_fe, fold, n) -> residualizer_df

folds <- list(which(folds$fold %in% 1), 
              which(folds$fold %in% 2), 
              which(folds$fold %in% 3))

write_rds(folds, str_c(fp, 'PROCESSED_DATA/dyad_folds.RDS'))
write_rds(rdf, str_c(fp, 'PROCESSED_DATA/dyad_rdf.RDS'))
write_rds(rdf, str_c(fp, 'PROCESSED_DATA/dyad_rdf.RDS'))


In [27]:
XGresidualizer <- function(Y, colname, weighted = F) {
    print(colname)
    if(weighted == F) {
        rdf_uw %>%
            ungroup() %>% 
            mutate(Y = Y,
                   Y.r = felm(Y ~ 0 | dyad + date, .)$resid) -> temp_df
    } else {
        rdf_w %>%
            ungroup() %>% 
            mutate(Y = Y,
                   Y.r = felm(Y ~ 0 | dyad + date, ., weights = rdf_w$n)$resid) -> temp_df
    }
    
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))

    dm    <- xgb.DMatrix(data = model.matrix(~ 0 + oPRCP.r_fe + oTMAX.r_fe + dPRCP.r_fe + dTMAX.r_fe, temp_df), label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    
    if(weighted == F) {
        fit <- xgb.cv(params = param, 
                      data = dm, 
                      folds = folds,
                      nrounds = 100, 
                      early_stopping_rounds = 3)
    } else {
        fit <- xgb.cv(params = param, 
                      data = dm, 
                      folds = folds,
                      nrounds = 100, 
                      early_stopping_rounds = 3, 
                      weight = temp_df$n)
    }
    
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(~ 0 + oPRCP.r_fe + oTMAX.r_fe + dPRCP.r_fe + dTMAX.r_fe, tr), label = tr$Y.r)
        if(weighted == F) {
            fit <- xgb.train(params = param, 
                             data = trm, 
                             nrounds = best_n)
        } else {
            fit <- xgb.train(params = param, 
                             data = trm, 
                             nrounds = best_n, 
                             weight = tr$n)
        }
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(~ 0 + oPRCP.r_fe + oTMAX.r_fe + dPRCP.r_fe + dTMAX.r_fe, te), label = te$Y.r)
        te %>%
            select(date, dyad) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        
        assign(str_c('temp',i), pred_df) %>%
            select(-pred, -date, -dyad)
    }
    out <- bind_rows(temp1, temp2, temp3) %>%
        arrange(date, dyad) %>%
        mutate(tempname = temp_df$Y.r - pred) %>%
        select(-pred, -date, -dyad)
    
    colnames(out) <- ifelse(weighted, str_c(colname, '.wr'), str_c(colname, '.uwr'))
    return(out)
}

In [33]:
cols_to_xgr <- colnames(
    df %>%
        select(
            ndotd, log_ndotd,
            pdotd, log_pdotd,
            matches('[od]p[123]')
         )
)
cols_to_xgr

In [34]:
xg.residuals1 <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

[1] "ndotd"
[1]	train-rmse:270.778758+8.569440	test-rmse:270.409708+16.636122 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:270.762258+8.566330	test-rmse:270.394938+16.635369 
[3]	train-rmse:270.750895+8.564057	test-rmse:270.385590+16.635996 
[4]	train-rmse:270.739583+8.568831	test-rmse:270.374217+16.627849 
[5]	train-rmse:270.733968+8.569013	test-rmse:270.371908+16.626595 
[6]	train-rmse:270.727712+8.566788	test-rmse:270.369080+16.626337 
[7]	train-rmse:270.724294+8.567188	test-rmse:270.367035+16.624345 
[8]	train-rmse:270.719971+8.568180	test-rmse:270.363464+16.622680 
[9]	train-rmse:270.715424+8.567352	test-rmse:270.362335+16.619808 
[10]	train-rmse:270.710724+8.565541	test-rmse:270.361776+16.618969 
[11]	train-rmse:270.707133+8.565236	test-rmse:270.358561+16.618210 
[12]	train-rmse:270.703603+8.565902	test-rmse:270.357218+16.616020 
[13]	train-rmse:270.699545+8.565389	test-rmse:270.

In [36]:
xg.residuals2 <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i], T)

[1] "ndotd"
[1]	train-rmse:271.084920+8.509796	test-rmse:270.722372+16.519955 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:271.066579+8.508181	test-rmse:270.704071+16.521532 
[3]	train-rmse:271.055491+8.506762	test-rmse:270.694672+16.519894 
[4]	train-rmse:271.047404+8.506841	test-rmse:270.687388+16.517726 
[5]	train-rmse:271.040110+8.505544	test-rmse:270.681172+16.517914 
[6]	train-rmse:271.036123+8.504538	test-rmse:270.678599+16.519898 
[7]	train-rmse:271.031911+8.503493	test-rmse:270.676168+16.518767 
[8]	train-rmse:271.028015+8.502792	test-rmse:270.673645+16.519186 
[9]	train-rmse:271.025350+8.503029	test-rmse:270.671366+16.517949 
[10]	train-rmse:271.021667+8.501973	test-rmse:270.671682+16.517639 
[11]	train-rmse:271.018839+8.501296	test-rmse:270.668843+16.517403 
[12]	train-rmse:271.016327+8.499948	test-rmse:270.667328+16.517920 
[13]	train-rmse:271.014638+8.500136	test-rmse:270.

In [42]:
df %>%
    select(date, dyad, origin_cluster, destination_cluster, n, bordering) %>%
    arrange(date, dyad) %>%
    bind_cols(xg.residuals1, xg.residuals2) -> df.r

In [43]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/dyad_xgr.RDS'))