In [29]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(fixest)
library(xgboost)
# Register a doMC parallel backend with 24 cores for foreach's %dopar%.
# NOTE(review): the foreach loops visible in this file use %do%, which runs
# sequentially -- confirm whether this backend is still needed.
registerDoMC(24)

In [21]:
# Root directory for the project's data; input and output paths are built
# from it so the location only has to be changed in one place.
fp <- '/pool001/mfzhao/'

# Load the 2020 dyadic movement CSV with data.table's reader.
df <- data.table::fread(str_c(fp, 'PROCESSED_DATA/dyadic_mvmt_2020.csv'))

In [22]:
# Show which columns were loaded.
names(df)

In [23]:
# Add natural-log versions of ndotd and pdotd alongside the raw columns.
df <- df %>%
    mutate(
        log_ndotd = log(ndotd),
        log_pdotd = log(pdotd)
    )

In [24]:
# Randomly split the distinct dyads into three folds. The seed fixes the
# permutation; taking a permutation of 1..n modulo 3 keeps the fold sizes
# within one dyad of each other.
set.seed(2345)
folds <- df %>%
    ungroup() %>%
    select(dyad) %>%
    distinct() %>%
    mutate(i = sample(1:n(), n(), replace = FALSE),
           fold = i %% 3 + 1) %>%
    select(-i)

# Attach each dyad's fold to all of its rows. The join key is spelled out so
# the join cannot silently widen to other shared column names, and the panel
# is put in (date, dyad) order because later steps bind columns by row
# position.
df <- df %>%
    inner_join(folds, by = 'dyad') %>%
    arrange(date, dyad) %>%
    ungroup()


Joining, by = "dyad"



In [None]:
# Unweighted fixed-effects residualization of the covariates.
# Each covariate is regressed on dyad and date fixed effects only (no slope
# terms), and the residuals are stored as `<covariate>.r_fe`.
fe_covariates <- c(
    'oPRCP', 'oTMAX', 'dPRCP', 'dTMAX',
    'oas_newcases', 'oas_newdeaths', 'oas_stnewcases', 'oas_stnewdeaths',
    'das_newcases', 'das_newdeaths', 'das_stnewcases', 'das_stnewdeaths',
    'oct_unemp_rate', 'ost_unemp_rate', 'dct_unemp_rate', 'dst_unemp_rate'
)

rdf_uw <- ungroup(df)
for (covariate in fe_covariates) {
    fe_formula <- as.formula(str_c(covariate, ' ~ 0 | dyad + date'))
    fe_resid   <- feols(fe_formula, df)$residuals
    # Columns are added one at a time, in the order listed above.
    rdf_uw <- mutate(rdf_uw, !!str_c(covariate, '.r_fe') := !!fe_resid)
}

# Keep only the identifiers, the residualized covariates, the fold and n.
rdf_uw <- rdf_uw %>%
    select(date, dyad, matches('r_fe'), fold, n)

In [None]:
# Weighted fixed-effects residualization of the covariates.
# Each covariate is regressed on dyad and date fixed effects only (no slope
# terms), weighting rows by `n`, and the residuals are stored as
# `<covariate>.r_fe`.
fe_covariates <- c(
    'oPRCP', 'oTMAX', 'dPRCP', 'dTMAX',
    'oas_newcases', 'oas_newdeaths', 'oas_stnewcases', 'oas_stnewdeaths',
    'das_newcases', 'das_newdeaths', 'das_stnewcases', 'das_stnewdeaths',
    'oct_unemp_rate', 'ost_unemp_rate', 'dct_unemp_rate', 'dst_unemp_rate'
)

rdf_w <- ungroup(df)
for (covariate in fe_covariates) {
    fe_formula <- as.formula(str_c(covariate, ' ~ 0 | dyad + date'))
    fe_resid   <- feols(fe_formula, df, weights = df$n)$residuals
    # Columns are added one at a time, in the order listed above.
    rdf_w <- mutate(rdf_w, !!str_c(covariate, '.r_fe') := !!fe_resid)
}

# Keep only the identifiers, the residualized covariates, the fold and n.
rdf_w <- rdf_w %>%
    select(date, dyad, matches('r_fe'), fold, n)

In [27]:
# Cross-fitted XGBoost residualizer.
#
# Partials dyad and date fixed effects out of `Y`, then predicts what is left
# from the fixed-effect-residualized covariates (the `.r_fe` columns) using
# boosted trees. Each row is predicted by a model trained on the other folds,
# and the function returns the out-of-fold prediction error.
#
# Args:
#   Y:        outcome vector with one value per row of `rdf_uw` / `rdf_w`,
#             in the same row order.
#   colname:  base name for the output column.
#   weighted: if TRUE, use `rdf_w` and weight both the fixed-effects
#             regression and the boosting fits by `n`; otherwise use `rdf_uw`
#             with no weights.
#
# Returns:
#   A one-column data frame named `<colname>.wr` (weighted) or
#   `<colname>.uwr` (unweighted), with rows in the same order as the panel.
#
# Reads the global data frames `rdf_uw` and `rdf_w`.
XGresidualizer <- function(Y, colname, weighted = FALSE) {
    print(colname)

    panel <- (if (weighted) rdf_w else rdf_uw) %>%
        ungroup()
    stopifnot(length(Y) == nrow(panel))

    # Step 1: remove dyad and date fixed effects from the outcome. `Y` is added
    # to the data before fitting so the formula resolves it as a column rather
    # than through the formula's environment.
    fe_data <- panel %>%
        mutate(Y = !!Y)
    fe_fit <- if (weighted) {
        felm(Y ~ 0 | dyad + date, fe_data, weights = fe_data$n)
    } else {
        felm(Y ~ 0 | dyad + date, fe_data)
    }
    # as.numeric() drops any matrix dimensions so the result is a plain vector.
    y_resid <- as.numeric(fe_fit$residuals)
    stopifnot(length(y_resid) == nrow(panel))

    # Step 2: design matrix of residualized covariates, built once and
    # subset by row for every fold.
    covariates <- c('oPRCP.r_fe', 'oTMAX.r_fe', 'dPRCP.r_fe', 'dTMAX.r_fe',
                    'oas_newcases.r_fe', 'oas_newdeaths.r_fe',
                    'oas_stnewcases.r_fe', 'oas_stnewdeaths.r_fe',
                    'das_newcases.r_fe', 'das_newdeaths.r_fe',
                    'das_stnewcases.r_fe', 'das_stnewdeaths.r_fe',
                    'oct_unemp_rate.r_fe', 'ost_unemp_rate.r_fe',
                    'dct_unemp_rate.r_fe', 'dst_unemp_rate.r_fe')
    form <- as.formula(str_c('~ 0 + ', str_c(covariates, collapse = ' + ')))
    X    <- model.matrix(form, panel)
    stopifnot(nrow(X) == nrow(panel))

    # Observation weights have to be stored on the DMatrix. Extra named
    # arguments to xgb.cv()/xgb.train() are treated as booster parameters,
    # so passing `weight =` there does not weight the fit.
    make_dmatrix <- function(rows) {
        if (weighted) {
            xgb.DMatrix(data   = X[rows, , drop = FALSE],
                        label  = y_resid[rows],
                        weight = as.numeric(panel$n[rows]))
        } else {
            xgb.DMatrix(data  = X[rows, , drop = FALSE],
                        label = y_resid[rows])
        }
    }

    # NOTE(review): `silent` and `reg:linear` are older spellings that newer
    # xgboost releases replace with `verbose` / `reg:squarederror`; they are
    # kept as-is to match whatever version this was written for -- confirm.
    param <- list(max_depth = 2, eta = .5, silent = 1, objective = 'reg:linear')

    # Step 3: choose the number of boosting rounds by cross-validation on the
    # pre-assigned folds (each list element holds one fold's test rows).
    fold_rows <- lapply(1:3, function(k) which(panel$fold %in% k))
    cv <- xgb.cv(params = param,
                 data = make_dmatrix(seq_len(nrow(panel))),
                 folds = fold_rows,
                 nrounds = 100,
                 early_stopping_rounds = 3)
    best_n <- cv$best_iteration

    # Step 4: cross-fit. Predictions are written back by row position, so the
    # result does not depend on how the panel happens to be sorted.
    pred <- rep(NA_real_, nrow(panel))
    for (k in seq_along(fold_rows)) {
        test_rows <- fold_rows[[k]]
        if (length(test_rows) == 0) {
            next
        }
        train_rows <- which(panel$fold != k)
        booster <- xgb.train(params = param,
                             data = make_dmatrix(train_rows),
                             nrounds = best_n)
        pred[test_rows] <- predict(booster, newdata = make_dmatrix(test_rows))
    }

    out <- data.frame(y_resid - pred)
    colnames(out) <- if (weighted) {
        str_c(colname, '.wr')
    } else {
        str_c(colname, '.uwr')
    }
    out
}

In [None]:
# Names of the outcome columns to residualize: the two log columns plus every
# column whose name matches the pattern '[od]p[123]'.
cols_to_xgr <- df %>%
    select(log_ndotd, log_pdotd, matches('[od]p[123]')) %>%
    colnames()
cols_to_xgr

In [None]:
# Unweighted residuals: one column per outcome, column-bound in the order of
# `cols_to_xgr`. seq_along() keeps the loop empty when there are no outcomes.
xg.residuals1 <- foreach(i = seq_along(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

In [None]:
# Weighted residuals: one column per outcome, column-bound in the order of
# `cols_to_xgr`. seq_along() keeps the loop empty when there are no outcomes.
xg.residuals2 <- foreach(i = seq_along(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i], TRUE)

In [None]:
# Assemble the output table: identifying columns from the panel, followed by
# the unweighted and then the weighted residual columns. Columns are bound by
# row position, so the panel is put in (date, dyad) order first.
df.r <- df %>%
    select(date, dyad, origin_cluster, destination_cluster, n, bordering) %>%
    arrange(date, dyad) %>%
    bind_cols(xg.residuals1, xg.residuals2)

In [None]:
# Save the residualized panel under the project data directory.
xgr_path <- str_c(fp, 'PROCESSED_DATA/dyad_xgr.RDS')
write_rds(df.r, xgr_path)