In [1]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(doMC)
library(lfe)
library(fixest)
library(xgboost)
registerDoMC(24)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: foreach

Loading required package: iterators

Loading required package: parallel

Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack



Attaching package: ‘xgboost’


The following object is masked from ‘package:dplyr’:

    slice




In [5]:
fp    <- '/pool001/mfzhao/'
df    <- data.table::fread('/pool001/mfzhao/PROCESSED_DATA/dyadic_mvmt_2020.csv')

In [6]:
colnames(df)

In [7]:
df %>%
    filter(origin_cluster != destination_cluster) %>%
    mutate(log_ndotd = log(ndotd),
           log_pdotd = log(pdotd),
           nearby = as.numeric(dist < 100)) -> df

In [15]:
df %>%
    mutate(o  = ifelse(op3 == 1, 'p3',
                       ifelse(op2 == 1, 'p2', 
                              ifelse(op1 == 1, 'p1', 'p0'))),
           dp0 = 1) -> df.opmex

df.opmex %>%
    select(date, dyad, origin_cluster, destination_cluster, n, nearby, bordering, dist, ndotd, pdotd, 
           matches('unemp'), 
           matches('[od]prcp[01][0-9]'), 
           matches('[od]tmax[01][0-9]'), 
           matches('[od]as_')) %>%
    bind_cols(as.data.frame(model.matrix(ndotd ~ 0 + o * (dp0 + dp1 + dp2 + dp3), df.opmex))) %>%
    mutate(`op0:dp0` = op0 * dp0,
           `op0:dp1` = op0 * dp1,
           `op0:dp2` = op0 * dp2,
           `op0:dp3` = op0 * dp3) -> df.opmex

colnames(df.opmex) <- str_replace(colnames(df.opmex), ':', '_')

df.opmex %>%
    select(date, 
           dyad, 
           op1b = op1,
           op2b = op2,
           op3b = op3,
           matches('op[0123]_')) -> df.opmex

df %>%
    left_join(df.opmex) -> df

In [19]:
set.seed(2345)
df %>%
    ungroup() %>%
    select(dyad) %>%
    distinct() %>% 
    mutate(i = sample(1:n(), n(), replace = F),
           fold = i %% 3 + 1) %>%
    select(-i) -> folds

df %>%
    inner_join(folds) %>%
    arrange(date, dyad) %>%
    ungroup() -> df


Joining, by = "dyad"



In [23]:
df %>%
    ungroup() %>%
    mutate(oPRCP.r_fe           = feols(oPRCP ~ 0 | dyad + date, .)$resid,
           oTMAX.r_fe           = feols(oTMAX ~ 0 | dyad + date, .)$resid,
           dPRCP.r_fe           = feols(dPRCP ~ 0 | dyad + date, .)$resid,
           dTMAX.r_fe           = feols(dTMAX ~ 0 | dyad + date, .)$resid,
           oas_newcases.r_fe    = feols(oas_newcases ~ 0 | dyad + date, .)$resid,
           oas_newdeaths.r_fe   = feols(oas_newdeaths ~ 0 | dyad + date, .)$resid,
           oas_stnewcases.r_fe  = feols(oas_stnewcases ~ 0 | dyad + date, .)$resid,
           oas_stnewdeaths.r_fe = feols(oas_stnewdeaths ~ 0 | dyad + date, .)$resid,
           das_newcases.r_fe    = feols(das_newcases ~ 0 | dyad + date, .)$resid, 
           das_newdeaths.r_fe   = feols(das_newdeaths ~ 0 | dyad + date, .)$resid,
           das_stnewcases.r_fe  = feols(das_stnewcases ~ 0 | dyad + date, .)$resid,
           das_stnewdeaths.r_fe = feols(das_stnewdeaths ~ 0 | dyad + date, .)$resid,
           oct_unemp_rate.r_fe  = feols(oct_unemp_rate ~ 0 | dyad + date, .)$resid,
           ost_unemp_rate.r_fe  = feols(ost_unemp_rate ~ 0 | dyad + date, .)$resid,
           dct_unemp_rate.r_fe  = feols(dct_unemp_rate ~ 0 | dyad + date, .)$resid,
           dst_unemp_rate.r_fe  = feols(dst_unemp_rate ~ 0 | dyad + date, .)$resid) %>%
    select(date, dyad, matches('r_fe'), fold, n) -> rdf_uw

In [24]:
df %>%
    ungroup() %>%
    mutate(oPRCP.r_fe           = feols(oPRCP ~ 0 | dyad + date, ., weights = df$n)$resid,
           oTMAX.r_fe           = feols(oTMAX ~ 0 | dyad + date, ., weights = df$n)$resid,
           dPRCP.r_fe           = feols(dPRCP ~ 0 | dyad + date, ., weights = df$n)$resid,
           dTMAX.r_fe           = feols(dTMAX ~ 0 | dyad + date, ., weights = df$n)$resid,
           oas_newcases.r_fe    = feols(oas_newcases ~ 0 | dyad + date, ., weights = df$n)$resid,
           oas_newdeaths.r_fe   = feols(oas_newdeaths ~ 0 | dyad + date, ., weights = df$n)$resid,
           oas_stnewcases.r_fe  = feols(oas_stnewcases ~ 0 | dyad + date, ., weights = df$n)$resid,
           oas_stnewdeaths.r_fe = feols(oas_stnewdeaths ~ 0 | dyad + date, ., weights = df$n)$resid,
           das_newcases.r_fe    = feols(das_newcases ~ 0 | dyad + date, ., weights = df$n)$resid, 
           das_newdeaths.r_fe   = feols(das_newdeaths ~ 0 | dyad + date, ., weights = df$n)$resid,
           das_stnewcases.r_fe  = feols(das_stnewcases ~ 0 | dyad + date, ., weights = df$n)$resid,
           das_stnewdeaths.r_fe = feols(das_stnewdeaths ~ 0 | dyad + date, ., weights = df$n)$resid,
           oct_unemp_rate.r_fe  = feols(oct_unemp_rate ~ 0 | dyad + date, ., weights = df$n)$resid,
           ost_unemp_rate.r_fe  = feols(ost_unemp_rate ~ 0 | dyad + date, ., weights = df$n)$resid,
           dct_unemp_rate.r_fe  = feols(dct_unemp_rate ~ 0 | dyad + date, ., weights = df$n)$resid,
           dst_unemp_rate.r_fe  = feols(dst_unemp_rate ~ 0 | dyad + date, ., weights = df$n)$resid) %>%
    select(date, dyad, matches('r_fe'), fold, n) -> rdf_w

In [25]:
XGresidualizer <- function(Y, colname, weighted = F) {
    print(colname)
    if(weighted == F) {
        rdf_uw %>%
            ungroup() %>% 
            mutate(Y = Y,
                   Y.r = felm(Y ~ 0 | dyad + date, .)$resid) -> temp_df
    } else {
        rdf_w %>%
            ungroup() %>% 
            mutate(Y = Y,
                   Y.r = felm(Y ~ 0 | dyad + date, ., weights = rdf_w$n)$resid) -> temp_df
    }
    
    folds <- list(which(temp_df$fold %in% 1), 
                  which(temp_df$fold %in% 2), 
                  which(temp_df$fold %in% 3))

    form <- str_c('~ 0 + oPRCP.r_fe + oTMAX.r_fe + dPRCP.r_fe + dTMAX.r_fe + ',
                  'oas_newcases.r_fe + oas_newdeaths.r_fe + oas_stnewcases.r_fe + oas_stnewdeaths.r_fe + ',
                  'das_newcases.r_fe + das_newdeaths.r_fe + das_stnewcases.r_fe + das_stnewdeaths.r_fe + ',
                  'oct_unemp_rate.r_fe + ost_unemp_rate.r_fe + dct_unemp_rate.r_fe + dst_unemp_rate.r_fe')
    form <- as.formula(form)
    
    
    dm    <- xgb.DMatrix(data = model.matrix(form, temp_df), label = temp_df$Y.r)
    param <- list(max_depth=2, eta=.5, silent=1, objective='reg:linear')
    
    if(weighted == F) {
        fit <- xgb.cv(params = param, 
                      data = dm, 
                      folds = folds,
                      nrounds = 100, 
                      early_stopping_rounds = 3)
    } else {
        fit <- xgb.cv(params = param, 
                      data = dm, 
                      folds = folds,
                      nrounds = 100, 
                      early_stopping_rounds = 3, 
                      weight = temp_df$n)
    }
    
    best_n <- fit$best_iteration
    for (i in 1:3) {
        tr  <- temp_df %>% filter(fold != i)
        trm <- xgb.DMatrix(data = model.matrix(form, tr), label = tr$Y.r)
        if(weighted == F) {
            fit <- xgb.train(params = param, 
                             data = trm, 
                             nrounds = best_n)
        } else {
            fit <- xgb.train(params = param, 
                             data = trm, 
                             nrounds = best_n, 
                             weight = tr$n)
        }
        te  <- temp_df %>% filter(fold == i)
        tem <- xgb.DMatrix(data = model.matrix(form, te), label = te$Y.r)
        te %>%
            select(date, dyad) %>%
            mutate(pred = predict(fit, newdata = tem)) -> pred_df
        
        assign(str_c('temp',i), pred_df) %>%
            select(-pred, -date, -dyad)
    }
    out <- bind_rows(temp1, temp2, temp3) %>%
        arrange(date, dyad) %>%
        mutate(tempname = temp_df$Y.r - pred) %>%
        select(-pred, -date, -dyad)
    
    colnames(out) <- ifelse(weighted, str_c(colname, '.wr'), str_c(colname, '.uwr'))
    return(out)
}

In [26]:
cols_to_xgr <- colnames(
    df %>%
        select(
            log_ndotd,
            log_pdotd,
            matches('[od]p[123]'),
            -matches(':')
         )
)
cols_to_xgr

In [27]:
xg.residuals1 <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i])

[1] "log_ndotd"
[1]	train-rmse:0.607877+0.000464	test-rmse:0.607892+0.000996 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.564059+0.000442	test-rmse:0.564081+0.001009 
[3]	train-rmse:0.551795+0.000491	test-rmse:0.551832+0.001037 
[4]	train-rmse:0.547968+0.000460	test-rmse:0.548017+0.001005 
[5]	train-rmse:0.545722+0.000338	test-rmse:0.545805+0.001221 
[6]	train-rmse:0.544749+0.000407	test-rmse:0.544857+0.001172 
[7]	train-rmse:0.543958+0.000524	test-rmse:0.544139+0.001198 
[8]	train-rmse:0.543344+0.000546	test-rmse:0.543531+0.001264 
[9]	train-rmse:0.542709+0.000499	test-rmse:0.542977+0.001173 
[10]	train-rmse:0.541625+0.000529	test-rmse:0.541853+0.000923 
[11]	train-rmse:0.541338+0.000522	test-rmse:0.541551+0.000911 
[12]	train-rmse:0.540942+0.000463	test-rmse:0.541185+0.000986 
[13]	train-rmse:0.540632+0.000464	test-rmse:0.540921+0.000988 
[14]	train-rmse:0.540352+0.000472	test-rmse

In [28]:
xg.residuals2 <- foreach(i = 1:length(cols_to_xgr), .combine = cbind) %do% XGresidualizer(df[[cols_to_xgr[i]]], cols_to_xgr[i], T)

[1] "log_ndotd"
[1]	train-rmse:0.611219+0.000350	test-rmse:0.611227+0.000790 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 3 rounds.

[2]	train-rmse:0.567407+0.000340	test-rmse:0.567428+0.000722 
[3]	train-rmse:0.554544+0.000344	test-rmse:0.554585+0.000679 
[4]	train-rmse:0.550291+0.000357	test-rmse:0.550373+0.000688 
[5]	train-rmse:0.548618+0.000372	test-rmse:0.548752+0.000742 
[6]	train-rmse:0.547562+0.000370	test-rmse:0.547737+0.000713 
[7]	train-rmse:0.546852+0.000506	test-rmse:0.546993+0.000613 
[8]	train-rmse:0.546207+0.000464	test-rmse:0.546273+0.000653 
[9]	train-rmse:0.545766+0.000477	test-rmse:0.545877+0.000713 
[10]	train-rmse:0.545334+0.000461	test-rmse:0.545530+0.000760 
[11]	train-rmse:0.544831+0.000418	test-rmse:0.545025+0.000798 
[12]	train-rmse:0.544474+0.000377	test-rmse:0.544645+0.000863 
[13]	train-rmse:0.544126+0.000472	test-rmse:0.544335+0.000804 
[14]	train-rmse:0.543753+0.000448	test-rmse

In [29]:
df %>%
    select(date, dyad, origin_cluster, destination_cluster, n, bordering) %>%
    arrange(date, dyad) %>%
    bind_cols(xg.residuals1, xg.residuals2) -> df.r

In [30]:
write_rds(df.r, str_c(fp, 'PROCESSED_DATA/dyad_xgr.RDS'))

In [31]:
df.r

date,dyad,origin_cluster,destination_cluster,n,bordering,log_ndotd.uwr,log_pdotd.uwr,op1.uwr,op2.uwr,⋯,op3_dp1.wr,op1_dp2.wr,op2_dp2.wr,op3_dp2.wr,op1_dp3.wr,op2_dp3.wr,op3_dp3.wr,op0_dp1.wr,op0_dp2.wr,op0_dp3.wr
<date>,<chr>,<int>,<int>,<int>,<int>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",⋯,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>"
2020-01-01,01001->13121,1,13,55601,0,1.26096543,1.24277330,0.0048227471,-0.0022521723,⋯,0.0003751259,-0.006868438,-0.0028183387,-0.001056088,-6.038963e-04,-5.283959e-03,-0.0344653193,0.0046332998,-7.803778e-05,-4.768372e-07
2020-01-01,01003->04013,1,4,218022,0,0.36521420,0.42720083,-0.0082965332,0.0009302687,⋯,-0.0102008838,-0.015558409,0.0109410061,0.020999895,-1.473417e-05,8.278414e-04,0.0286925693,-0.0132891917,1.469399e-04,-4.768372e-07
2020-01-01,01003->12005,1,12,218022,0,-0.72851321,-0.61843276,-0.0036516625,-0.0108775307,⋯,-0.0144891996,-0.009258406,-0.0248722361,0.009046303,8.982302e-04,-1.793007e-02,0.0202669196,-0.0225691882,-4.811625e-05,-4.768372e-07
2020-01-01,01003->12031,1,12,218022,0,-0.77411573,-0.63932041,-0.0036516625,-0.0108775307,⋯,-0.0163278241,-0.009258406,-0.0127822982,0.009046303,6.673218e-04,-1.793007e-02,0.0202669196,-0.0225691882,4.051585e-05,-4.768372e-07
2020-01-01,01003->12033,1,12,218022,1,-0.57837423,-0.50101124,-0.0018340487,0.0116419386,⋯,0.0063883405,-0.024731891,-0.0139155017,0.027881311,-3.890513e-04,-2.392278e-02,0.0254476957,-0.0166087535,-1.026247e-04,-4.768372e-07
2020-01-01,01003->12073,1,12,218022,0,-0.25259137,-0.12966212,-0.0036516625,-0.0246764232,⋯,-0.0144891996,-0.009258406,-0.0248722361,0.006600248,6.673218e-04,-2.502988e-02,0.0202669196,-0.0225691882,-1.723025e-04,-4.768372e-07
2020-01-01,01003->12091,1,12,218022,0,-0.15401351,-0.08122756,0.0071520072,-0.0063111890,⋯,0.0064412694,-0.017154501,-0.0009023474,0.010449992,-4.331290e-04,-1.937390e-02,0.0004459016,-0.0178438511,7.524683e-06,-4.768372e-07
2020-01-01,01003->12095,1,12,218022,0,0.98139250,1.01041785,-0.0036516625,-0.0108775307,⋯,-0.0163278241,-0.009258406,-0.0130338298,0.008125709,5.016805e-04,-1.619119e-02,0.0185441069,-0.0225691882,-1.003299e-04,-4.768372e-07
2020-01-01,01003->12113,1,12,218022,0,-0.38823116,-0.31488618,0.0071520072,-0.0063111890,⋯,0.0109330755,-0.017154501,-0.0044451879,0.010449992,-2.070784e-04,-3.711335e-02,0.0004459016,-0.0190766838,7.524683e-06,-4.768372e-07
2020-01-01,01003->12131,1,12,218022,0,0.54065307,0.66097837,-0.0036516625,-0.0108775307,⋯,-0.0144891996,-0.009258406,-0.0251237677,0.009046303,8.982302e-04,-1.793007e-02,0.0202669196,-0.0225691882,-1.000021e-04,-4.768372e-07
