# Implementation of algorithm from Diaz and van der Laan

In [1]:
library(condensier)

condensier
The condensier package is still in beta testing. Interpret results with caution.


In [2]:
# Fix the RNG seed and the simulation dimensions so the sketch is reproducible.
# (No `rm(list = ls())` here: it wipes whatever the caller has in the workspace
# without resetting loaded packages or options, so it does not give a clean
# session anyway.)
set.seed(429153)
n_obs <- 100  # number of simulated observations
n_w <- 3      # number of simulated covariates (columns of W)

In [3]:
# generate a toy data set O = (W, A, Y) for the tmle-shift sketch:
# W has n_w standard-normal columns, A is a deterministic function of W,
# and Y is a deterministic function of A
W <- replicate(n_w, rnorm(n_obs))
A <- rowSums(W + cos(exp(W)))
Y <- sin(A)
O <- setNames(as.data.frame(cbind(W, A, Y)),
              c(paste0("W", seq_len(n_w)), "A", "Y"))
head(O)

W1,W2,W3,A,Y
1.3357904,-1.2142,0.7023165,0.5581855,0.529648
0.5852906,1.040991,-0.7216398,0.6135768,0.5757955
1.3420567,-1.848015,-1.0266263,-0.3826661,-0.3733951
0.73432,-1.726326,-0.5078887,-0.1823722,-0.181363
-0.1484268,-1.520334,1.2997251,0.3935576,0.3834764
1.7497752,1.465439,-0.608678,3.951361,-0.7241274


## utility functions

In [35]:
# Nudge values that sit exactly on the boundary of the unit interval into its
# interior.
#
# values_scaled: numeric values expected to lie in [0, 1].
# Returns the input with exact 0s replaced by .Machine$double.neg.eps and
# exact 1s replaced by 1 - .Machine$double.neg.eps.
# Stops with an error if any value lies outside [0, 1].
bound_precision <- function(values_scaled) {
    limits <- range(values_scaled)
    if (limits[1] < 0 || limits[2] > 1) {
        stop("Scaled values are not in the interval [0, 1].")
    }

    eps <- .Machine$double.neg.eps
    at_zero <- values_scaled == 0
    values_scaled[at_zero] <- eps
    at_one <- values_scaled == 1
    values_scaled[at_one] <- 1 - eps
    values_scaled
}

In [41]:
# Rescale an outcome to the unit interval, or map unit-interval predictions
# back to the outcome's original scale.
#
# Y: numeric vector of observed outcomes; its min and max define the scaling.
# preds_scaled: numeric values on the [0, 1] scale; required (and only used)
#   when scale = "original".
# scale: "zero_one" (the default) returns (Y - min(Y)) / (max(Y) - min(Y));
#   "original" returns (max(Y) - min(Y)) * preds_scaled + min(Y).
# Stops if scale is not one of the two choices, or if scale = "original" is
# requested without preds_scaled.
# Note: when all values of Y are equal, the "zero_one" result is NaN (0 / 0).
bound_scaling <- function(Y, preds_scaled = NULL,
                          scale = c("zero_one", "original")) {
    # collapse the vector default to a single validated string; comparing the
    # length-2 default inside `if` is an error on current versions of R
    scale <- match.arg(scale)

    y_min <- min(Y)
    y_max <- max(Y)

    if (scale == "zero_one") {
        return((Y - y_min) / (y_max - y_min))
    }

    # scale == "original": fail loudly rather than silently returning NULL
    if (is.null(preds_scaled)) {
        stop("`preds_scaled` must be supplied when scale = \"original\".")
    }
    (y_max - y_min) * preds_scaled + y_min
}

## functions for treatment shift $d(a,w)$

In [4]:
# Shift treatment values downward, as needed when evaluating the treatment
# density g at the shifted treatment.
#
# a: numeric vector of treatment values.
# w: currently unused (placeholder for shifts that depend on covariates).
# delta: numeric size of the shift.
# type: kind of shift; only "additive" is implemented.
# Returns a - delta; stops for any other type.
tx_shift_g <- function(a, w = NULL, delta, type = "additive") {
    if (!identical(type, "additive")) {
        stop("Unsupported shift type: only \"additive\" is implemented.")
    }
    # shift the argument `a`, not a variable `A` found in the calling workspace
    a - delta
}

In [5]:
# Shift treatment values upward, as needed when evaluating the outcome
# regression Q at the shifted treatment.
#
# a: numeric vector of treatment values.
# w: currently unused (placeholder for shifts that depend on covariates).
# delta: numeric size of the shift.
# type: kind of shift; only "additive" is implemented.
# Returns a + delta; stops for any other type.
tx_shift_Q <- function(a, w = NULL, delta, type = "additive") {
    if (!identical(type, "additive")) {
        stop("Unsupported shift type: only \"additive\" is implemented.")
    }
    # shift the argument `a`, not a variable `A` found in the calling workspace
    a + delta
}

## function for estimating $g_n$

In [24]:
# Fit the conditional density of the treatment given the covariates and
# evaluate it at the observed and at the shifted treatment values.
#
# A: treatment values.
# W: covariates with one column per covariate (columns are labelled
#   W1, W2, ... internally).
# delta: shift handed to tx_shift_g (default 0).
# ...: further arguments forwarded to fit_density.
# Returns a data.frame with columns "gn_unshifted" and "gn_shifted" holding
# the predictions for the observed and for the shifted data, respectively.
est_g <- function(A, W, delta = 0, ...) {
    w_names <- paste0("W", seq_len(ncol(W)))

    # observed data with predictable column names
    obs_data <- as.data.frame(cbind(A, W))
    colnames(obs_data) <- c("A", w_names)

    # conditional density of A given W, fit with condensier
    dens_fit <- fit_density(X = w_names, Y = "A", input_data = obs_data, ...)

    # density evaluated at the observed treatment (A = a)
    gn_obs <- predict_probability(model_fit = dens_fit, newdata = obs_data)

    # density evaluated at the shifted treatment (A = a - delta)
    shifted_data <- obs_data
    shifted_data$A <- tx_shift_g(a = shifted_data$A, delta = delta)
    gn_shift <- predict_probability(model_fit = dens_fit,
                                    newdata = shifted_data)

    # one row per observation: unshifted first, shifted second
    out <- as.data.frame(cbind(gn_obs, gn_shift))
    colnames(out) <- c("gn_unshifted", "gn_shifted")
    rownames(out) <- NULL
    out
}

testing function for estimating $g_n$

In [25]:
# run est_g on the simulated A and W with delta = 0.5; the remaining arguments
# (nbins, bin_method, bin_estimator) are forwarded to fit_density through `...`
test_est_g <- est_g(A = A, W = W, delta = 0.5,
                    nbins = 20, bin_method = "equal.mass",
                    bin_estimator = speedglmR6$new())

In [26]:
# show the first rows of the density predictions (unshifted and shifted)
head(test_est_g)

gn_unshifted,gn_shifted
2.6638932,0.03903669
2.3326707,0.013664372
0.2385218,0.002145538
10.700063,0.001911927
0.7127472,0.02059011
0.5335457,0.533545681


## function for estimating $Q_n$

In [42]:
# Fit the outcome regression on the unit-interval scale and evaluate it at the
# observed and at the shifted treatment values.
#
# Y: numeric outcome vector; rescaled to [0, 1] before fitting.
# A: treatment values.
# W: covariates with one column per covariate (columns are labelled
#   W1, W2, ... internally).
# delta: shift handed to tx_shift_Q (default 0).
# reg_form: regression formula as a string, written in terms of the columns
#   Y, A, W1, W2, ... (default "Y ~ .").
# Returns a data.frame with columns "Qn_unshifted" and "Qn_shifted": the
# predictions on the [0, 1] scale, kept strictly inside (0, 1).
# Warns when predictions had to be truncated to [0, 1].
est_Q <- function(Y, A, W, delta = 0, reg_form = "Y ~ .") {
    # scale the outcome for the logit transform
    y_star <- bound_scaling(Y = Y, scale = "zero_one")

    # make data object but using y_star rather than raw outcome
    data_O <- as.data.frame(cbind(y_star, A, W))
    colnames(data_O) <- c("Y", "A", paste0("W", seq_len(ncol(W))))

    # obtain a model fit for the outcome regression
    # (no family is given, so glm uses its default)
    fit_Qn <- glm(as.formula(reg_form), data = data_O)

    # predictions for the un-shifted data (A = a)
    pred_star_Qn <- predict(fit_Qn, newdata = data_O)

    # predictions for the shifted data (A = a + delta)
    data_O_shifted <- data_O
    data_O_shifted$A <- tx_shift_Q(a = data_O_shifted$A, delta = delta)
    pred_star_Qn_shifted <- predict(fit_Qn, newdata = data_O_shifted)

    # Predictions from this fit are not confined to [0, 1] (the shifted data
    # in particular may extrapolate), and bound_precision stops on values
    # outside the interval. Truncate, with a warning, instead of aborting.
    clip_to_unit <- function(preds) {
        outside <- preds < 0 | preds > 1
        if (any(outside, na.rm = TRUE)) {
            warning("Outcome predictions outside [0, 1] were truncated.",
                    call. = FALSE)
            preds <- pmin(pmax(preds, 0), 1)
        }
        preds
    }
    pred_star_Qn <- clip_to_unit(pred_star_Qn)
    pred_star_Qn_shifted <- clip_to_unit(pred_star_Qn_shifted)

    # avoid values that are exactly 0 or 1 in the scaled Qn and Qn_shifted
    pred_star_Qn <- bound_precision(values_scaled = pred_star_Qn)
    pred_star_Qn_shifted <- bound_precision(
        values_scaled = pred_star_Qn_shifted
    )

    # create output: scenarios A = a, A = a + delta
    out <- as.data.frame(cbind(pred_star_Qn, pred_star_Qn_shifted))
    colnames(out) <- c("Qn_unshifted", "Qn_shifted")
    rownames(out) <- NULL
    out
}

In [43]:
# run est_Q on the simulated Y, A and W with delta = 0.5 (default reg_form)
test_est_Q <- est_Q(Y = Y, A = A, W = W, delta = 0.5)

In [46]:
# show the first rows of the scaled predictions (unshifted and shifted)
head(test_est_Q)

Qn_unshifted,Qn_shifted
0.8140587,0.7936936
0.699648,0.679283
0.6275849,0.6072198
0.627234,0.6068689
0.7341385,0.7137734
0.7059962,0.6856311


## function for estimating $H_n$

In [47]:
# Compute the auxiliary covariate H_n as a ratio of two density estimates.
#
# gn: two-column matrix or data.frame; column 2 is divided by column 1
#   (est_g returns its columns in the order "gn_unshifted", "gn_shifted").
# a, w: currently unused (placeholders for the support indicators below).
# Returns the vector gn[, 2] / gn[, 1].
est_h <- function(gn, a = NULL, w = NULL) {
    # TODO: consider the case where there is not support everywhere. The
    # covariate would then be ind_a * ratio + ind_a_delta, with indicators
    # derived from upper and lower limits for the treatment -- ignored for now.
    g_numerator <- gn[, 2]
    g_denominator <- gn[, 1]

    # ratio of the two density estimates
    g_numerator / g_denominator
}

In [48]:
# build the auxiliary covariate from the density predictions in test_est_g
test_est_h <- est_h(gn = test_est_g)

In [49]:
# show the first values of the density ratio
head(test_est_h)

## function for fluctuation procedure

In [None]:
# Fit the logistic fluctuation regression that updates the initial outcome
# regression.
#
# Y: numeric outcome vector on its original scale; rescaled to [0, 1] here.
# Qn_scaled: data.frame with a column "Qn_unshifted" holding the initial
#   predictions on the [0, 1] scale (the format returned by est_Q). Values
#   must lie strictly inside (0, 1) for the logit to be finite.
# Hn: auxiliary covariate, one value per observation.
# method: "standard" (the default) uses Hn as the only covariate;
#   "weighted" fits an intercept-only model with Hn as weights.
# Returns a list with the fitted glm ("fluc_fit") and the method used
# ("covar_method").
est_fluc <- function(Y, Qn_scaled, Hn,
                     method = c("standard", "weighted")) {
    # collapse the vector default to a single validated string; comparing the
    # length-2 default inside `if` is an error on current versions of R
    method <- match.arg(method)

    # scale the outcome for the logit transform
    y_star <- bound_scaling(Y = Y, scale = "zero_one")

    # The offset must be the logit of the predictions on the unit-interval
    # scale, matching the scaled response y_star. Mapping them back to the
    # original outcome scale first would feed qlogis() values outside (0, 1).
    logit_Qn <- qlogis(Qn_scaled$Qn_unshifted)

    # fit the fluctuation regression in one of two ways
    if (method == "standard") {
        # note that \epsilon_n will be the coefficient of the covariate Hn
        mod_fluc <- glm(y_star ~ -1 + offset(logit_Qn) + Hn,
                        family = "binomial")
    } else {
        # note that \epsilon_n will be the intercept term here (?)
        mod_fluc <- glm(y_star ~ offset(logit_Qn),
                        weights = Hn,
                        family = "binomial")
    }

    # return the fit model object together with the method that produced it
    list(fluc_fit = mod_fluc, covar_method = method)
}

## 1-TMLE procedure

In [None]:
# Compute the 1-TMLE: the mean of the updated outcome regression evaluated at
# the shifted treatment, reported on the original outcome scale.
#
# fluc_fit: the list returned by est_fluc (its "fluc_fit" element is used), or
#   the fitted glm itself.
# Qn: data.frame with a column "Qn_shifted" holding the initial predictions
#   for the shifted data on the [0, 1] scale (the format returned by est_Q).
# Hn: auxiliary covariate plugged into the fluctuation model, one value per
#   observation. It is ignored by an intercept-only ("weighted") fit.
#   NOTE(review): for the "standard" fit the update should use H_n evaluated
#   at the shifted treatment d(A, W); est_h as written evaluates it at the
#   observed A -- confirm what callers pass here.
# Y: numeric outcome vector on its original scale (used to undo the scaling).
# Returns a numeric scalar.
tmle_shifttx <- function(fluc_fit, Qn, Hn, Y) {
    # accept both the est_fluc output list and a bare glm object
    mod_fluc <- if (inherits(fluc_fit, "glm")) fluc_fit else fluc_fit$fluc_fit

    # The fluctuation formula refers to the variables `logit_Qn` and `Hn`, so
    # newdata must carry columns with exactly those names; otherwise predict
    # falls back to the values captured when the model was fit.
    newdata <- data.frame(logit_Qn = qlogis(Qn$Qn_shifted), Hn = Hn)

    # updated predictions for the shifted data, still on the [0, 1] scale
    # (type = "response" applies the inverse logit)
    Qn_star_shifted_scaled <- predict(mod_fluc, newdata = newdata,
                                      type = "response")

    # map the updated predictions back to the original outcome scale
    Qn_star_shifted <- bound_scaling(Y = Y,
                                     preds_scaled = Qn_star_shifted_scaled,
                                     scale = "original")

    # compute the 1-TMLE
    mean(Qn_star_shifted)
}

## EIF procedure

In [None]:
# Evaluate the efficient influence function (EIF) at each observation and
# summarise it by its mean square.
#
# Y: numeric outcome vector on its original scale.
# Qn_scaled: data.frame with columns "Qn_unshifted" and "Qn_shifted" on the
#   [0, 1] scale; both are mapped back to the scale of Y before use.
# Hn: auxiliary covariate, one value per observation.
# Psi: numeric scalar, the parameter estimate the EIF is centred at.
# Returns a list with "var_psi" (mean of the squared EIF values) and "eif"
# (the vector of EIF values).
# NOTE(review): var_psi is not divided by the sample size here -- confirm
# whether callers expect the variance of the EIF or of the estimator.
eif_shifttx <- function(Y, Qn_scaled, Hn, Psi) {
    # undo the unit-interval scaling using the range of Y
    to_outcome_scale <- function(preds) {
        bound_scaling(Y = Y, preds_scaled = preds, scale = "original")
    }
    q_observed <- to_outcome_scale(Qn_scaled$Qn_unshifted)
    q_shifted <- to_outcome_scale(Qn_scaled$Qn_shifted)

    # EIF: weighted residual + shifted prediction - parameter estimate
    eif_values <- Hn * (Y - q_observed) + q_shifted - Psi

    list(var_psi = mean(eif_values^2), eif = eif_values)
}

---

# Anatomy of the shift-Tx package

The algorithm is based on @diaz2017stochastic.

## Starting Assumptions

1. Start with a simple additive shift -- i.e., $d(a,w) = a + \delta$ if $a <
    u(w) - \delta$ or $d(a,w) = a$ if $a \geq u(w) - \delta$.
2. The additive shift will have _support everywhere_ -- i.e., $a < u(w)$ is true
    everywhere.
3. The data structure that we know and love $O = (W,A,Y)$.

## Functions Needed

* estimate $g_n(W)$
* estimate $Q_n(A, W)$
* estimate auxiliary covariate $H_n(A_i, W_i)$
* fluctuation procedure
* 1-TMLE procedure
* EIF procedure

## Estimate $g_n(W)$

* _input_: W, a
* _output_: a 2-column matrix, with columns for $g_n(A_i - \delta \mid W_i)$ and
    $g_n(A_i \mid W_i)$
* in the inputs $a$ is the additive shift
* use the __fit_density__ function from Oleg's __condensier__ package, then
    call the __predict_probability__ function twice: once for $A_i - \delta$
    and once for $A_i$

## Estimate $Q_n(A, W)$

* _input_: W, a
* _output_: a 2-column matrix, with columns for $\bar{Q}_n(A_i, W_i)$ and
    $\bar{Q}_n(A_i + \delta, W_i)$

## Estimate $H_n(A_i, W_i)$

* _input_: matrix output produced by $g_n(w)$
* _output_: vector (possibly shifted) of the form described in the eqn below
* $H(a,w) = I(a < u(w)) \frac{g_0(a - \delta \mid w)}{g_0(a \mid w)} + I(a
    \geq u(w) - \delta)$
* By our assumption (2) above -- that we have _support everywhere_ -- we reduce
    the above formulation
* That is, we assume that $I(a < u(w)) = 1$ and $I(a \geq u(w) - \delta) = 0$
* Thus the form of the covariate reduces simply to $H(a,w) = \frac{g_0(a -
    \delta \mid w)}{g_0(a \mid w)}$

## Fluctuation Procedure

* _input_: matrix output from $Q_n(a,w)$, vector output of $H_n$, vector Y
* _output_: model fit object produced from a call to `glm` or `SuperLearner`
* We have the fluctuation model: $\text{logit}(\bar{Q}_{\epsilon, n}(a,w)) =
    \text{logit}(\bar{Q}_n(a,w)) + \epsilon \cdot H_n(a,w)$
* Note that the first term on the RHS of the above equation is one of the
    columns generated as output by the function to estimate $Q_n(A,W)$
* this could be fit with R code like the following `glm(Y ~ -1 +
    offset(logitQn_AW) + Hn_AW, family = "binomial")`, from which we may extract
    the coefficient, which is $\epsilon_n$ from the above

## 1-TMLE Procedure

* _input_: model fit object produced by the fluctuation procedure above, matrix
    produced by procedure to estimate $Q_n(A,W)$
* _output_: numeric scalar for the mean of $\bar{Q}^*_n$
* note that we have $\psi_n = \frac{1}{n} \sum_{i=1}^n \bar{Q}_n^*(d(A_i, W_i),
    W_i)$
* we obtain $\bar{Q}_n^*$ by calling the appropriate method of predict on the
    shifted data -- i.e., `predict(fit, newdata = data.frame(Qn_dAW), type =
    "response")` (note that use of 'response' performs the `expit()` transform).
* compute the $\psi_n$ as the mean of the vector produced by calling `predict`
    on the fit object, as described above

## EIF Procedure

* _input_: matrix produced by $Q^*$: a 2-column matrix, with columns for
    $\bar{Q}_n(A_i, W_i)$ and $\bar{Q}_n(A_i + \delta, W_i)$
* _output_: scalar, the variance of the efficient influence function
* note that we have the _efficient influence function_ (EIF): $D(P)(o) =
    H(a,w)(y - \bar{Q}(a,w)) + \bar{Q}(d(a,w)) - \psi(P)$
* to compute the EIF from the above, we may set up a function like the following
    `eif <- function(Y, H, Qn_AW, Qn_dAW, Psi)`, which can then compute $\psi$
    by calling 1-TMLE (alternatively, the mean of the vector `Qn_dAW`) and then
    using the formula above
* compute $\sigma^2_n = \frac{1}{n} \sum_{i=1}^n EIF_i^2$, that is, simply call
    `mean` on the squared EIF vector produced by the above