In [1]:
import numpy as np 
import pandas as pd
import csv
from scipy.optimize import fsolve
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib 
from cycler import cycler
from matplotlib.ticker import MaxNLocator
import os
import plotly.express as px
import torch
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()

from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
# Display DataFrames without truncating rows, columns, total width or cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None (not -1) is the supported way to lift the column-width limit: pandas 1.0
# deprecated negative values, as the warning printed below this cell states.
pd.set_option('display.max_colwidth', None)
import re
import utils
import Rsuperlearner
import training

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import dcor

import cvxpy as cp


Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.

[MLENS] backend: threading


In [2]:
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()


In [3]:
# Simulate a toy data set: n observations with p Gaussian covariates, a binary
# indicator A drawn from a logistic model in x, and an outcome y that is linear
# in x and A with additive standard-normal noise.
n, p = 1000, 10

x = np.random.normal(loc=0, scale=.5, size=(n, p))

# Coefficient column vectors for the A model (alpha) and the y model (beta).
alpha = np.random.uniform(low=.2, high=.5, size=(p, 1))
beta = np.random.uniform(low=.2, high=.5, size=(p, 1))

xalpha = x.dot(alpha)
xbeta = x.dot(beta)

# Logistic transform of the linear predictor gives P(A = 1 | x).
pr = 1/(1 + np.exp(-xalpha))
A = np.random.binomial(1, pr, size=(n, 1))

# Intercept -1, unit coefficient on A.
y = (-1 + A) + xbeta + np.random.normal(loc=0, scale=1, size=(n, 1))


In [52]:
def save_temporary_data_for_R(yhat1_df, yhat0_df, ps_df, A, y, file_name):
    """Write the inputs of the R estimator to four CSV files in the working directory.

    Files written (all without the index column):
      * ``A_y-<file_name>.csv``   -- two columns, ``A`` then ``y`` (in that order)
      * ``ps-<file_name>.csv``    -- ``ps_df`` as given
      * ``yhat1-<file_name>.csv`` -- ``yhat1_df`` as given
      * ``yhat0-<file_name>.csv`` -- ``yhat0_df`` as given

    Parameters
    ----------
    yhat1_df, yhat0_df, ps_df : pandas.DataFrame
        Frames written verbatim via ``to_csv``.
    A, y : array-like
        One value per observation. A Series, a single-column DataFrame, or an
        ``(n,)`` / ``(n, 1)`` array are all accepted.
    file_name : str
        Suffix shared by the four file names.

    Returns
    -------
    None
    """
    # Store A and y positionally. Assigning a Series into a frame that has a
    # fresh 0..n-1 index aligns on labels, so inputs carrying any other index
    # (e.g. after filtering or resampling rows) would be written as NaN.
    data = pd.DataFrame({
        'A': np.asarray(A).ravel(),
        'y': np.asarray(y).ravel(),
    })

    data.to_csv('A_y-' + file_name + '.csv', index=False)
    ps_df.to_csv('ps-' + file_name + '.csv', index=False)
    yhat1_df.to_csv('yhat1-' + file_name + '.csv', index=False)
    yhat0_df.to_csv('yhat0-' + file_name + '.csv', index=False)

def read_temporary_data_in_R(file_name):
    """Load the four temporary CSV files into the embedded R session.

    Reads ``A_y-<file_name>.csv``, ``ps-<file_name>.csv``,
    ``yhat1-<file_name>.csv`` and ``yhat0-<file_name>.csv`` with R's
    ``read.csv`` and then binds the R variables ``A`` and ``y`` (first and
    second column of the ``A_y`` file), ``ps``, ``yhat1`` and ``yhat0``.

    Parameters
    ----------
    file_name : str
        Suffix shared by the four CSV file names.

    Returns
    -------
    None
    """
    r_commands = (
        # Load each CSV into its own R data frame.
        f"A_y_data <- read.csv('A_y-{file_name}.csv')",
        f"ps_data <- read.csv('ps-{file_name}.csv')",
        f"yhat1_data <- read.csv('yhat1-{file_name}.csv')",
        f"yhat0_data <- read.csv('yhat0-{file_name}.csv')",
        # Expose the variables under the names the estimator call uses.
        'A = A_y_data[, 1]',
        'y = A_y_data[, 2]',
        'ps = ps_data',
        'yhat1 = yhat1_data',
        'yhat0 = yhat0_data',
    )

    for command in r_commands:
        ro.r(command)


In [58]:
# g.hat <- matrix(0, n, g.col)

# Qs = do.call(cbind, Qs)
# ps = do.call(cbind, ps)
# g.hat = cbind(Qs, ps)

def run_MR_in_R():
    """Define the R function ``MREst.mean(y, A, Qs, ps)`` in the embedded R session.

    Despite its name, this function does not compute anything: it only
    evaluates the R source below, which creates ``MREst.mean``. That R function

      1. column-binds ``Qs`` and ``ps``, centres every column, and keeps the
         rows where ``A == 1``;
      2. finds ``rho.hat`` with ``constrOptim`` by minimising
         ``-sum(log(1 + ghat %*% rho))`` under the linear constraints
         ``ghat %*% rho >= 1/m - 1`` (``m = sum(A)``);
      3. turns ``rho.hat`` into weights normalised to sum to one and returns
         ``list(estimate = sum(y[A == 1] * wts), weights = wts)``.

    Returns
    -------
    None
    """
    mrest_mean_source = """
    MREst.mean <- function(y, A, Qs, ps)
    {
        
        J <- NCOL(ps)
        K <- NCOL(Qs)
        g.col <- J + K 
        n <- length(y)

        m <- sum(A) # number of observed subjects
        
        g.hat = cbind(Qs, ps)
        g.hat <- scale(g.hat, center = TRUE, scale = FALSE)[A == 1, ]
        
        # define the function to be minimized
        Fn <- function(rho, ghat){ -sum(log(1 + ghat %*% rho)) }
        Grd <- function(rho, ghat){ -colSums(ghat / c(1 + ghat %*% rho)) }
        
        # calculate the weights
        rho.hat <- constrOptim(theta = rep(0, g.col), 
                             f = Fn, 
                             grad = Grd, 
                             ui = g.hat, 
                             ci = rep(1 / m - 1, m), 
                             ghat = g.hat
                             )$par
        wts <- c(1 / m / (1 + g.hat %*% rho.hat))
        wts <- wts / sum(wts)
        estimate <- sum(y[A == 1] * wts)
        return(list(estimate = estimate, weights = wts))

    }
    """
    ro.r(mrest_mean_source)
    

In [53]:
def MR(y, A, yhat1_df, yhat0_df, ps_df, temporary_csv_name):
    """Return the difference of two ``MREst.mean`` estimates computed in R.

    Steps, in order: define ``MREst.mean`` in R, write the inputs to temporary
    CSV files, read them back inside R, load the ``MultiRobust`` R package, then
    evaluate ``MREst.mean`` once with ``(A, yhat1, ps)`` and once with
    ``(1 - A, yhat0, 1 - ps)``.

    Parameters
    ----------
    y, A : array-like
        Per-observation values, forwarded to ``save_temporary_data_for_R``.
    yhat1_df, yhat0_df, ps_df : pandas.DataFrame
        Candidate model columns, forwarded to ``save_temporary_data_for_R``.
        (Presumably outcome predictions under A=1 / A=0 and propensity
        scores -- confirm against the code that builds them.)
    temporary_csv_name : str
        Suffix of the temporary CSV files exchanged with R.

    Returns
    -------
    scalar
        First estimate minus second estimate.
    """
    run_MR_in_R()

    # Python -> CSV -> R hand-off of the inputs.
    save_temporary_data_for_R(yhat1_df, yhat0_df, ps_df, A, y, file_name=temporary_csv_name)
    read_temporary_data_in_R(temporary_csv_name)

    importr("MultiRobust")
    ro.r('library("MultiRobust")')

    arm_calls = (
        'result <- MREst.mean(y=y, A=A, Qs=yhat1, ps=ps)',
        'result <- MREst.mean(y=y, A=1-A, Qs=yhat0, ps=1-ps)',
    )

    arm_estimates = []
    for r_call in arm_calls:
        ro.r(r_call)
        # The R result is array-like; take its single element.
        arm_estimates.append(np.array(ro.r("result$estimate"))[0])

    first_estimate, second_estimate = arm_estimates
    return first_estimate - second_estimate

In [59]:
# Run the estimator with two candidate columns per input: the `oracle` column and
# one `nn..._H_300_300_300_L1_0.01_L1TG_0.0` column each for yhat1, yhat0 and ps.
# y and A are passed as single-column DataFrames (double brackets).
# NOTE(review): `data` is not defined in any cell above this one; the only visible
# assignment is the pd.read_csv cell further down, so on a fresh kernel that cell
# presumably has to run first -- confirm the intended cell order.
MR(y=data[['y']], 
   A=data[['A']], 
   yhat1_df=data[['oracle_yhat1', 'nnY_H_300_300_300_L1_0.01_L1TG_0.0_yhat1']],
   yhat0_df=data[['oracle_yhat0', 'nnY_H_300_300_300_L1_0.01_L1TG_0.0_yhat0']],
   ps_df=data[['ps_oracle', 'ps_nnA_H_300_300_300_L1_0.01_L1TG_0.0']],
   temporary_csv_name='temporary-MR-data')


1.0323983886865937

In [9]:
from IPython.display import display
import importlib

# Re-import the project helper modules so that edits made on disk take effect
# in the running kernel.
importlib.reload(utils)
importlib.reload(training)

temp_results1 = list()

# Simulation settings; r3 and r4 are part of the results directory name.
r1 = r2 = 0
r3 = r4 = 0.25

nonlin_portion = .3
True_TE = 1

# Only the first (n, p) setting and the first replicate are loaded: both loops
# exit right after the first file has been read.
# NOTE(review): the path is an absolute location on one machine; consider a
# configurable data directory.
for n, p in ((7500, 300), (750, 32)):

    covariates_colnames = [f'x{jj}' for jj in range(p)]

    for niter in range(1, 2):

        filename = (
            f"/home/mr/PhD/Causality in AI/Sim2021/featureImp_dim{n}_{p}_r_{r3}_{r4}"
            f"/KfoldQ_ps_nns{niter}.csv"
        )

        data = pd.read_csv(filename)
        print(data.shape)
        break
    break

(7500, 402)


In [92]:
# Run the estimator using every non-oracle candidate column: columns are selected
# by substring match on 'yhat1', 'yhat0' and 'ps', skipping names containing 'oracle'.
# NOTE(review): the substring test 'ps' in c matches any column whose name contains
# "ps" anywhere -- confirm that no unrelated column is picked up.
# NOTE(review): the recorded output below (about 3.15) is far from True_TE = 1 set
# in the data-loading cell -- worth confirming whether that is expected.
MR(y=data['y'], 
   A=data['A'], 
   yhat1_df=data[[c for c in data.columns if 'yhat1' in c and 'oracle' not in c]],
   yhat0_df=data[[c for c in data.columns if 'yhat0' in c and 'oracle' not in c]], 
   ps_df=data[[c for c in data.columns if 'ps' in c and 'oracle' not in c]], 
   temporary_csv_name='temporary-MR-data')

3.14733046735185

In [87]:
def mr_bootstrap(y, A, yhat1_df, yhat0_df, ps_df, temporary_csv_name, b=1000):
    """Percentile interval of ``MR`` over random re-draws of the candidate columns.

    In each of ``b`` repetitions, ``N`` column names are drawn with replacement
    from each of ``yhat1_df``, ``yhat0_df`` and ``ps_df`` (``N`` is the number of
    columns of ``yhat1_df``), and ``MR`` is evaluated on the selected columns.
    The rows (``y``, ``A``) are never resampled; only the set of candidate
    columns changes between repetitions.

    Parameters
    ----------
    y, A : array-like
        Forwarded unchanged to ``MR``.
    yhat1_df, yhat0_df, ps_df : pandas.DataFrame
        Pools of candidate columns to draw from.
    temporary_csv_name : str
        Suffix of the temporary CSV files, forwarded to ``MR``.
    b : int, default 1000
        Number of repetitions.

    Returns
    -------
    tuple
        ``(2.5% quantile, 97.5% quantile)`` of the ``b`` estimates.
    """
    # Imported locally because the notebook's import cell never imports `random`.
    import random

    N = yhat1_df.shape[1]
    estimates = []

    for _ in range(b):

        # Sampling with replacement: the same column may be selected more than once.
        random_cols_yhat1 = random.choices(list(yhat1_df.columns), k=N)
        random_cols_yhat0 = random.choices(list(yhat0_df.columns), k=N)
        random_cols_ps = random.choices(list(ps_df.columns), k=N)

        estimates.append(
            MR(y=y,
               A=A,
               yhat1_df=yhat1_df[random_cols_yhat1],
               yhat0_df=yhat0_df[random_cols_yhat0],
               ps_df=ps_df[random_cols_ps],
               # Honour the caller's file suffix instead of a hard-coded name.
               temporary_csv_name=temporary_csv_name)
        )

    estimates = pd.Series(estimates, dtype=float)

    return estimates.quantile(.025), estimates.quantile(.975)


In [None]:
# Percentile interval from 1000 repetitions, restricted to the first five
# non-oracle columns of each kind ([:5] on the matching column names).
# Each repetition calls MR, which writes the temporary CSV files and evaluates
# the estimator in R, so this cell is expected to be slow.
# NOTE(review): this cell has no execution count or recorded output -- it was
# apparently never run to completion in the saved notebook.
mr_boot = mr_bootstrap(y=data['y'], 
                       A=data['A'], 
                       yhat1_df=data[[c for c in data.columns if 'yhat1' in c and 'oracle' not in c][:5]],
                       yhat0_df=data[[c for c in data.columns if 'yhat0' in c and 'oracle' not in c][:5]], 
                       ps_df=data[[c for c in data.columns if 'ps' in c and 'oracle' not in c][:5]], 
                       temporary_csv_name='temporary-MR-data',
                       b=1000
                      )