In [None]:
import numpy as np
import math
import pickle
import itertools
import time
import datetime
import cvxpy as cvx
import mosek
import copy

import mkl
import pickle
import os
import ray
import warnings
warnings.filterwarnings("ignore")

import subprocess
subprocess.call('bash convert_files.sh', shell=True)
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from numpy import transpose as trans
from auxiliary import is_pos_def, cond, rotate_matrix, gen_train_data, gen_test_data

In [None]:
# Start the Ray runtime that every @ray.remote task defined below is submitted to.
# NOTE(review): the object-store size, the CPU count and the Redis password are
# hard-coded here; they look machine-specific, and the password is a credential
# committed in plain text -- consider reading them from configuration instead.
ray.init(object_store_memory=int(7e10), num_cpus=48,  redis_password="password54322423")

In [None]:
def fit_lasso(train_data, sigma, cv, fit_intercept, alpha_scaling, n_folds=5):
    r"""Fit a Lasso regression on ``train_data`` and return the fitted estimator.

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``; ``X_train.shape`` must unpack to ``(n, p)``.
    sigma : float
        Noise level; only enters the fixed penalty used when ``cv`` is false.
    cv : bool
        If true, the penalty is chosen by ``n_folds``-fold cross-validation
        over a fixed log-spaced grid (``LassoCV``). Otherwise the penalty is
        ``alpha_scaling * sigma * \sqrt{2 \log(p) / n}``.
    fit_intercept : bool
        Forwarded unchanged to the estimator. There is no default here; the
        caller decides.
    alpha_scaling : float
        Multiplier of the fixed penalty (ignored when ``cv`` is true).
    n_folds : int
        Number of cross-validation folds (ignored when ``cv`` is false).

    Returns
    -------
    The fitted ``Lasso`` (``cv`` false) or ``LassoCV`` (``cv`` true) object.
    """
    # Imported locally so the function is self-contained.
    from sklearn.linear_model import Lasso, LassoCV

    warnings.filterwarnings("ignore")

    X_train, y_train = train_data
    n, p = X_train.shape

    if cv:
        # Cross-validated penalty over a fixed grid of 100 candidates.
        lasso = LassoCV(max_iter=5000, cv=n_folds, alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed penalty: alpha_scaling * sigma * sqrt(2 log(p) / n).
        alpha = alpha_scaling * sigma * math.sqrt(2 * (math.log(p) / n))
        lasso = Lasso(alpha=alpha, max_iter=5000, fit_intercept=fit_intercept)

    lasso.fit(X_train, y_train)
    return lasso

In [None]:
@ray.remote
def fit_par_lasso(train_data, sigma, cv, fit_intercept, threads, alpha_scaling, n_folds=5, max_calls=1):
    r"""Ray task that fits a Lasso regression and returns the fitted estimator.

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``; ``X_train.shape`` must unpack to ``(n, p)``.
    sigma : float
        Noise level; only enters the fixed penalty used when ``cv`` is false.
    cv : bool
        If true, the penalty is chosen by ``n_folds``-fold cross-validation
        over a fixed log-spaced grid. Otherwise the penalty is
        ``alpha_scaling * sigma * \sqrt{2 \log(p) / n}``.
    fit_intercept : bool
        Forwarded unchanged to the estimator (no default).
    threads : int
        Passed to ``mkl.set_num_threads`` before fitting.
    alpha_scaling : float
        Multiplier of the fixed penalty (ignored when ``cv`` is true).
    n_folds : int
        Number of cross-validation folds (ignored when ``cv`` is false).
    max_calls : int
        Accepted but never read by the body.
        NOTE(review): this looks like it was meant to be a ``ray.remote``
        decorator option rather than a task argument -- confirm.

    Returns
    -------
    The fitted ``Lasso`` or ``LassoCV`` object.
    """
    from sklearn.linear_model import Lasso, LassoCV

    mkl.set_num_threads(threads)
    warnings.filterwarnings("ignore")

    X_train, y_train = train_data
    n, p = X_train.shape

    if cv:
        # Cross-validated penalty over a fixed grid of 100 candidates.
        lasso = LassoCV(max_iter=5000, cv=n_folds, alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed penalty: alpha_scaling * sigma * sqrt(2 log(p) / n).
        alpha = alpha_scaling * sigma * math.sqrt(2 * (math.log(p) / n))
        lasso = Lasso(alpha=alpha, max_iter=5000, fit_intercept=fit_intercept)

    # Fit on private copies of the inputs because the task runs in parallel.
    lasso.fit(X_train.copy(), y_train.copy())
    return lasso

In [None]:
def fit_ridge(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0):
    """Fit a ridge regression on ``train_data`` and return the fitted estimator.

    train_data    -- ``(X_train, y_train)``; ``X_train.shape`` must unpack to two values.
    sigma         -- accepted for signature parity with the Lasso fitter; not used here.
    cv            -- if true, the penalty is picked by ``RidgeCV`` from a fixed
                     log-spaced grid; otherwise ``alpha_scaling`` is the penalty.
    fit_intercept -- forwarded unchanged to the estimator (no default).
    alpha_scaling -- ridge penalty used when ``cv`` is false.
    """
    from sklearn.linear_model import Ridge, RidgeCV

    warnings.filterwarnings("ignore")

    X_train, y_train = train_data
    n, p = X_train.shape

    if cv:
        # Cross-validated penalty over a fixed grid of 100 candidates.
        ridge = RidgeCV(alphas=np.logspace(-2, 6, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed penalty supplied by the caller.
        ridge = Ridge(alpha=alpha_scaling, fit_intercept=fit_intercept)

    ridge.fit(X_train, y_train)
    return ridge

In [None]:
@ray.remote
def fit_par_ridge(train_data, sigma, cv, fit_intercept, threads, alpha_scaling=1.0, max_calls=1):
    """Ray task that fits a ridge regression and returns the fitted estimator.

    train_data    -- ``(X_train, y_train)``; ``X_train.shape`` must unpack to two values.
    sigma         -- accepted but not used by the ridge fit.
    cv            -- if true, the penalty is picked by ``RidgeCV`` from a fixed
                     log-spaced grid; otherwise ``alpha_scaling`` is the penalty.
    fit_intercept -- forwarded unchanged to the estimator (no default).
    threads       -- passed to ``mkl.set_num_threads`` before fitting.
    alpha_scaling -- ridge penalty used when ``cv`` is false.
    max_calls     -- accepted but never read by the body.
                     NOTE(review): presumably intended as a ``ray.remote``
                     option rather than a task argument -- confirm.
    """
    from sklearn.linear_model import Ridge, RidgeCV

    # Cap the number of threads used inside this task.
    mkl.set_num_threads(threads)

    X_train, y_train = train_data
    n, p = X_train.shape

    warnings.filterwarnings("ignore")

    if cv:
        # Cross-validated penalty over a fixed grid of 100 candidates.
        ridge = RidgeCV(alphas=np.logspace(-2, 6, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed penalty supplied by the caller.
        ridge = Ridge(alpha=alpha_scaling, fit_intercept=fit_intercept)

    # Copy the data since this runs in parallel.
    ridge.fit(X_train.copy(), y_train.copy())
    return ridge

In [None]:
def fit_all_f_lassos(train_data, sigma, cv, fit_intercept, n_splits, threads, alpha_scaling, n_folds=5):
    """Fit one f Lasso per cross-fitting fold, in parallel, for the OM estimator.

    The data are split with ``KFold(n_splits)``; a ``fit_par_lasso`` task is
    launched on the training part of each split. Returns the fitted models
    in fold order.
    """
    X, y = train_data
    folds = KFold(n_splits=n_splits)

    pending = []
    for fit_rows, _held_out in folds.split(X):
        fold_data = (X[fit_rows, :], y[fit_rows])
        pending.append(
            fit_par_lasso.remote(fold_data, sigma, cv, fit_intercept, threads=threads, alpha_scaling=alpha_scaling, n_folds=n_folds)
        )

    # Block until every fold has been fitted.
    return ray.get(pending)

In [None]:
def fit_all_f_ridges(train_data, sigma, cv, fit_intercept, n_splits, threads, alpha_scaling=1.0):
    """Fit one f ridge regression per cross-fitting fold, in parallel, for the OM estimator.

    The data are split with ``KFold(n_splits)``; a ``fit_par_ridge`` task is
    launched on the training part of each split. Returns the fitted models
    in fold order.
    """
    X, y = train_data
    folds = KFold(n_splits=n_splits)

    pending = []
    for fit_rows, _held_out in folds.split(X):
        fold_data = (X[fit_rows, :], y[fit_rows])
        pending.append(
            fit_par_ridge.remote(fold_data, sigma, cv, fit_intercept, threads=threads, alpha_scaling=alpha_scaling)
        )

    # Block until every fold has been fitted.
    return ray.get(pending)

In [None]:
@ray.remote
def fit_g_ridge(train_data, test_point, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, max_calls=1):
    """Ray task: fit the auxiliary g ridge regressions for one test point.

    The training features are multiplied by the inverse of
    ``||test_point|| * rotate_matrix(test_point)``. The first rotated column
    ("t") is then regressed on the remaining columns ("x"), once per
    ``KFold(n_splits)`` training split. Returns the list of fitted models in
    fold order.

    NOTE(review): ``max_calls`` is never read by the body; it presumably was
    meant as a ``ray.remote`` option -- confirm.
    """
    # Cap the number of threads used inside this task.
    mkl.set_num_threads(threads)
    warnings.filterwarnings("ignore")

    X = train_data
    kf = KFold(n_splits=n_splits)  # n_splits-fold cross-fitting

    # Scaled rotation matrix U and its inverse.
    U = np.linalg.norm(test_point, 2) * rotate_matrix(test_point)
    U_inv = np.linalg.inv(U)

    # Rotated design; column 0 is the "T" feature, the rest are the "X" features.
    rotated = X @ U_inv
    t = rotated[:, 0]
    x = rotated[:, 1:]

    model_ts = []
    for train_index, _test_index in kf.split(X):
        fold_model = fit_ridge(
            (x[train_index, :], t[train_index]),
            sigma=sigma,
            cv=cv,
            fit_intercept=fit_intercept,
            alpha_scaling=alpha_scaling,
        )
        model_ts.append(fold_model)

    return model_ts

In [None]:
def fit_all_g_ridges(train_data, test_data, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, n_folds=5):
    """Fit the auxiliary g ridge regressions for every row of ``test_data``.

    Launches one ``fit_g_ridge`` task per test row and collects the results
    as they finish. Returns a list whose i-th entry is the task result for
    test row i. ``n_folds`` is accepted but not used.
    """
    warnings.filterwarnings("ignore")
    X_train, X_test = train_data, test_data
    num_train, p = X_train.shape
    num_test, p = X_test.shape

    # Map each launched task id to the test row it belongs to.
    id_dict = {}
    for row in range(num_test):
        task_id = fit_g_ridge.remote(train_data=train_data, test_point=X_test[row, :], sigma=sigma, cv=cv, fit_intercept=fit_intercept, threads=threads, n_splits=n_splits, alpha_scaling=alpha_scaling)
        id_dict[task_id] = row

    pending = list(id_dict.keys())
    # Keep the full id list: `pending` shrinks below, and all ids are freed at the end.
    old_ids = copy.deepcopy(pending)

    stuff = [None] * num_test
    while pending:
        finished, pending = ray.wait(pending)
        for task_id in finished:
            stuff[id_dict[task_id]] = ray.get(task_id)
    ray.internal.free(old_ids)

    return stuff

In [None]:
@ray.remote
def fit_g_lasso(train_data, test_point, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, n_folds=5, max_calls=1):
    """Ray task: fit the auxiliary g Lasso regressions for one test point.

    The training features are multiplied by the inverse of
    ``||test_point|| * rotate_matrix(test_point)``. The first rotated column
    ("t") is then regressed on the remaining columns ("x"), once per
    ``KFold(n_splits)`` training split. Returns the list of fitted models in
    fold order.

    NOTE(review): ``max_calls`` is never read by the body; it presumably was
    meant as a ``ray.remote`` option -- confirm.
    """
    # Cap the number of threads used inside this task.
    mkl.set_num_threads(threads)
    warnings.filterwarnings("ignore")

    X = train_data
    kf = KFold(n_splits=n_splits)

    # Scaled rotation matrix U and its inverse.
    U = np.linalg.norm(test_point, 2) * rotate_matrix(test_point)
    U_inv = np.linalg.inv(U)

    # Rotated design; column 0 is the "T" feature, the rest are the "X" features.
    rotated = X @ U_inv
    t = rotated[:, 0]
    x = rotated[:, 1:]

    model_ts = []
    for train_index, _test_index in kf.split(X):
        fold_model = fit_lasso(
            (x[train_index, :], t[train_index]),
            sigma=sigma,
            cv=cv,
            fit_intercept=fit_intercept,
            alpha_scaling=alpha_scaling,
            n_folds=n_folds,
        )
        model_ts.append(fold_model)

    return model_ts

In [None]:
def fit_all_g_lassos(train_data, test_data, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, n_folds=5):
    """Fit the auxiliary g Lasso regressions for every row of ``test_data``.

    Launches one ``fit_g_lasso`` task per test row and collects the results
    as they finish.

    Parameters
    ----------
    train_data : array with ``.shape == (num_train, p)``
    test_data : array with ``.shape == (num_test, p)``
    sigma, cv, fit_intercept, threads, n_splits, alpha_scaling
        Forwarded unchanged to every ``fit_g_lasso`` task.
    n_folds : int
        Forwarded to every task. Previously it was accepted but dropped, so
        the task always ran with its own default.

    Returns
    -------
    list
        Entry i is the result of the task launched for test row i.
    """
    warnings.filterwarnings("ignore")
    X_train = train_data
    X_test = test_data
    num_train, p = X_train.shape
    num_test, p = X_test.shape

    stuff = num_test * [None]

    # Map each launched task id to the test row it belongs to.
    id_dict = {
        fit_g_lasso.remote(
            train_data=train_data,
            test_point=X_test[i, :],
            sigma=sigma,
            cv=cv,
            fit_intercept=fit_intercept,
            threads=threads,
            n_splits=n_splits,
            alpha_scaling=alpha_scaling,
            n_folds=n_folds,
        ): i
        for i in range(num_test)
    }
    ids = list(id_dict.keys())
    # Keep the full id list: `ids` shrinks below, and all ids are freed at the end.
    old_ids = copy.deepcopy(ids)

    while len(ids) > 0:
        ready_ids, ids = ray.wait(ids)
        for ready_id in ready_ids:
            stuff[id_dict[ready_id]] = ray.get(ready_id)
    ray.internal.free(old_ids)

    return stuff

In [None]:
@ray.remote 
def fit_f_moms_1f(data, sigma, test_point, g_bases, f_bases, cv, fit_intercept, n_splits, threads, max_calls=1):
    r"""Ray task: OM prediction for a single test point using the f moments.

    All data are rotated into the basis where ``test_point`` points along e_1.
    The prediction is then formed from the cross-fitted moments

        (y - \theta t - z^\top f) (t - g(z)).

    Here f(z) is estimated by regressing y on (t, z) and dropping t.

    Parameters
    ----------
    data : tuple
        Training data ``(X, y)``.
    sigma : float
        Additive noise level (not used by the body).
    test_point : array
        Prediction direction.
    g_bases : sequence
        One fitted g model per fold; each must provide ``.predict``.
    f_bases : sequence
        One fitted f model per fold, in the original basis; each must provide
        ``.coef_`` and ``.intercept_``.
    cv, fit_intercept
        Accepted for signature parity; not used by the body.
    n_splits : int
        Number of ``KFold`` splits. The folds must line up with those used to
        fit ``g_bases`` and ``f_bases``.
    threads : int
        Passed to ``mkl.set_num_threads``.
    max_calls : int
        Accepted but never read.
        NOTE(review): presumably meant as a ``ray.remote`` option -- confirm.

    Returns
    -------
    tuple
        ``(pred1, num1, denom1)`` with ``pred1 = num1 / denom1``.
    """
    mkl.set_num_threads(threads)  # set number of threads for this task

    # Scaled rotation matrix U and its inverse.
    X, y = data
    test_norm = np.linalg.norm(test_point, 2)
    U = test_norm * rotate_matrix(test_point)
    U_inv = np.linalg.inv(U)

    # Rotated design; column 0 is the "T" feature, the rest are the "X" features.
    rotated = X @ U_inv
    Trot, Xrot = rotated[:, 0], rotated[:, 1:]

    # Held-out residuals of the t- and y-regressions, filled fold by fold.
    res_t = np.zeros(X.shape[0])
    res_y = np.zeros(X.shape[0])

    kf = KFold(n_splits=n_splits)  # first-stage cross-fitting
    for fold, (_train_index, test_index) in enumerate(kf.split(X)):
        # Only the held-out rows are needed: the models are already fitted.
        x_test, t_test, y_test = Xrot[test_index, :], Trot[test_index], y[test_index]

        # Rotate the fold's f coefficients into the new basis and drop the t coefficient.
        model_y_orig = f_bases[fold]
        y_coef_rot = (U @ model_y_orig.coef_)[1:]
        y_int = model_y_orig.intercept_

        model_t = g_bases[fold]
        res_t[test_index] = (t_test - model_t.predict(x_test)).flatten()
        res_y[test_index] = (y_test - (x_test @ y_coef_rot + y_int)).flatten()

    num1 = np.mean(res_y * res_t)
    denom1 = np.mean(Trot * res_t)
    pred1 = num1 / denom1

    return pred1, num1, denom1

In [None]:
def fit_all_f_moms_1f(train_data, X_test, g_bases, f_bases, sigma, cv, fit_intercept, n_splits, threads):
    """Predict over the whole test set with already-fitted f and g models.

    Launches one ``fit_f_moms_1f`` task per row of ``X_test``; row i uses
    ``g_bases[i]`` and the shared ``f_bases``. Returns three arrays
    ``(preds, nums, denoms)`` ordered like the rows of ``X_test``.
    """
    X_train, y_train = train_data
    _, p = X_train.shape
    num_test, _ = X_test.shape

    # Launch the per-row prediction tasks and remember which row each id belongs to.
    id_dict = {}
    for row in range(num_test):
        task_id = fit_f_moms_1f.remote(train_data, sigma, X_test[row, :], g_bases[row], f_bases, cv=cv, fit_intercept=fit_intercept, n_splits=n_splits, threads=threads)
        id_dict[task_id] = row

    pending = list(id_dict.keys())
    # Keep the full id list: `pending` shrinks below, and all ids are freed at the end.
    old_ids = copy.deepcopy(pending)

    stuff = [None] * num_test
    while pending:
        finished, pending = ray.wait(pending)
        for task_id in finished:
            stuff[id_dict[task_id]] = ray.get(task_id)

    # Transpose the list of (pred, num, denom) triples into three sequences.
    pred1s, num1s, denom1s = zip(*stuff)
    ray.internal.free(old_ids)

    return (np.array(pred1s), np.array(num1s), np.array(denom1s))

In [None]:
def run_OM_expt(p, train_n, test_n, s, train_dist, test_dist, cv, fit_intercepts, n_splits, x_scale_test, beta_scale, sigma, alpha_scalings, main_reg_params, aux_reg_params, seed, threads):
    """Run one OM experiment for a single problem instance (given p, n, s).

    Parameters
    ----------
    fit_intercepts : tuple
        ``(fit_intercept_main, fit_intercept_g)``; the main flag must be falsy.
    alpha_scalings : tuple
        ``(alpha1, alpha2)`` regularization scalings for the main (f) and
        auxiliary (g) regressions respectively.
    main_reg_params, aux_reg_params : dict
        Each must contain ``"method"`` equal to ``"Lasso"`` or ``"Ridge"``.
    seed : int
        Seeds ``np.random`` before the data are generated.
    (remaining arguments are forwarded to the data generators and fitters)

    Returns
    -------
    tuple
        ``(fo1f, base_preds, y_test, kappa, mu_y)`` where ``fo1f`` is the
        ``(preds, nums, denoms)`` triple from ``fit_all_f_moms_1f``,
        ``base_preds`` are the plain base-regression predictions,
        ``kappa = cond(X_train)`` and ``mu_y`` is the training-response mean
        that was subtracted before fitting.

    Raises
    ------
    ValueError
        If a regression method is neither ``"Lasso"`` nor ``"Ridge"``.
        Previously this surfaced later as an unbound-variable error.
    AssertionError
        If ``fit_intercept_main`` is truthy.
    """
    warnings.filterwarnings("ignore")

    # Whether intercepts should be fit, and the scaling of the regularizers.
    fit_intercept_main, fit_intercept_g = fit_intercepts
    alpha1, alpha2 = alpha_scalings

    # Methods of the main (f) and auxiliary (g) regressions; validated up front.
    main_reg = main_reg_params["method"]
    aux_reg = aux_reg_params["method"]
    if main_reg not in ("Lasso", "Ridge"):
        raise ValueError("Unknown main regression method: {!r}".format(main_reg))
    if aux_reg not in ("Lasso", "Ridge"):
        raise ValueError("Unknown auxiliary regression method: {!r}".format(aux_reg))

    # The original `assert(cond, msg)` asserted a non-empty tuple and could never fail.
    assert not fit_intercept_main, "This should be set to false for all f regression. g_lasso has been manually set to include this"

    # Generate training and test data.
    np.random.seed(seed)
    # The kappa returned by the generator is discarded; cond(X_train) is reported instead.
    X_train, y_train, coef, scaler, _ = gen_train_data(n=train_n, p=p, s=s, train_dist=train_dist, x_scale=1.0, beta_scale=beta_scale, sigma=sigma)
    X_test, y_test = gen_test_data(test_n, p, s, coef, X_scaler=scaler, test_dist=test_dist, x_scale=x_scale_test, sigma=0*sigma)

    # Demean y values.
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    # Cross-fitted main (f) regressions.
    if main_reg == "Lasso":
        f_base = fit_all_f_lassos((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads, alpha_scaling=alpha1, n_folds=5)
    else:
        f_base = fit_all_f_ridges((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads, alpha_scaling=alpha1)

    # Cross-fitted auxiliary (g) regressions, one set per test point.
    if aux_reg == "Lasso":
        g_base = fit_all_g_lassos(X_train, test_data=X_test, sigma=sigma, cv=cv, fit_intercept=fit_intercept_g, n_splits=n_splits, threads=threads, alpha_scaling=alpha2, n_folds=5)
    else:
        g_base = fit_all_g_ridges(X_train, test_data=X_test, sigma=sigma, cv=cv, fit_intercept=fit_intercept_g, n_splits=n_splits, threads=threads, alpha_scaling=alpha2)

    fo1f = fit_all_f_moms_1f(train_data=(X_train, y_train),  X_test=X_test, g_bases=g_base, f_bases=f_base, sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads)

    # Plain base regression on the full training set, for comparison.
    if main_reg == "Lasso":
        base = fit_lasso((X_train, y_train), sigma, alpha_scaling=alpha1, cv=cv, fit_intercept=fit_intercept_main, n_folds=5)
    else:
        base = fit_ridge((X_train, y_train), sigma, alpha_scaling=alpha1, cv=cv, fit_intercept=fit_intercept_main)
    base_preds = base.predict(X_test)

    kappa = cond(X_train)

    return fo1f, base_preds, y_test, kappa, mu_y

In [None]:
def save_OM_expts(p_list, n_list, s_list, train_dist, test_dist, cv, fit_intercepts, n_splits, x_scale_test, beta_scale, sigma, alpha_scalings, main_reg_params, aux_reg_params, seeds, threads, reps, folder_path):
    """Run repeated OM experiments over (p, n, s) and pickle the results.

    Parameters
    ----------
    p_list, s_list : iterables
        Values of p and s to sweep.
    n_list : sequence of ``(train_n, test_n)`` pairs (must be non-empty).
    alpha_scalings : tuple
        ``(index, scalings)``; the index is stored as metadata and the
        scalings are forwarded to ``run_OM_expt``.
    seeds : sequence
        ``seeds[rep]`` seeds repetition ``rep``.
    reps : int
        Number of repetitions per (p, n, s) setting.
    folder_path : str
        Existing directory that receives ``OM_<timestamp>.pickle``.
    (remaining arguments are stored as metadata and forwarded to ``run_OM_expt``)

    Returns
    -------
    OrderedDict
        The saved metadata plus, for every ``(p, train_n, s)`` that ran at
        least one repetition, the list
        ``[preds, nums, denoms, base_preds, y_tests, kappas]``.
    """
    save_data = OrderedDict()

    fit_intercept_main, fit_intercept_g = fit_intercepts
    save_data["train_dist"]=str(train_dist)
    save_data["test_dist"]=str(test_dist)
    save_data["method"]="f moments"
    save_data["main_reg"]="CV"+str(cv)
    save_data["fit_intercepts"]=str(fit_intercepts)
    save_data["format_keys"] = "p,n,s"
    save_data["output"] = "fo1f, lasso_preds, y_test, kappa, mu_y" + ";" + "preds, num, denom"
    save_data["n_splits"] = n_splits
    save_data["aux_reg_params"] = str(aux_reg_params)
    save_data["test_n"] = str(n_list[0][1])
    save_data["seeds"] = seeds
    save_data["scale_X"] = True
    save_data["x_scale_test"] = x_scale_test
    save_data["beta_scale"] = beta_scale
    save_data["sigma"] = sigma
    save_data["alpha_scalings"] = alpha_scalings[0]
    save_data["reps"] = reps
    save_data["base_regression"] = main_reg_params["method"]

    save_data["p_list"] = p_list
    save_data["n_list"] = n_list
    save_data["s_list"] = s_list

    print("Starting")
    for p in p_list:
        for train_n, test_n in n_list:
            for s in s_list:
                OML_fo1f_preds=[]
                OML_fo1f_nums=[]
                OML_fo1f_denoms=[]

                base_preds=[]
                y_tests=[]
                kappas=[]
                print(str(p)+","+str(train_n)+","+str(s))

                for rep in range(reps):
                    fo1f, base_pred, y_test, kappa, mu_y = run_OM_expt(p, train_n, test_n, s, train_dist=train_dist, test_dist=test_dist, cv=cv, fit_intercepts=fit_intercepts, n_splits=n_splits, x_scale_test=x_scale_test, beta_scale=beta_scale, sigma=sigma, alpha_scalings=alpha_scalings[1], main_reg_params=main_reg_params,
                                                                               aux_reg_params=aux_reg_params, threads=threads, seed=seeds[rep])
                    # Predictions were made on demeaned responses; add the mean back.
                    OML_fo1f_preds.append(fo1f[0]+mu_y)
                    OML_fo1f_nums.append(fo1f[1])
                    OML_fo1f_denoms.append(fo1f[2])

                    base_preds.append(base_pred+mu_y)
                    y_tests.append(y_test)
                    kappas.append(kappa)

                    # Re-registered after every repetition. The lists are stored by
                    # reference, so the entry always reflects all repetitions so far.
                    save_data[(p,train_n,s)] = [
                            OML_fo1f_preds,
                            OML_fo1f_nums,
                            OML_fo1f_denoms,
                            base_preds,
                            y_tests,
                            kappas]

    print("Saving Data")
    # The file name has one-second resolution; the pause keeps consecutive saves distinct.
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = "OM"+"_"+timestr+".pickle"
    file_path = os.path.join(folder_path, file_name)
    # Context manager so the handle is flushed and closed even if pickling fails.
    with open(file_path, "wb") as out_file:
        pickle.dump(save_data, out_file)

    return save_data

In [None]:
@ray.remote
def invert(X, x_star, lam, threads=1):
    """Ray task that solves the JM program for one test direction.

    Minimizes ``w' Sigma_n w`` subject to
    ``||Sigma_n w - x_star||_inf <= lam``, where
    ``Sigma_n = X'X / n + eps * I``.

    Parameters
    ----------
    X : array with ``.shape == (n, p)``
    x_star : array of length p
    lam : float
        Right-hand side of the sup-norm constraint.
    threads : int
        Thread count for MKL and for the MOSEK solver.

    Returns
    -------
    tuple
        ``(objective_value, w)`` as reported by the solver, or
        ``(math.inf, None)`` if building or solving the problem raised.
    """
    warnings.filterwarnings("ignore")
    mkl.set_num_threads(threads)
    try:
        n, p = X.shape
        eps = 1e-12
        # Adding eps*I makes the matrix strictly p.s.d.; otherwise cvxpy throws a
        # not-DCP error (it cannot recognize the convexity of the objective).
        Sigma_n = 1/float(n) * np.transpose(X) @ X  + eps*np.eye(p)

        w = cvx.Variable(p)
        obj = cvx.Minimize(cvx.quad_form(w, Sigma_n))
        # NOTE(review): `*` is used as a matrix product here; newer cvxpy releases
        # reportedly prefer `@` -- confirm against the installed version.
        const = [cvx.norm(Sigma_n * w  - x_star, "inf") <= lam]

        prob = cvx.Problem(obj, const)
        sol = prob.solve(solver=cvx.MOSEK, mosek_params={mosek.iparam.num_threads: threads})

        return sol, w.value
    except Exception:
        # Best effort: a failed solve is reported as infeasible instead of failing
        # the task. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return math.inf, None

In [None]:
def compute_ws(X_train, X_test, lams, threads):
    """Compute the debiasing vectors w for every row of ``X_test``.

    One ``invert`` task is launched per test row with constraint level
    ``lam * ||x_star||_2``.

    Parameters
    ----------
    X_train : array with ``.shape == (train_n, p)``
    X_test : array with ``.shape == (test_n, p)``
    lams : tuple
        ``(lam_deb, method)`` with ``method`` either ``"theory"`` or ``"grid"``.
        With ``"theory"`` and ``train_n >= 1.5 * p`` the base value is
        multiplied by 0.01 once; with ``"grid"`` it is used as given.
    threads : int
        Forwarded to every ``invert`` task.

    Returns
    -------
    list
        The ``invert`` results in test-row order.

    Raises
    ------
    Exception
        If ``method`` is neither ``"theory"`` nor ``"grid"``.
    """
    warnings.filterwarnings("ignore")
    train_n, p = X_train.shape
    test_n, p = X_test.shape

    lam_deb, method = lams[0], lams[1]
    if method == "theory":
        # Shrink the theoretical value once. The original rescaled inside the
        # per-row loop, so the penalty shrank by another factor of 100 for every
        # test point and depended on the row order.
        if train_n >= 1.5*p:
            lam_deb = .01*lam_deb
    elif method != "grid":
        raise Exception("Aux Debiasing Set Incorrectly")

    ids = []
    for i in range(test_n):
        x_star = X_test[i, :]
        x_norm = np.linalg.norm(x_star, ord=2)
        # The constraint level scales with the norm of the test direction.
        ids.append(invert.remote(X_train, x_star, lam=lam_deb*x_norm, threads=threads))

    vals = ray.get(ids)  # block until every task has finished
    ray.internal.free(ids)

    return vals

In [None]:
def debias_base(train_data, base_model, X_test, ws):
    """Compute JM debiased predictions on top of a fitted base model.

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``.
    base_model : object
        Fitted model providing ``.coef_`` and ``.intercept_``.
    X_test : array with ``.shape == (test_n, p)``
    ws : sequence
        One ``(value, w)`` pair per test row, as returned by ``invert``.

    Returns
    -------
    tuple
        ``(y_preds, feasible)``. ``y_preds[i]`` is the base prediction plus
        ``w . X_train'(residual) / train_n`` when the solve for row i
        succeeded, and the plain base prediction otherwise. ``feasible[i]``
        records which case applied.
    """
    warnings.filterwarnings("ignore")
    X_train, y_train = train_data
    train_n, p = X_train.shape
    test_n, p = X_test.shape

    beta = base_model.coef_
    beta_int = base_model.intercept_

    y_preds = np.zeros(test_n)
    # X' times the base-model residuals on the training set (shared by all rows).
    resid = np.transpose(X_train) @ (y_train-X_train @ beta-beta_int)
    feasible = []
    for i in range(test_n):
        x_star = X_test[i, :]
        # Base prediction.
        y_preds[i] = x_star.dot(beta) + beta_int
        val, w = ws[i]
        # A solve is unusable if it reported +inf or produced no vector at all.
        # The latter previously crashed when the value was anything but +inf.
        if w is None or val == math.inf:
            feasible.append(False)
        else:
            # Debiasing correction.
            y_preds[i] += 1/float(train_n) * np.ravel(w).dot(resid)
            feasible.append(True)

    return y_preds, feasible

In [None]:
def run_JM_expt(p, main_reg_params, train_n, test_n, s, train_dist, test_dist, cv, fit_intercept, alpha_scaling, x_scale_test, beta_scale, sigma, lams, threads, seed):
    """Run one JM experiment for a single problem instance (given p, n, s).

    Parameters
    ----------
    main_reg_params : dict
        Must contain ``"method"`` equal to ``"Lasso"`` or ``"Ridge"``.
    fit_intercept : bool
        Must be falsy.
    lams : tuple
        ``(lam_vals, method)``. With ``"theory"`` the debiasing level is
        ``1.5 * sqrt(log(p) / train_n)`` and ``lam_vals`` is ignored. With
        ``"grid"`` every value in ``lam_vals`` is tried.
    seed : int
        Seeds ``np.random`` before the data are generated.
    (remaining arguments are forwarded to the data generators and fitters)

    Returns
    -------
    tuple
        ``(deb_preds, base_preds, ws, y_test, kappa, feasibles, mu_y)``.
        ``deb_preds``, ``ws`` and ``feasibles`` are lists with one entry per
        debiasing level (a single entry for ``"theory"``).

    Raises
    ------
    ValueError
        If the regression method or the debiasing method is not recognised.
        Previously this surfaced later as an unbound-variable error.
    AssertionError
        If ``fit_intercept`` is truthy.
    """
    main_reg = main_reg_params["method"]
    lam_vals, method = lams[0], lams[1]
    if main_reg not in ("Lasso", "Ridge"):
        raise ValueError("Unknown main regression method: {!r}".format(main_reg))
    if method not in ("theory", "grid"):
        raise ValueError("Unknown debiasing method: {!r}".format(method))

    # The original `assert(cond, msg)` asserted a non-empty tuple and could never fail.
    assert not fit_intercept, "This should be set to false for all f regression. g_lasso has been manually set to include this"

    np.random.seed(seed)
    # Generate training and test data.
    X_train, y_train, coef, scaler, kappa = gen_train_data(n=train_n, p=p, s=s, train_dist=train_dist, x_scale=1.0, beta_scale=beta_scale, sigma=sigma)
    X_test, y_test = gen_test_data(test_n, p, s, coef, X_scaler=scaler, test_dist=test_dist, x_scale=x_scale_test, sigma=0*sigma)

    # Demean y values.
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    # Fit the main base regression.
    if main_reg == "Lasso":
        base = fit_lasso((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept, alpha_scaling=alpha_scaling, n_folds=5)
    else:
        base = fit_ridge((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept, alpha_scaling=alpha_scaling)
    base_preds = base.predict(X_test)

    # Debiasing levels: the single theoretical value, or the whole grid.
    if method == "theory":
        lam_grid = [1.5*math.sqrt(math.log(p)/train_n)]
    else:
        lam_grid = lam_vals

    deb_preds = []
    feasibles = []
    ws = []
    for lam_deb in lam_grid:
        w = compute_ws(X_train, X_test, lams=(lam_deb, method), threads=threads)
        deb_pred, feasible = debias_base((X_train, y_train), base, X_test, w)
        deb_preds.append(deb_pred)
        feasibles.append(feasible)
        # The grid branch used to drop the weights and return an empty list.
        ws.append(w)

    return deb_preds, base_preds, ws, y_test, kappa, feasibles, mu_y

In [None]:
def save_JM_expts(p_list, n_list, s_list, main_reg_params, train_dist, test_dist, cv, fit_intercepts, alpha_scalings, x_scale_test, beta_scale, sigma, lams, seeds, threads, reps, folder_path):
    """Run repeated JM experiments over (p, n, s) and pickle the results.

    Parameters
    ----------
    p_list, s_list : iterables
        Values of p and s to sweep. A setting is only run when
        ``s <= p`` and ``s / p >= 0.1``.
    n_list : sequence of ``(train_n, test_n)`` pairs (must be non-empty).
    fit_intercepts : tuple
        Only the first entry (main regression) is used.
    alpha_scalings : tuple
        ``(index, scaling)``; the index is stored as metadata and the scaling
        is forwarded to ``run_JM_expt``.
    lams : tuple
        ``(values, method)`` forwarded to ``run_JM_expt``.
    seeds : sequence
        ``seeds[rep]`` seeds repetition ``rep``.
    reps : int
        Number of repetitions per (p, n, s) setting.
    folder_path : str
        Existing directory that receives ``JM_<timestamp>.pickle``.

    Returns
    -------
    OrderedDict
        The saved metadata plus, for every ``(p, train_n, s)``, the list
        ``[deb_preds, base_preds, y_tests, kappas, feasibles]``. The lists are
        empty for settings that were skipped.
    """
    save_data = OrderedDict()

    fit_intercept_main, _ = fit_intercepts
    save_data["train_dist"]=str(train_dist)
    save_data["test_dist"]=str(test_dist)
    save_data["method"]="JM Debiased Lasso"
    save_data["main_reg"]="CV"+str(cv)
    save_data["fit_intercepts"]=str(fit_intercepts)
    save_data["format_keys"] = "p,n,s"
    save_data["output"] = "deb_preds, lasso_preds, ws, y_test, kappa"
    save_data["test_n"] = str(n_list[0][1])
    save_data["seeds"] = seeds
    save_data["scale_X"] = True
    save_data["x_scale_test"] = x_scale_test
    save_data["beta_scale"] = beta_scale
    save_data["sigma"] = sigma
    save_data["alpha_scalings"] = alpha_scalings[0]
    save_data["reps"] = reps
    save_data["lam"] = lams[0]
    save_data["lam_method"] = lams[1]
    save_data["base_regression"] = main_reg_params

    save_data["p_list"] = p_list
    save_data["n_list"] = n_list
    save_data["s_list"] = s_list

    print("Starting")
    for p in p_list:
        for train_n, test_n in n_list:
            for s in s_list:
                deb_preds=[]
                base_preds=[]
                y_tests=[]
                kappas=[]
                feasibles=[]
                if (s <= p and float(s)/p >= 0.1):
                    print(str(p)+","+str(train_n)+","+str(s))
                    for rep in range(reps):
                        # The per-point weights are returned but not stored.
                        deb_pred, base_pred, _w, y_test, kappa, feasible, mu_y = run_JM_expt(p=p, main_reg_params=main_reg_params, train_n=train_n, test_n=test_n, s=s, train_dist=train_dist, test_dist=test_dist, cv=cv, fit_intercept=fit_intercept_main, alpha_scaling=alpha_scalings[1], x_scale_test=x_scale_test, beta_scale=beta_scale, sigma=sigma, lams=lams, threads=threads, seed=seeds[rep])
                        # Predictions were made on demeaned responses; add the mean back.
                        deb_preds.append([i+mu_y for i in deb_pred])
                        base_preds.append(base_pred+mu_y)
                        y_tests.append(y_test)
                        kappas.append(kappa)
                        feasibles.append(feasible)
                save_data[(p,train_n,s)] = [
                        deb_preds,
                        base_preds,
                        y_tests,
                        kappas,
                        feasibles]

    print("Saving Data")
    # The file name has one-second resolution; the pause keeps consecutive saves distinct.
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = "JM"+"_"+timestr+".pickle"

    file_path = os.path.join(folder_path, file_name)
    # Context manager so the handle is flushed and closed even if pickling fails.
    with open(file_path, "wb") as out_file:
        pickle.dump(save_data, out_file)

    return save_data

In [None]:
# Set up and run the synthetic Lasso debiasing experiments, using both fixed and cross-validated regularization hyperparameters for the regressions.

In [None]:
# Problem sizes swept by the Lasso experiments:
#   p_list -- values of p,
#   n_list -- (train_n, test_n) pairs, every training size paired with 500 test points,
#   s_list -- values of s.
p_list = [200]
n_list = list(itertools.product([50, 100, 200, 400, 800, 1600, 3200], [500]))
s_list = [20, 100]

In [None]:
# Train/test distributions, paired element-wise; each is a (name, options) tuple.
train_dists = [("normal", {}) for _ in range(3)]
test_dists = [
    ("normal", {}),
    ("normal+support_shift", {"scale" : 10.0}),
    ("normal+support_var", {"support_scale" : 10.0}),
]
dists = list(zip(train_dists, test_dists))

# (x_scale_test, beta_scale, sigma) triples, zipped with the alpha scalings below.
scaling_params = [(1.0, 1.0, 1.0)]
alpha_scalings_JM = [4.0]
alpha_scalings_OML = [(4.0, 4.0)]

OML_aux_regs = ["Lasso"]
JM_aux_params = [(0, "theory")]

splits = [5]
fit_intercepts = (False, False)

path_options = "synthetic_lasso_expts"
reps = 20
seeds = [100 + rep for rep in range(reps)]

main_regs = [{"method": "Lasso"}]
aux_regs = [{"method": "Lasso"}]

threads = 1  # number of threads desired for each regression

# Results go to "<month>-<day>-<path_options>", created on first use.
now = datetime.datetime.now()
folder_path = "-".join([str(now.month), str(now.day), path_options])
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [None]:
# Number of OM configurations that the run loop below iterates over; it is the
# denominator of that loop's progress percentage. main_regs is included so the
# count matches the run loop exactly (the original count left it out).
total = (
    len(dists)
    * len(splits)
    * len(main_regs)
    * len(aux_regs)
    * min(len(alpha_scalings_OML), len(scaling_params))
)
print(total)

In [None]:
# Run the OM experiments for every configuration, once with fixed and once with
# cross-validated regularization. Each call writes its own pickle file.
count = 0
for (train_dist, test_dist) in dists:
    for n_split in splits:
        for main_reg_params in main_regs:
            for aux_reg_params in aux_regs:
                for i, (alpha_scalings, (x_scale_test, beta_scale, sigma)) in enumerate(zip(alpha_scalings_OML, scaling_params)):
                    count += 1
                    print("{0:.0%}".format(float(count)/total)+" Done")
                    # Arguments shared by the fixed and the cross-validated run.
                    om_kwargs = dict(
                        train_dist=train_dist,
                        test_dist=test_dist,
                        fit_intercepts=fit_intercepts,
                        n_splits=n_split,
                        alpha_scalings=(i, alpha_scalings),
                        x_scale_test=x_scale_test,
                        beta_scale=beta_scale,
                        sigma=sigma,
                        main_reg_params=main_reg_params,
                        aux_reg_params=aux_reg_params,
                        seeds=seeds,
                        threads=threads,
                        reps=reps,
                        folder_path=folder_path,
                    )
                    data_cv_false_OM = save_OM_expts(p_list, n_list, s_list, cv=False, **om_kwargs)
                    data_cv_true_OM = save_OM_expts(p_list, n_list, s_list, cv=True, **om_kwargs)

In [None]:
# Number of JM configurations that the run loop below iterates over; it is the
# denominator of that loop's progress percentage. main_regs is included so the
# count matches the run loop exactly (the original count left it out).
total = (
    len(dists)
    * len(main_regs)
    * min(len(alpha_scalings_JM), len(scaling_params))
    * len(JM_aux_params)
)
print(total)

In [None]:
# Run the JM experiments for every configuration, once with fixed and once with
# cross-validated regularization. Each call writes its own pickle file.
count = 0
for (train_dist, test_dist) in dists:
    for main_reg_params in main_regs:
        for i, (alpha_scalings, (x_scale_test, beta_scale, sigma)) in enumerate(zip(alpha_scalings_JM, scaling_params)):
            for lams in JM_aux_params:
                count += 1
                print("{0:.0%}".format(float(count)/total)+" Done")
                # Arguments shared by the fixed and the cross-validated run.
                jm_kwargs = dict(
                    main_reg_params=main_reg_params,
                    train_dist=train_dist,
                    test_dist=test_dist,
                    fit_intercepts=fit_intercepts,
                    alpha_scalings=(i, alpha_scalings),
                    x_scale_test=x_scale_test,
                    beta_scale=beta_scale,
                    sigma=sigma,
                    lams=lams,
                    seeds=seeds,
                    threads=threads,
                    reps=reps,
                    folder_path=folder_path,
                )
                data_cv_false_JM = save_JM_expts(p_list, n_list, s_list, cv=False, **jm_kwargs)
                data_cv_true_JM = save_JM_expts(p_list, n_list, s_list, cv=True, **jm_kwargs)

In [None]:
# Set up and run the synthetic ridge regression debiasing experiments, using both fixed and cross-validated regularization hyperparameters for the regressions.

In [None]:
# Problem sizes swept by the ridge experiments:
#   p_list -- values of p,
#   n_list -- (train_n, test_n) pairs, every training size paired with 500 test points,
#   s_list -- values of s (here equal to p).
p_list = [200]
n_list = list(itertools.product([50, 100, 200, 400, 800, 1600, 3200], [500]))
s_list = [200]

In [None]:
# Train/test distributions, paired element-wise; each is a (name, options) tuple.
train_dists = [("normal", {}) for _ in range(2)]
test_dists = [
    ("normal+support_shift", {"scale" : 10.0}),
    ("normal+support_rank_one", {"support_scale" : 10.0}),
]
dists = list(zip(train_dists, test_dists))

# (x_scale_test, beta_scale, sigma) triples; beta_scale is 1/sqrt(p).
p = p_list[0]
scaling_params = [(1.0, 1.0/math.sqrt(p), 1.0)]

# Ridge penalties derived from beta_scale (second entry of each triple).
alpha_scalings_JM = [p/(p*triple[1]**2) for triple in scaling_params]
alpha_scalings_OM = [(p/(p*triple[1]**2), 1e+12) for triple in scaling_params]
OML_aux_regs = ["Ridge"]
JM_aux_params = [(0, "theory")]

splits = [5]
fit_intercepts = (False, False)

path_options = "synthetic_ridge_expts"
reps = 20
seeds = [100 + rep for rep in range(reps)]

main_regs = [{"method": "Ridge"}]
aux_regs = [{"method": "Ridge"}]
threads = 8  # number of threads desired for each regression

# Results go to "<month>-<day>-<path_options>", created on first use.
now = datetime.datetime.now()
folder_path = "-".join([str(now.month), str(now.day), path_options])
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [None]:
# Progress marker: the ridge experiments start after this cell.
print("Starting Ridge")

In [None]:
# Number of OM configurations that the run loop below iterates over; it is the
# denominator of that loop's progress percentage. main_regs is included so the
# count matches the run loop exactly (the original count left it out).
total = (
    len(dists)
    * len(splits)
    * len(main_regs)
    * len(aux_regs)
    * min(len(alpha_scalings_OM), len(scaling_params))
)
print(total)

In [None]:
# Run the ridge OM experiments for every configuration, once with fixed and once
# with cross-validated regularization. Each call writes its own pickle file.
count = 0
for (train_dist, test_dist) in dists:
    for n_split in splits:
        for main_reg_params in main_regs:
            for aux_reg_params in aux_regs:
                for i, (alpha_scalings, (x_scale_test, beta_scale, sigma)) in enumerate(zip(alpha_scalings_OM, scaling_params)):
                    count += 1
                    print("{0:.0%}".format(float(count)/total)+" Done")
                    # Arguments shared by the fixed and the cross-validated run.
                    om_kwargs = dict(
                        train_dist=train_dist,
                        test_dist=test_dist,
                        fit_intercepts=fit_intercepts,
                        n_splits=n_split,
                        alpha_scalings=(i, alpha_scalings),
                        x_scale_test=x_scale_test,
                        beta_scale=beta_scale,
                        sigma=sigma,
                        main_reg_params=main_reg_params,
                        aux_reg_params=aux_reg_params,
                        seeds=seeds,
                        threads=threads,
                        reps=reps,
                        folder_path=folder_path,
                    )
                    data_cv_false_OM = save_OM_expts(p_list, n_list, s_list, cv=False, **om_kwargs)
                    data_cv_true_OM = save_OM_expts(p_list, n_list, s_list, cv=True, **om_kwargs)

In [None]:
# Number of JM configurations that the run loop below iterates over; it is the
# denominator of that loop's progress percentage. main_regs is included so the
# count matches the run loop exactly (the original count left it out).
total = (
    len(dists)
    * len(main_regs)
    * min(len(alpha_scalings_JM), len(scaling_params))
    * len(JM_aux_params)
)
print(total)

In [None]:
# Run the ridge JM experiments for every configuration, once with fixed and once
# with cross-validated regularization. Each call writes its own pickle file.
count = 0
for (train_dist, test_dist) in dists:
    for main_reg_params in main_regs:
        for i, (alpha_scalings, (x_scale_test, beta_scale, sigma)) in enumerate(zip(alpha_scalings_JM, scaling_params)):
            for lams in JM_aux_params:
                count += 1
                print("{0:.0%}".format(float(count)/total)+" Done")
                # Arguments shared by the fixed and the cross-validated run.
                jm_kwargs = dict(
                    main_reg_params=main_reg_params,
                    train_dist=train_dist,
                    test_dist=test_dist,
                    fit_intercepts=fit_intercepts,
                    alpha_scalings=(i, alpha_scalings),
                    x_scale_test=x_scale_test,
                    beta_scale=beta_scale,
                    sigma=sigma,
                    lams=lams,
                    seeds=seeds,
                    threads=threads,
                    reps=reps,
                    folder_path=folder_path,
                )
                data_cv_false_JM = save_JM_expts(p_list, n_list, s_list, cv=False, **jm_kwargs)
                data_cv_true_JM = save_JM_expts(p_list, n_list, s_list, cv=True, **jm_kwargs)