In [None]:
__author__ = 'nileshtrip'
import numpy as np
import math
import subprocess
import itertools
import gc
import copy
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from numpy import transpose as trans
from collections import OrderedDict
import datetime

import mkl
import pickle
import time
import os
import ray
import warnings
warnings.filterwarnings("ignore")
subprocess.call("bash convert_files.sh", shell=True)

from auxiliary import is_pos_def, cond, rotate_matrix, gen_train_data, gen_test_data
from datasets import load_parkinson, load_triazines,  load_wine, load_fertility, load_forest_fires, load_weather

In [None]:
def chunk_data(X_test, y_test, chunk_size):

    """Divides X_test and y_test into a list of (X_chunk, y_chunk) pairs (in order to parallelize).

    Rows are split in order into consecutive slices of ``chunk_size`` rows; the
    last slice holds the remainder and may be shorter. When the data has no more
    than ``chunk_size`` rows, the inputs are returned unsliced as one chunk.

    No empty trailing chunk is produced when the number of rows is an exact
    multiple of ``chunk_size`` (an empty chunk would yield zero test points
    downstream).

    Raises ValueError if ``chunk_size`` is smaller than 1.
    """

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    num = X_test.shape[0]
    print("Chunking")

    # Everything fits in one chunk (this also covers an empty test set).
    if num <= chunk_size:
        return [(X_test, y_test)]

    return [(X_test[a:a + chunk_size, :], y_test[a:a + chunk_size])
            for a in range(0, num, chunk_size)]

In [None]:
def fit_lasso(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0, n_folds=5):

    r"""Lasso fitter for ``train_data = (X_train, y_train)``.

    If cv is falsy, fits a plain Lasso with penalty
    alpha_scaling * sigma * \sqrt{2 log p / n}, where (n, p) is the shape of X_train.
    If cv is truthy, fits LassoCV with n_folds folds over 100 log-spaced alphas
    between 1e-6 and 1e1.
    fit_intercept is forwarded to the estimator and has no default here.
    Returns the fitted estimator.
    """

    from sklearn.linear_model import Lasso, LassoCV

    warnings.filterwarnings("ignore")

    features, targets = train_data
    n, p = features.shape

    if cv:
        # Cross-validated penalty (LassoCV with the metric for CV as MSE)
        model = LassoCV(max_iter=5000, cv=n_folds, alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Theoretically motivated penalty
        model = Lasso(alpha=alpha_scaling*sigma*math.sqrt(2* (math.log(p)/n)), max_iter=5000, fit_intercept=fit_intercept)

    model.fit(features, targets)
    return model

In [None]:
def fit_ridge(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0):

    """Ridge regression fitter for ``train_data = (X_train, y_train)``.

    If cv is falsy, fits Ridge with alpha = alpha_scaling.
    If cv is truthy, fits RidgeCV over 100 log-spaced alphas between 1e-6 and 1e1.
    fit_intercept is forwarded to the estimator and has no default here.
    sigma is accepted for signature parity with the other fitters but is not used.
    Returns the fitted estimator.
    """

    from sklearn.linear_model import Ridge, RidgeCV

    warnings.filterwarnings("ignore")

    features, targets = train_data
    n, p = features.shape  # not used below; still requires a 2-D design matrix

    if cv:
        # Penalty selected by RidgeCV over the alpha grid
        model = RidgeCV(alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed penalty
        model = Ridge(alpha=alpha_scaling, fit_intercept=fit_intercept)

    model.fit(features, targets)
    return model

In [None]:
def fit_elastic(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0, n_folds=5):

    r"""Elastic net fitter for ``train_data = (X_train, y_train)``.

    If cv is falsy, fits ElasticNet with penalty
    alpha_scaling * sigma * \sqrt{2 log p / n}, where (n, p) is the shape of X_train.
    If cv is truthy, fits ElasticNetCV with n_folds folds over 100 log-spaced alphas
    between 1e-6 and 1e1 and l1_ratio in [.1, .5, .7, .9, .95, .99, 1].
    fit_intercept is forwarded to the estimator and has no default here.
    Returns the fitted estimator.
    """

    from sklearn.linear_model import ElasticNet
    warnings.filterwarnings("ignore")

    features, targets = train_data
    n, p = features.shape

    if cv:
        # Cross-validated penalty and mixing ratio
        model = ElasticNetCV(max_iter=5000, cv=n_folds, l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                             alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Theoretically motivated penalty
        model = ElasticNet(alpha=alpha_scaling*sigma*math.sqrt(2*(math.log(p)/n)), max_iter=5000, fit_intercept=fit_intercept)

    model.fit(features, targets)
    return model

In [None]:
def fit_rf(train_data, n_estimators, random_state, max_depth):

    """Random forest fitter for ``train_data = (X_train, y_train)``.

    n_estimators is the number of decision trees, max_depth the maximum tree depth,
    and random_state is forwarded to RandomForestRegressor.
    Returns the fitted regressor.
    """

    from sklearn.ensemble import RandomForestRegressor
    warnings.filterwarnings("ignore")

    features, targets = train_data
    n, p = features.shape  # not used below; still requires a 2-D design matrix

    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, max_depth=max_depth)
    forest.fit(features, targets)
    return forest

In [None]:
@ray.remote
def fit_par_lasso(train_data, sigma, cv, fit_intercept, threads, alpha_scaling, n_folds=5, max_calls=1):

    r"""Parallel (ray remote) version of the Lasso fitter.

    Sets the MKL thread count to ``threads`` and then fits exactly as fit_lasso does:
    if cv is falsy a Lasso with penalty alpha_scaling * sigma * \sqrt{2 log p / n},
    otherwise LassoCV with n_folds folds. fit_intercept is forwarded to the estimator.
    Returns the fitted estimator.

    max_calls is accepted but never read inside the function.
    NOTE(review): it may have been intended as ``@ray.remote(max_calls=1)`` -- confirm
    before changing, since that would alter worker reuse.
    """

    mkl.set_num_threads(threads)
    X_train, y_train = train_data

    # Fit on copies of the inputs (the task runs in parallel with others), and
    # reuse the serial fitter instead of duplicating its body.
    return fit_lasso((X_train.copy(), y_train.copy()), sigma, cv, fit_intercept,
                     alpha_scaling=alpha_scaling, n_folds=n_folds)

In [None]:
@ray.remote
def fit_par_ridge(train_data, sigma, cv, fit_intercept, threads, alpha_scaling=1.0, max_calls=1):

    """Parallel (ray remote) version of the ridge regression fitter.

    Sets the MKL thread count to ``threads`` and then fits exactly as fit_ridge does:
    if cv is falsy a Ridge with alpha = alpha_scaling, otherwise RidgeCV over the
    alpha grid. fit_intercept is forwarded to the estimator.
    Returns the fitted estimator.

    max_calls is accepted but never read inside the function.
    NOTE(review): it may have been intended as ``@ray.remote(max_calls=1)`` -- confirm
    before changing, since that would alter worker reuse.
    """

    mkl.set_num_threads(threads) # set number of threads for function.
    X_train, y_train = train_data

    # Fit on copies of the inputs (copy data since parallelizing), and reuse the
    # serial fitter instead of duplicating its body.
    return fit_ridge((X_train.copy(), y_train.copy()), sigma, cv, fit_intercept,
                     alpha_scaling=alpha_scaling)

In [None]:
@ray.remote
def fit_par_elastic(train_data, sigma, cv, fit_intercept, threads, alpha_scaling=1.0, n_folds=5, max_calls=1):

    r"""Parallel (ray remote) version of the elastic net fitter.

    Sets the MKL thread count to ``threads`` and then fits exactly as fit_elastic does:
    if cv is falsy an ElasticNet with penalty alpha_scaling * sigma * \sqrt{2 log p / n},
    otherwise ElasticNetCV with n_folds folds. fit_intercept is forwarded to the estimator.
    Returns the fitted estimator.

    max_calls is accepted but never read inside the function.
    NOTE(review): it may have been intended as ``@ray.remote(max_calls=1)`` -- confirm
    before changing, since that would alter worker reuse.
    """

    mkl.set_num_threads(threads)
    X_train, y_train = train_data

    # Fit on copies of the inputs (the task runs in parallel with others), and
    # reuse the serial fitter instead of duplicating its body.
    return fit_elastic((X_train.copy(), y_train.copy()), sigma, cv, fit_intercept,
                       alpha_scaling=alpha_scaling, n_folds=n_folds)

In [None]:
def fit_all_f_lassos(train_data, sigma, cv, fit_intercept, n_splits, threads, alpha_scaling, n_folds=5):

    """Fit the f Lasso regressions needed for the OM estimator under n_splits-fold cross-fitting.

    One fit_par_lasso task is launched per KFold split, each trained on that split's
    training rows only. Returns the fitted models as a list in fold order.
    """

    X, y = train_data
    splitter = KFold(n_splits=n_splits)

    pending = []
    for fit_rows, _ in splitter.split(X):
        fold_data = (X[fit_rows, :], y[fit_rows])
        pending.append(fit_par_lasso.remote(fold_data, sigma, cv, fit_intercept, threads=threads, alpha_scaling=alpha_scaling, n_folds=n_folds))

    # Blocks until every fold has been fitted.
    return ray.get(pending)

In [None]:
def fit_all_f_ridges(train_data, sigma, cv, fit_intercept, n_splits, threads, alpha_scaling=1.0):

    """Fit the f ridge regressions needed for the OM estimator under n_splits-fold cross-fitting.

    One fit_par_ridge task is launched per KFold split, each trained on that split's
    training rows only. Returns the fitted models as a list in fold order.
    """

    X, y = train_data
    splitter = KFold(n_splits=n_splits)

    pending = []
    for fit_rows, _ in splitter.split(X):
        fold_data = (X[fit_rows, :], y[fit_rows])
        pending.append(fit_par_ridge.remote(fold_data, sigma, cv, fit_intercept, threads=threads, alpha_scaling=alpha_scaling))

    # Blocks until every fold has been fitted.
    return ray.get(pending)

In [None]:
def fit_all_f_elastics(train_data, sigma, cv, fit_intercept, n_splits, threads, alpha_scaling=1.0, n_folds=5):

    """Fit the f elastic net regressions needed for the OM estimator under n_splits-fold cross-fitting.

    One fit_par_elastic task is launched per KFold split, each trained on that split's
    training rows only. Returns the fitted models as a list in fold order.
    """

    X, y = train_data
    splitter = KFold(n_splits=n_splits)

    pending = []
    for fit_rows, _ in splitter.split(X):
        fold_data = (X[fit_rows, :], y[fit_rows])
        pending.append(fit_par_elastic.remote(fold_data, sigma, cv, fit_intercept, threads=threads, alpha_scaling=alpha_scaling, n_folds=n_folds))

    # Blocks until every fold has been fitted.
    return ray.get(pending)

In [None]:
@ray.remote
def fit_g_lasso(train_data, test_point, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, n_folds=5, max_calls=1):

    """Fit the cross-fitted Lasso models for the auxiliary g regression at one test point.

    The training features are rotated with the inverse of U = ||test_point||_2 * rotate_matrix(test_point);
    the first rotated column is the target "t" and the remaining columns are the regressors "x".
    For each KFold split a Lasso is fit on the training rows and used to compute
    out-of-fold residuals t - prediction on the held-out rows.

    Returns (model_ts, res_t, t): the per-fold models, the out-of-fold residuals, and t itself.
    max_calls is accepted but never read inside the function.
    """

    mkl.set_num_threads(threads)

    warnings.filterwarnings("ignore")
    X = train_data
    n_rows = X.shape[0]

    # KFold is used without shuffling, so the splits can be materialised once
    # and reused for both fitting and residual computation.
    folds = list(KFold(n_splits=n_splits).split(X))

    # (scaled) rotation matrix U and rotated data matrix X @ U^{-1}
    U = np.linalg.norm(test_point, 2) * rotate_matrix(test_point)
    rotated = X @ np.linalg.inv(U)

    # split rotated features into "T" and remaining "X"
    t = rotated[:, 0]
    x = rotated[:, 1:]

    model_ts = []
    for fit_rows, _ in folds:
        model_ts.append(fit_lasso((x[fit_rows, :], t[fit_rows]), sigma=sigma, cv=cv, fit_intercept=fit_intercept, alpha_scaling=alpha_scaling, n_folds=n_folds))

    res_t = np.zeros(n_rows)
    for fold_model, (_, held_out) in zip(model_ts, folds):
        res_t[held_out] = t[held_out] - fold_model.predict(x[held_out, :])

    return model_ts, res_t, t

In [None]:
def fit_all_g_lassos(train_data, test_data, sigma, cv, fit_intercept, threads, n_splits, alpha_scaling, n_folds=5):

    """Fit the auxiliary g Lasso regressions for every test point, in parallel.

    One fit_g_lasso task is launched per row of test_data; train_data is the
    training feature matrix. n_folds is forwarded to fit_g_lasso (it was
    previously accepted here but dropped, so a non-default value had no effect).

    Returns a list with one entry per test row, in row order; each entry is the
    (model_ts, res_t, t) tuple returned by fit_g_lasso.
    """

    warnings.filterwarnings("ignore")
    X_test = test_data
    num_test, _ = X_test.shape

    stuff = num_test * [None]

    # Map each task id to the test row it belongs to, so results can be stored in
    # row order regardless of completion order.
    id_dict = {fit_g_lasso.remote(train_data=train_data, test_point=X_test[i, :], sigma=sigma, cv=cv, fit_intercept=fit_intercept, threads=threads, n_splits=n_splits, alpha_scaling=alpha_scaling, n_folds=n_folds) : i for i in range(num_test)}
    ids = list(id_dict.keys())
    old_ids = copy.deepcopy(ids)

    while len(ids) > 0:
        ready_ids, ids = ray.wait(ids)
        for ready_id in ready_ids:
            stuff[id_dict[ready_id]] = ray.get(ready_id)
    # Release the task results once they have all been fetched.
    ray.internal.free(old_ids)

    return stuff

In [None]:
@ray.remote
def fit_g_rf(train_data, test_point, n_splits, n_estimators, random_state, max_depth, threads, max_calls=1):

    """Fit the cross-fitted random forests for the auxiliary g regression at one test point.

    The training features are rotated with the inverse of U = ||test_point||_2 * rotate_matrix(test_point);
    the first rotated column is the target "t" and the remaining columns are the regressors "x".
    For each KFold split a random forest is fit on the training rows and used to compute
    out-of-fold residuals t - prediction on the held-out rows.

    Returns (model_ts, res_t, t): the per-fold models, the out-of-fold residuals, and t itself.
    max_calls is accepted but never read inside the function.
    """

    mkl.set_num_threads(threads)
    warnings.filterwarnings("ignore")

    X = train_data
    n_rows = X.shape[0]

    # KFold is used without shuffling, so the splits can be materialised once
    # and reused for both fitting and residual computation.
    folds = list(KFold(n_splits=n_splits).split(X))

    # (scaled) rotation matrix U and rotated data matrix X @ U^{-1}
    U = np.linalg.norm(test_point, 2) * rotate_matrix(test_point)
    rotated = X @ np.linalg.inv(U)

    # split rotated features into "T" and remaining "X"
    t = rotated[:, 0]
    x = rotated[:, 1:]

    model_ts = []
    for fit_rows, _ in folds:
        model_ts.append(fit_rf((x[fit_rows, :], t[fit_rows]), n_estimators, random_state, max_depth))

    res_t = np.zeros(n_rows)
    for fold_model, (_, held_out) in zip(model_ts, folds):
        res_t[held_out] = t[held_out] - fold_model.predict(x[held_out, :])

    return model_ts, res_t, t

In [None]:
def fit_all_g_rfs(train_data, test_data, n_splits, n_estimators, threads, max_depth, random_state):

    """Fit the auxiliary g random forests for every test point, in parallel.

    One fit_g_rf task is launched per row of test_data; train_data is the training
    feature matrix. Returns a list with one entry per test row, in row order; each
    entry is the (model_ts, res_t, t) tuple returned by fit_g_rf.
    """

    warnings.filterwarnings("ignore")
    X_train = train_data
    X_test = test_data
    num_train, p = X_train.shape
    num_test, p = X_test.shape

    results = [None] * num_test

    # Remember which test row each task belongs to, so results can be stored in
    # row order regardless of completion order.
    task_row = {}
    for row in range(num_test):
        task = fit_g_rf.remote(train_data=train_data, test_point=X_test[row, :], n_splits=n_splits, n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, threads=threads)
        task_row[task] = row

    pending = list(task_row)
    all_ids = copy.deepcopy(pending)

    while pending:
        finished, pending = ray.wait(pending)
        for task in finished:
            results[task_row[task]] = ray.get(task)

    # Release the task results once they have all been fetched.
    ray.internal.free(all_ids)

    return results

In [None]:
def fit_all_g_zeros(X_train, X_test, n_splits):

    """Build the "g = 0" auxiliary entries for every test point.

    For each row of X_test the training features are rotated with the inverse of
    U = ||test_point||_2 * rotate_matrix(test_point) and the first rotated column t is kept.
    No model is fit: each entry is ([None] * number_of_folds, t, t), i.e. the
    residual slot is t itself, mirroring the (model_ts, res_t, t) layout of the
    other g fitters.
    """

    warnings.filterwarnings("ignore")
    num_train, p = X_train.shape
    num_test, _ = X_test.shape

    kf = KFold(n_splits=n_splits)

    all_models = []
    for row in range(X_test.shape[0]):
        test_point = X_test[row, :]

        # (scaled) rotation matrix U; only the first rotated column is needed here
        U = np.linalg.norm(test_point, 2) * rotate_matrix(test_point)
        t = (X_train @ np.linalg.inv(U))[:, 0]

        # one placeholder per cross-fitting fold
        placeholders = [None for _ in kf.split(X_train)]
        all_models.append((placeholders, t, t))

    return all_models

In [None]:
@ray.remote
def fit_f_moms_1f(data, sigma, test_point, g_bases, f_bases, cv, fit_intercept, n_splits, threads):

    r"""Parallel function to predict a y value using the OM f moments. Makes a single prediction on
    test_point x_* by rotating all data into the basis where test_point points along e_1.

    Uses (y - \theta t - z^\top f)(t - g(z)) moments. Here f is taken from models that regressed y on
    the original features: their coefficients are rotated into the new basis and the first (t)
    coefficient is dropped.

    # data is the training data (X, y)
    # test_point is the prediction direction
    # g_bases holds one fitted g model (or None, meaning g = 0) per cross-fitting fold
    # f_bases holds one fitted f model per cross-fitting fold (coef_ and intercept_ are read)
    # n_splits is the number of cross-fitting folds
    # threads is the MKL thread count for this task
    # sigma, cv and fit_intercept are accepted but not used in this function

    Returns (pred1, num1, denom1) with num1 = mean(res_y * res_t), denom1 = mean(t * res_t)
    and pred1 = num1 / denom1.
    """
    mkl.set_num_threads(threads) # set number of threads for function

    # builds (scaled) rotation matrix U
    X, y = data
    test_norm = np.linalg.norm(test_point, 2)
    U = test_norm * rotate_matrix(test_point)
    U_inv = np.linalg.inv(U)

    # construct rotated data matrix
    rotated = X @ U_inv
    # split features into "T" and remaining "X"
    Trot, Xrot = rotated[:, 0], rotated[:, 1:]

    # arrays to contain the out-of-fold t and y residuals
    res_t = np.zeros(X.shape[0])
    res_y = np.zeros(X.shape[0])

    kf = KFold(n_splits=n_splits) #first stage CF

    for count, (_, test_index) in enumerate(kf.split(X)):
        # Only the held-out rows are needed: both models were fit beforehand.
        x_test, t_test, y_test = Xrot[test_index, :], Trot[test_index], y[test_index]

        # Pre-fit model for "y" in the original basis
        model_y_orig = f_bases[count]
        y_int = model_y_orig.intercept_

        # Rotate "y" coefficients (i.e. f) into new basis and "drop" first t coefficient
        y_coef_rot = (U @ model_y_orig.coef_)[1:]

        model_t = g_bases[count]
        if model_t is None:
            # g = 0, so the residual is t itself
            res_t[test_index] = (t_test - 0).flatten()
        else:
            res_t[test_index] = (t_test - model_t.predict(x_test)).flatten()

        res_y[test_index] = (y_test - (x_test @ y_coef_rot + y_int)).flatten()

    num1 = np.mean(res_y * res_t)
    denom1 = np.mean(Trot * res_t)
    pred1 = num1 / denom1

    return pred1, num1, denom1

In [None]:
@ray.remote
def fit_q_moms(data, sigma, test_point, g_bases, f_method, cv, fit_intercept, n_splits, threads):

    """Parallel function to predict a y value using the OM q moments. Makes a single prediction on
    test_point x_* by rotating all data into the basis where test_point points along e_1.

    Uses (y-q(z)-theta(t-g(z)))(t-g(z)) moments. Here q(z) is estimated by regressing y on z
    within each training fold; g comes pre-fit in g_bases.

    # data is the training data (X, y)
    # sigma is forwarded to the fitter selected by f_method
    # test_point is the prediction direction
    # g_bases holds one fitted g model (or None, meaning g = 0) per cross-fitting fold
    # f_method selects the q regression: "Lasso", "Ridge" or "Elastic"
    # cv indicates whether or not to use CV in the q regression
    # fit_intercept indicates whether or not the q regression fits a y-intercept
    # n_splits is the number of cross-fitting folds
    # threads is the MKL thread count for this task

    Returns (pred1, num1, denom1) with num1 = mean(res_y * res_t), denom1 = mean(res_t * res_t)
    and pred1 = num1 / denom1.
    Raises Exception("Main Set Incorrectly") when f_method is not one of the supported names.
    """
    mkl.set_num_threads(threads)

    # Resolve the q fitter once, and fail before doing any work on a bad f_method.
    fitters = {"Lasso": fit_lasso, "Ridge": fit_ridge, "Elastic": fit_elastic}
    if f_method not in fitters:
        raise Exception("Main Set Incorrectly")
    fit_y = fitters[f_method]

    # builds (scaled) rotation matrix U
    X, y = data
    test_norm = np.linalg.norm(test_point, 2)
    U = test_norm * rotate_matrix(test_point)
    U_inv = np.linalg.inv(U)

    # construct rotated data matrix
    rotated = X @ U_inv
    # split features into "T" and remaining "X"
    Trot, Xrot = rotated[:, 0], rotated[:, 1:]

    # arrays to contain the out-of-fold t and y residuals
    res_t = np.zeros(X.shape[0])
    res_y = np.zeros(X.shape[0])

    kf = KFold(n_splits=n_splits)
    #first stage CF
    for count, (train_index, test_index) in enumerate(kf.split(X)):
        x_train, y_train = Xrot[train_index, :], y[train_index]
        x_test, t_test, y_test = Xrot[test_index, :], Trot[test_index], y[test_index]

        # Fit the outcome as a function of x, using only the train fold
        model_y = fit_y((x_train, y_train), sigma, cv=cv, fit_intercept=fit_intercept)

        # Then compute residuals t-g(x) and y-q(x) on the test fold
        model_t = g_bases[count]
        if model_t is None:
            # g = 0, so the residual is t itself
            res_t[test_index] = (t_test - 0).flatten()
        else:
            res_t[test_index] = (t_test - model_t.predict(x_test)).flatten()

        res_y[test_index] = (y_test - model_y.predict(x_test)).flatten()

    num1 = np.mean(res_y * res_t)
    denom1 = np.mean(res_t * res_t)
    pred1 = num1 / denom1

    return pred1, num1, denom1

In [None]:
def fit_all_f_moms_1f(train_data, X_test, g_bases, f_bases, sigma, cv, fit_intercept, n_splits, threads):

    """Make f-moment predictions over the entire test set in X_test using f_bases and g_bases,
    which have already been fit.

    One fit_f_moms_1f task is launched per test row; g_bases[i] is the list of g models for row i,
    while f_bases is shared by all rows.
    Returns three arrays (predictions, numerators, denominators), each in test-row order.
    """

    X_train, y_train = train_data
    _, p = X_train.shape

    num_test, _ = X_test.shape
    results = [None] * num_test

    # Launch one prediction task per test row and remember which row it belongs to.
    task_row = {}
    for row in range(num_test):
        task = fit_f_moms_1f.remote(train_data, sigma, X_test[row, :], g_bases[row], f_bases, cv=cv, fit_intercept=fit_intercept, n_splits=n_splits, threads=threads)
        task_row[task] = row

    pending = list(task_row)
    all_ids = copy.deepcopy(pending)

    # Collect results as they complete, storing them in row order.
    while pending:
        finished, pending = ray.wait(pending)
        for task in finished:
            results[task_row[task]] = ray.get(task)

    pred1s, num1s, denom1s = zip(*results)
    ray.internal.free(all_ids)

    return (np.array(pred1s), np.array(num1s), np.array(denom1s))

In [None]:
def fit_all_q_moms(train_data, X_test, g_bases, f_method, sigma, cv, fit_intercept, n_splits, threads):

    """Make q-moment predictions over the entire test set in X_test using f_method and g_bases
    (the latter already fit).

    One fit_q_moms task is launched per test row; g_bases[i] is the list of g models for row i.
    Returns three arrays (predictions, numerators, denominators), each in test-row order.
    """

    X_train, y_train = train_data
    _, p = X_train.shape

    num_test, _ = X_test.shape
    results = [None] * num_test

    # Launch one prediction task per test row and remember which row it belongs to.
    task_row = {}
    for row in range(num_test):
        task = fit_q_moms.remote(train_data, sigma, X_test[row, :], g_bases[row], f_method, cv=cv, fit_intercept=fit_intercept, n_splits=n_splits, threads=threads)
        task_row[task] = row

    pending = list(task_row)
    all_ids = copy.deepcopy(pending)

    # Collect results as they complete, storing them in row order.
    while pending:
        finished, pending = ray.wait(pending)
        for task in finished:
            results[task_row[task]] = ray.get(task)

    pred1s, num1s, denom1s = zip(*results)
    ray.internal.free(all_ids)

    return (np.array(pred1s), np.array(num1s), np.array(denom1s))

In [None]:
def asymp_var_q(res_t):

    """Variance score used to rank g models for the q moments.

    Returns Var(res_t) / (mean(res_t^2))^2, where res_t are the residuals of t.
    """

    second_moment = np.mean(res_t * res_t)
    return np.var(res_t) / second_moment**2

In [None]:
def asymp_var_f(res_t, t):

    """Variance score used to rank g models for the f moments.

    Returns Var(res_t) / (mean(res_t * t))^2, where res_t are the residuals of t.
    """

    cross_moment = np.mean(res_t * t)
    return np.var(res_t) / cross_moment**2

In [None]:
def get_best(g_lassos, g_rfs1, g_zeros, n_splits, num_test, aux):

    """Pick, for every test point, the g model family with the smallest variance score.

    g_lassos, g_rfs1 and g_zeros each hold one (model_ts, res_t, t) tuple per test point.
    aux selects the score: "q" uses asymp_var_q(res_t), "f" uses asymp_var_f(res_t, t).
    A family wins only if its score is strictly smaller than both others; otherwise
    (including ties) the zero model is chosen.

    Returns (best_model, best_method): best_model[i] is the list of the n_splits per-fold
    models of the winning family for test point i, and best_method[i] is its label
    ("Lasso", "RF_big" or "0").
    Raises ValueError if aux is neither "q" nor "f".
    """

    if aux == "q":
        def score(entry):
            return asymp_var_q(entry[1])
    elif aux == "f":
        def score(entry):
            return asymp_var_f(entry[1], entry[2])
    else:
        raise ValueError("aux must be 'q' or 'f', got {!r}".format(aux))

    best_method = [None for _ in range(num_test)]
    best_model = [[None for _ in range(n_splits)] for _ in range(num_test)]

    for i in range(num_test):
        lasso_var = score(g_lassos[i])
        rf1_var = score(g_rfs1[i])
        zeros_var = score(g_zeros[i])

        # The winner does not depend on the fold, so decide once per test point.
        if lasso_var < rf1_var and lasso_var < zeros_var:
            chosen, label = g_lassos[i], "Lasso"
        elif rf1_var < lasso_var and rf1_var < zeros_var:
            chosen, label = g_rfs1[i], "RF_big"
        else:
            chosen, label = g_zeros[i], "0"

        best_method[i] = label
        best_model[i] = [chosen[0][j] for j in range(n_splits)]

    return best_model, best_method

In [None]:
def run_expt(data, cv, fit_intercept, n_splits, sigma, main_reg_params, aux_reg_params, parallel_params, seed):

    """Run one experiment on a (X_train, y_train, X_test, y_test) tuple.

    fit_intercept is a pair (main, g); parallel_params is a pair (chunk_size, threads).
    main_reg_params["method"] selects the main regression ("Lasso", "Ridge" or "Elastic").
    aux_reg_params["method"] selects the auxiliary g regression: "Best" fits Lasso, random-forest
    and zero g models and keeps the best per test point (see get_best); "0" uses g = 0 everywhere.

    Returns (fo1f, foq, base_preds, y_test, kappa, best_method_f, best_method_q): the f-moment and
    q-moment results, the predictions of a cross-validated baseline of the main regression type,
    the test targets, cond(X_train), and the chosen g method labels per test point.
    Raises Exception when either method name is not recognised.
    """

    chunk_size, threads = parallel_params
    fit_intercept_main, fit_intercept_g = fit_intercept
    np.random.seed(seed)
    X_train, y_train, X_test, y_test = data
    aux_reg = aux_reg_params["method"]
    main_reg = main_reg_params["method"]
    num_test = X_test.shape[0]

    # Cross-fitted main (f) regressions.
    if main_reg=="Lasso":
        f_main = fit_all_f_lassos((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads, alpha_scaling=1.0, n_folds=5)
    elif main_reg=="Ridge":
        f_main = fit_all_f_ridges((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads, alpha_scaling=1.0)
    elif main_reg=="Elastic":
        f_main = fit_all_f_elastics((X_train, y_train), sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads,  alpha_scaling=1.0, n_folds=5)
    else:
        raise Exception("Main Reg Set Incorrectly")

    # Auxiliary (g) regressions.
    if aux_reg=="Best":
        g_lassos = fit_all_g_lassos(X_train, test_data=X_test, sigma=sigma, cv=cv, fit_intercept=fit_intercept_g, n_splits=n_splits, threads=threads, alpha_scaling=1.0, n_folds=5)
        random_state1 = aux_reg_params["random_state1"]
        n_estimators1 = aux_reg_params["n_estimators1"]
        max_depth1 = aux_reg_params["max_depth1"]

        g_rfs1 = fit_all_g_rfs(X_train, test_data=X_test, n_splits=n_splits, n_estimators=n_estimators1, max_depth=max_depth1, random_state=random_state1, threads=threads)
        g_zeros = fit_all_g_zeros(X_train, X_test, n_splits)
        best_gs_f, best_method_f = get_best(g_lassos, g_rfs1, g_zeros, n_splits, num_test, aux="f")
        best_gs_q, best_method_q = get_best(g_lassos, g_rfs1, g_zeros, n_splits, num_test, aux="q")
    elif aux_reg=="0":
        # g = 0 for every test point and fold. Both moment types need their own
        # names defined here; previously only an unused `best_gs` was set, so this
        # branch crashed with a NameError further down.
        best_gs_f = [[None for _ in range(n_splits)] for _ in range(num_test)]
        best_gs_q = [[None for _ in range(n_splits)] for _ in range(num_test)]
        best_method_f = ["0"] * num_test
        best_method_q = ["0"] * num_test
    else:
        raise Exception("Aux Reg Set Incorrectly")

    fo1f = fit_all_f_moms_1f(train_data=(X_train, y_train), X_test=X_test, g_bases=best_gs_f, f_bases=f_main, sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads)
    foq = fit_all_q_moms(train_data=(X_train, y_train), X_test=X_test, g_bases=best_gs_q, f_method=main_reg, sigma=sigma, cv=cv, fit_intercept=fit_intercept_main, n_splits=n_splits, threads=threads)

    # Baseline: the main regression fit on all training data, always with CV.
    baseline_fitters = {"Lasso": fit_lasso, "Ridge": fit_ridge, "Elastic": fit_elastic}
    if main_reg not in baseline_fitters:
        raise Exception("Baseline Set Incorrectly")
    baseline = baseline_fitters[main_reg]((X_train, y_train), sigma, alpha_scaling=1, cv=True, fit_intercept=fit_intercept_main)
    base_preds = baseline.predict(X_test)

    kappa=cond(X_train)
    # Drop the cross-fitted models before returning to release memory early.
    del(f_main)
    gc.collect()

    return fo1f, foq, base_preds, y_test, kappa, best_method_f, best_method_q

In [None]:
def save_expts(dataset, cv, fit_intercept, n_splits, sigma, main_reg_params, aux_reg_params, seed, folder_path, parallel_params):

    """Run the experiment for one dataset, chunk by chunk, and pickle the results.

    dataset is one of "Triazines", "Wine", "Parkinson", "Fertility", "Fire", "Weather".
    fit_intercept is a pair (main, g); the g entry must be falsy. parallel_params is a pair
    (chunk_size, threads). The remaining arguments are forwarded to run_expt.

    y_train is de-meaned before fitting and the mean is added back to every prediction.
    The results are written to "OM_<dataset>_<aux method>_<timestamp>.pickle" inside folder_path.

    Returns the OrderedDict that was pickled.
    Raises ValueError for an unknown dataset name.
    """

    save_data = OrderedDict()

    chunk_size, threads = parallel_params
    fit_intercept_main = fit_intercept[0]
    fit_intercept_g = fit_intercept[1]
    save_data["dataset"]=str(dataset)
    save_data["best_g_method"]=str("Minimize Asymptotic Variance Heuristic")
    save_data["method"]="f moments and q moments"
    save_data["main_reg_params"]=main_reg_params
    save_data["fit_intercepts_main"]=str(fit_intercept_main)
    save_data["fit_intercepts_g"]=str(fit_intercept_g)
    save_data["format_keys"] = "p,n,s"
    save_data["output"] = "fo1f, foq, lasso_preds"
    save_data["n_splits"] = n_splits
    save_data["aux_reg_params"] = aux_reg_params
    save_data["seed"] = seed
    save_data["scale_X"] = True
    save_data["sigma"] = sigma

    # Loaders are wrapped so that only the selected dataset is actually read.
    loaders = {
        "Triazines": lambda: load_triazines(test_size=.20, random_state=seed),
        "Wine": load_wine,
        "Parkinson": load_parkinson,
        "Fertility": load_fertility,
        "Fire": load_forest_fires,
        "Weather": load_weather,
    }
    if dataset not in loaders:
        raise ValueError("Unknown dataset: {!r}".format(dataset))
    data = loaders[dataset]()

    print("Starting")
    ## De-Meaning y values (X_train and X_test values have already been scaled in loader). Need to add back mu_y to all final predictions.
    ## fit_intercept should be set to false
    # NOTE: this used to be written as assert(cond, msg), which asserts a non-empty
    # tuple and therefore never fired.
    assert not fit_intercept_g, "This should be set to false for all f regression. g_lasso has been manually set to include this"
    X_train, y_train, X_test, y_test = data
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    chunk_tests = chunk_data(X_test, y_test, chunk_size)
    all_chunks = [(X_train, y_train, chunk[0], chunk[1]) for chunk in chunk_tests]

    OM_fo1f_preds=[]
    OM_fo1f_nums=[]
    OM_fo1f_denoms=[]

    OM_foq_preds=[]
    OM_foq_nums=[]
    OM_foq_denoms=[]

    main_preds=[]
    y_tests=[]
    kappas=[]
    best_methods_f=[]
    best_methods_q=[]

    def flatten(parts):
        # Join the per-chunk arrays into one flat array.
        return np.concatenate(parts).ravel()

    for chunk_count, chunk in enumerate(all_chunks):
        print("Chunk Count", chunk_count)
        fo1f, foq, main_pred, y_chunk, kappa, best_method_f, best_method_q = run_expt(
            data=chunk, cv=cv, fit_intercept=fit_intercept, n_splits=n_splits, sigma=sigma,
            main_reg_params=main_reg_params, aux_reg_params=aux_reg_params,
            parallel_params=parallel_params, seed=seed)

        OM_fo1f_preds.append(fo1f[0])
        OM_fo1f_nums.append(fo1f[1])
        OM_fo1f_denoms.append(fo1f[2])

        OM_foq_preds.append(foq[0])
        OM_foq_nums.append(foq[1])
        OM_foq_denoms.append(foq[2])

        main_preds.append(main_pred)
        y_tests.append(y_chunk)
        kappas.append(kappa)
        best_methods_f.append(best_method_f)
        best_methods_q.append(best_method_q)
        gc.collect()

    # IMPORTANT: y-values have been demeaned in the beginning. Need to add this back to ALL predictions.
    OM_fo1f_preds= flatten(OM_fo1f_preds)+mu_y
    OM_fo1f_nums= flatten(OM_fo1f_nums)
    OM_fo1f_denoms= flatten(OM_fo1f_denoms)

    OM_foq_preds= flatten(OM_foq_preds)+mu_y
    OM_foq_nums= flatten(OM_foq_nums)
    OM_foq_denoms= flatten(OM_foq_denoms)

    main_preds= flatten(main_preds)+mu_y
    y_tests= flatten(y_tests)
    best_methods_f = [item for sublist in best_methods_f for item in sublist]
    best_methods_q = [item for sublist in best_methods_q for item in sublist]
    # Every chunk shares the same X_train, so only the last condition number is kept.
    kappas= kappas[-1]

    save_data["results"] = [
            OM_fo1f_preds,
            OM_fo1f_nums,
            OM_fo1f_denoms,
            OM_foq_preds,
            OM_foq_nums,
            OM_foq_denoms,
            main_preds,
            y_tests,
            kappas,
            best_methods_f,
            best_methods_q]

    print("Saving Data")
    # presumably keeps the second-resolution timestamps of consecutive runs distinct -- TODO confirm
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    aux_reg = str(aux_reg_params["method"])
    file_name = "OM_"+str(dataset)+"_"+str(aux_reg)+"_"+timestr+".pickle"

    file_path = os.path.join(folder_path, file_name)
    # Close the file deterministically instead of leaking the handle.
    with open(file_path, "wb") as out_file:
        pickle.dump(save_data, out_file)

    return save_data

In [None]:
# Experiment configuration shared by the cells below.
seed = 100
scaling_params = [1.0]
splits = [10]
datasets = ["Triazines", "Fire", "Fertility", "Wine", "Parkinson"]


cvs = [True]
computer_cpus = 48
# Each entry is (main regression params, chunk_size, threads),
# with threads = computer_cpus // chunk_size.
main_reg_params = [
    ({"method" : method}, chunk, computer_cpus//chunk)
    for method, chunk in (("Lasso", 48), ("Ridge", 12), ("Elastic", 48))
]
aux_reg_params = [{"method" : "Best", "n_estimators1" : 50, "random_state1" : seed, "max_depth1" : None}]
# (main regression, g regression)
fit_intercept = (False, False)

# Output folder "<month>-<day>-<path_options>", created if missing.
# NOTE(review): path_options is "Weather" although "Weather" is not in `datasets` -- confirm the intended label.
path_options = "Weather"
now = datetime.datetime.now()
folder_path = "-".join([str(now.month), str(now.day), path_options])
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [None]:
# Start ray with the CPU budget from the configuration cell (computer_cpus) instead of
# repeating the literal. The redis password can be overridden through the
# RAY_REDIS_PASSWORD environment variable; the previous hard-coded value is kept only
# as a fallback so existing runs behave the same.
# NOTE(review): avoid committing real credentials -- prefer setting the variable.
ray.init(object_store_memory=int(6e10), num_cpus=computer_cpus, redis_password=os.environ.get("RAY_REDIS_PASSWORD", "password34623345234"))
# Serialize sklearn's compiled tree objects with pickle when they pass through ray.
ray.register_custom_serializer(sklearn.tree._tree.Tree, use_pickle=True)

In [None]:
# Total number of save_expts runs, used by the run loop for the progress display.
# It is the size of the full parameter grid, including `cvs`, which the run loop
# also iterates over.
total = (len(aux_reg_params) * len(main_reg_params) * len(cvs)
         * len(splits) * len(scaling_params) * len(datasets))

In [None]:
# Run every configuration in the grid; the iteration order matches nested loops over
# main_reg_params, aux_reg_params, cvs, splits, scaling_params, datasets.
count=0
grid = itertools.product(main_reg_params, aux_reg_params, cvs, splits, scaling_params, datasets)
for (main_reg_param, chunk_size, threads), aux_reg_param, cv, split, sigma, dataset in grid:
    print("{0:.0%}".format(float(count)/total)+" Done")
    # sigma now comes from scaling_params (it was hard-coded to 1.0, which silently
    # ignored the loop variable).
    data = save_expts(dataset=dataset, cv=cv, fit_intercept=fit_intercept, n_splits=split, sigma=sigma, main_reg_params=main_reg_param, aux_reg_params=aux_reg_param, seed=seed, folder_path=folder_path, parallel_params=(chunk_size, threads))
    count+=1