In [1]:
import numpy as np
import math
import pickle
import itertools
import time
import datetime
#import cvxpy as cvx
#import mosek
import copy

import mkl
import pickle
import os
import ray
import warnings
import psutil
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from numpy import transpose as trans
from collections import OrderedDict

import subprocess
# Run the conversion script before the local-module imports that follow.
# NOTE(review): presumably this is what produces auxiliary.py / datasets.py — confirm.
# An argument list is used instead of shell=True: the command is a fixed string,
# so no shell parsing is needed.
_convert_status = subprocess.call(["bash", "convert_files.sh"])
if _convert_status != 0:
    # Do not abort here: modules generated by an earlier run may still import fine.
    # Surfacing the status makes a later ImportError (or stale code) explainable.
    print("convert_files.sh exited with status {}; the local imports below may fail or be stale".format(_convert_status))
from auxiliary import is_pos_def, cond, rotate_matrix,  gen_train_data, gen_test_data
from datasets import load_parkinson, load_triazines,  load_wine, load_fertility, load_forest_fires

In [2]:
# Start the Ray runtime used by the parallel per-test-point fits.
# The Redis password is taken from the RAY_REDIS_PASSWORD environment variable
# when it is set; otherwise a random one is generated for this session, so that
# no credential is stored in the notebook itself.
# object_store_memory is in bytes (5e10 bytes); num_cpus is the number of CPUs
# Ray is told it may schedule on.
ray.init(
    object_store_memory=int(5e10),
    num_cpus=48,
    redis_password=os.environ.get("RAY_REDIS_PASSWORD") or os.urandom(16).hex(),
)

2020-04-11 22:00:52,419	INFO resource_spec.py:212 -- Starting Ray with 186.38 GiB memory available for workers and up to 46.57 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '169.229.49.177',
 'redis_address': '169.229.49.177:33092',
 'object_store_address': '/tmp/ray/session_2020-04-11_22-00-52_417166_37291/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-04-11_22-00-52_417166_37291/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2020-04-11_22-00-52_417166_37291'}

In [3]:
def fit_lasso(train_data, sigma, cv, fit_intercept, alpha_scaling, n_folds=5):
    r"""Fit a Lasso regression on ``train_data`` and return the fitted estimator.

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``; ``X_train.shape`` must unpack to ``(n, p)``.
    sigma : float
        Factor in the fixed regularizer; only used when ``cv`` is false.
    cv : bool
        If true, the regularizer is selected by ``LassoCV`` from the grid
        ``np.logspace(-6, 1, num=100)``. If false, the fixed value
        ``alpha_scaling * sigma * \sqrt{2 \log(p) / n}`` is used.
    fit_intercept : bool
        Forwarded to the estimator. This argument is required (no default).
    alpha_scaling : float
        Multiplier on the fixed regularizer; only used when ``cv`` is false.
    n_folds : int, optional
        Passed as ``cv=`` to ``LassoCV`` (default 5); ignored when ``cv`` is false.

    Returns
    -------
    The fitted ``Lasso`` (``cv`` false) or ``LassoCV`` (``cv`` true) object.
    """
    X_train, y_train = train_data
    n, p = X_train.shape

    # Local import: ``Lasso`` is not among the notebook's top-level sklearn imports.
    from sklearn.linear_model import Lasso, LassoCV

    warnings.filterwarnings("ignore")

    if not cv:
        # Fixed regularization level, no cross-validation.
        alpha = alpha_scaling * sigma * math.sqrt(2 * (math.log(p) / n))
        lasso = Lasso(alpha=alpha, max_iter=5000, fit_intercept=fit_intercept)
    else:
        # Cross-validated choice of alpha over a fixed log-spaced grid.
        alphas = np.logspace(-6, 1, num=100)
        lasso = LassoCV(max_iter=5000, cv=n_folds, alphas=alphas, fit_intercept=fit_intercept)

    lasso.fit(X_train, y_train)
    return lasso

In [4]:
@ray.remote
def fit_lasso_par(train_data, sigma, cv, fit_intercept, alpha_scaling, n_folds=5):
    r"""Ray task: fit a Lasso on ``train_data`` and predict its last row.

    The fit is delegated to ``fit_lasso`` with the same arguments, so the
    regularizer is either cross-validated (``cv`` true) or fixed at
    ``alpha_scaling * sigma * \sqrt{2 \log(p) / n}`` (``cv`` false).

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``. The last row of ``X_train`` is the point that
        is predicted after fitting.
    sigma, cv, fit_intercept, alpha_scaling, n_folds
        Forwarded unchanged to ``fit_lasso``.

    Returns
    -------
    The fitted model's prediction for ``X_train[-1, :]``, i.e.
    ``coef_ @ x + intercept_``.
    """
    # Restrict MKL to a single thread inside the task.
    # NOTE(review): presumably to avoid oversubscribing cores when many tasks
    # run at once — confirm.
    mkl.set_num_threads(1)

    lasso = fit_lasso(
        train_data,
        sigma=sigma,
        cv=cv,
        fit_intercept=fit_intercept,
        alpha_scaling=alpha_scaling,
        n_folds=n_folds,
    )

    X_train, _ = train_data
    # Include the intercept so the prediction is also correct when
    # fit_intercept=True; with fit_intercept=False sklearn sets intercept_ to 0.0,
    # which leaves the previous result unchanged.
    y_pred = lasso.coef_ @ X_train[-1, :] + lasso.intercept_

    return y_pred

In [5]:
def run_TDLassoSP_expt(data, cv, fit_intercept):
    """Run one experiment: a preliminary Lasso fit followed by per-test-point refits.

    ``data`` is unpacked as ``(X_train, y_train, X_test, y_test)``. A first model
    is fitted on the training set and used to produce a pseudo-label for every
    test row. Then, for each test row, the training set is extended by that row
    and its pseudo-label and ``fit_lasso_par`` is launched as a Ray task on the
    extended data. ``cv`` and ``fit_intercept`` are forwarded to both fitters;
    ``sigma`` and ``alpha_scaling`` are fixed at 1.0.

    Returns ``(main_preds, y_test)`` where ``main_preds`` is the list of task
    results in test-row order and ``y_test`` is passed through untouched.
    """
    X_train, y_train, X_test, y_test = data

    # Stage 1: preliminary fit on the training data only.
    base_model = fit_lasso(
        (X_train, y_train),
        sigma=1.0,
        cv=cv,
        fit_intercept=fit_intercept,
        alpha_scaling=1.0,
    )
    pseudo_labels = base_model.predict(X_test)

    # Stage 2: one remote refit per test row on the augmented training set.
    n_test, _ = X_test.shape
    pending = [
        fit_lasso_par.remote(
            (
                np.vstack((X_train, X_test[row, :])),
                np.hstack((y_train, pseudo_labels[row])),
            ),
            sigma=1.0,
            cv=cv,
            fit_intercept=fit_intercept,
            alpha_scaling=1.0,
        )
        for row in range(n_test)
    ]
    main_preds = ray.get(pending)

    return main_preds, y_test

In [6]:
def _load_real_dataset(dataset):
    """Return the loader output for a dataset name; raise ValueError if the name is unknown."""
    if dataset == "Triazines":
        return load_triazines(test_size=.20)
    if dataset == "Wine":
        return load_wine()
    if dataset == "Parkinson":
        return load_parkinson()
    if dataset == "Fertility":
        return load_fertility()
    if dataset == "Fire":
        return load_forest_fires()
    raise ValueError(
        "Unknown dataset {!r}; expected one of "
        "'Triazines', 'Wine', 'Parkinson', 'Fertility', 'Fire'".format(dataset)
    )


def save_TDLasso_expts(dataset, fit_intercept, cv, folder_path):
    """Run the TDLasso experiment on one real dataset and pickle the results.

    Parameters
    ----------
    dataset : str
        One of "Triazines", "Wine", "Parkinson", "Fertility", "Fire".
    fit_intercept : bool
        Forwarded to ``run_TDLassoSP_expt``.
    cv : bool
        Forwarded to ``run_TDLassoSP_expt``.
    folder_path : str
        Directory that receives ``TDLassoSP_<dataset>_<timestamp>.pickle``.

    Returns
    -------
    OrderedDict
        The saved record. ``record["results"]`` is ``[main_preds, y_test]``,
        where the training-target mean has been added back to every prediction.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # Resolve the dataset first so an unknown name fails before any work is done.
    X_train, y_train, X_test, y_test = _load_real_dataset(dataset)

    save_data = OrderedDict()
    save_data["main_reg_params"] = "TDLasso"
    save_data["dataset"] = str(dataset)
    save_data["fit_intercept"] = str(fit_intercept)
    # NOTE(review): this label mentions mu_y, but "results" below stores only
    # main_preds and y_test (mu_y is already folded into main_preds). The string
    # is left unchanged in case existing readers of the pickles rely on it.
    save_data["output"] = "main_preds, y_test, mu_y"

    # Center the training targets; the mean is added back to the predictions below.
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    print("Starting")

    main_pred, y_test = run_TDLassoSP_expt(
        (X_train, y_train, X_test, y_test), cv=cv, fit_intercept=fit_intercept
    )
    main_preds = [i + mu_y for i in main_pred]

    save_data["results"] = [main_preds, y_test]

    print("Saving Data")
    # NOTE(review): presumably the pause keeps the second-resolution timestamp
    # unique across back-to-back calls — confirm.
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = "TDLassoSP_" + str(dataset) + "_" + timestr + ".pickle"

    # Make sure the target directory exists so a long run is not lost at save time.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)
    # Context manager guarantees the file is flushed and closed.
    with open(file_path, "wb") as out_file:
        pickle.dump(save_data, out_file)

    return save_data

In [7]:
# Output directory for the result pickles, named "<month>-<day>-<path_options>".
path_options = "TDLassoSP_Real"
now = datetime.datetime.now()
folder_path = "{}-{}-{}".format(now.month, now.day, path_options)
# exist_ok=True replaces the exists()-then-makedirs() pair: no race between the
# check and the creation, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

In [8]:
# Experiment configuration read by the driver loop.
computer_cpus = psutil.cpu_count()  # CPU count reported by psutil
datasets = ["Fertility", "Triazines", "Fire", "Wine", "Parkinson"]
cv = True              # choose the Lasso regularizer by cross-validation
fit_intercept = False  # forwarded to the Lasso estimators
path_options = "TDLassoSP_Real"

# Output directory for the result pickles, named "<month>-<day>-<path_options>".
now = datetime.datetime.now()
folder_path = "{}-{}-{}".format(now.month, now.day, path_options)
# exist_ok=True replaces the exists()-then-makedirs() pair: no race between the
# check and the creation, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

In [None]:
# Run the experiment once per configured dataset. `count` holds the 1-based
# position of the dataset currently being processed; `data_lin_reg` keeps the
# record returned by the most recent call.
count = 0
for count, dataset in enumerate(datasets, start=1):
    data_lin_reg = save_TDLasso_expts(
        dataset=dataset,
        fit_intercept=fit_intercept,
        cv=cv,
        folder_path=folder_path,
    )

Starting
Saving Data
Starting
Saving Data
Starting
Saving Data
Starting
Saving Data
Starting
