In [None]:
import numpy as np
import math
import pickle
import itertools
import time
import datetime
import scipy
#import cvxpy as cvx
#import mosek
import copy

import mkl
import pickle
import os
import ray
import warnings
import psutil
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from numpy import transpose as trans
from collections import OrderedDict

import subprocess
# Run convert_files.sh through bash before the project-local imports below.
# subprocess.call returns the script's exit status, which is discarded here, so
# a failing script does not stop the notebook.
# NOTE(review): presumably this script produces the `auxiliary` / `datasets`
# modules imported next -- confirm, and consider checking the return code.
subprocess.call("bash convert_files.sh", shell=True)
from auxiliary import is_pos_def, cond, rotate_matrix,  gen_train_data, gen_test_data
from datasets import load_parkinson, load_triazines,  load_wine, load_fertility, load_forest_fires

In [None]:
# Start Ray for the parallel lamb_trans search further down.
# Guarded so that re-running this cell in a live kernel does not fail because
# Ray is already initialised.
if not ray.is_initialized():
    ray.init(
        object_store_memory=int(5e10),  # 5e10 bytes requested for the object store
        num_cpus=48,  # NOTE(review): hard-coded; confirm this matches the target machine
        # Prefer a password supplied through the environment. The literal is
        # kept only as the legacy default so existing runs behave the same;
        # NOTE(review): it is a credential committed to source -- rotate it.
        redis_password=os.environ.get("RAY_REDIS_PASSWORD", "password54322423"),
    )

In [None]:
def fit_ridge(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0):
    """Fit a ridge regression on ``train_data`` and return the fitted estimator.

    Parameters
    ----------
    train_data : tuple
        ``(X_train, y_train)``. ``X_train.shape`` is unpacked into two values,
        so ``X_train`` has to be 2-D.
    sigma :
        Accepted for call compatibility; this function does not use it.
    cv : bool
        Truthy -> fit ``RidgeCV`` over 100 log-spaced penalties between 1e-6
        and 1e1. Falsy -> fit a plain ``Ridge`` with ``alpha=alpha_scaling``.
    fit_intercept : bool
        Forwarded unchanged to the estimator (there is no default).
    alpha_scaling : float, default 1.0
        Penalty used when ``cv`` is falsy; ignored otherwise.

    Returns
    -------
    The fitted ``Ridge`` or ``RidgeCV`` instance.

    Notes
    -----
    Calls ``warnings.filterwarnings("ignore")``, which installs a process-wide
    "ignore" filter as a side effect.
    """
    features, targets = train_data
    _n_samples, _n_features = features.shape  # doubles as a 2-D check on the design matrix

    from sklearn.linear_model import Ridge, RidgeCV

    warnings.filterwarnings("ignore")

    if cv:
        # Penalty chosen by RidgeCV from a fixed logarithmic grid.
        model = RidgeCV(alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed, caller-supplied penalty.
        model = Ridge(alpha=alpha_scaling, fit_intercept=fit_intercept)

    model.fit(features, targets)
    return model

In [None]:
def build_M(X_train, X_test, lamb):
    """Build the quadratic-form matrix ``M`` over the stacked train+test rows.

    With ``X`` the row-wise stack of ``X_train`` and ``X_test``::

        C        = I - X (X^T X + lamb * I)^-1 X^T
        C_scaled = C with column j divided by C[j, j]
        M        = C_scaled^T C_scaled

    Parameters
    ----------
    X_train : array-like, 2-D, shape (n_train, p)
    X_test : array-like
        Either a single point of shape (p,) or a batch of shape (n_test, p).
    lamb : float
        Ridge penalty added to ``X^T X``.

    Returns
    -------
    numpy.ndarray of shape (n_train + n_test, n_train + n_test).

    Raises
    ------
    ValueError
        If ``X_train`` is not 2-D, if ``X_test`` is neither 1-D nor 2-D, or if
        the feature counts disagree (raised by ``np.vstack``).
    """
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    if X_train.ndim != 2:
        raise ValueError(
            "X_train must be 2-D (n_train, p); got ndim={}".format(X_train.ndim)
        )
    # Reject unsupported ranks up front; otherwise the sample count would be undefined.
    if X_test.ndim not in (1, 2):
        raise ValueError(
            "X_test must be 1-D (one point) or 2-D (n_test, p); got ndim={}".format(X_test.ndim)
        )

    # vstack treats a 1-D X_test as a single extra row.
    X = np.vstack((X_train, X_test))
    n_tot, p = X.shape

    Ainv = np.linalg.inv(X.T @ X + lamb * np.eye(p))
    C = np.eye(n_tot) - X @ Ainv @ X.T

    # Broadcasting v[None, :] divides every entry of column j by C[j, j].
    # NOTE(review): C is symmetric, so this yields D^-1 C^2 D^-1 rather than the
    # row-scaled C D^-2 C form -- confirm column scaling is what is intended.
    v = np.diag(C)
    C_scaled = C / v[None, :]

    return C_scaled.T @ C_scaled

In [None]:
def fit_TDVals(X_train, y_train, X_pseudo, y_pseudo, lamb_ridge, lamb_trans):
    """Solve for the transductive predictions at the pseudo-labelled points.

    ``M = build_M(X_train, X_pseudo, lamb_ridge)`` is split into its
    train/pseudo block ``M12 = M[:n_train, n_train:]`` and its pseudo/pseudo
    block ``M22 = M[n_train:, n_train:]``. The returned vector ``y`` solves::

        (M22 + lamb_trans * I) y = -M12^T y_train + lamb_trans * y_pseudo

    Parameters
    ----------
    X_train : array of shape (n_train, p)
    y_train : array of length n_train
    X_pseudo : array of shape (p,) for one point or (n_pseudo, p) for a batch
    y_pseudo : scalar / 0-d array for one point, or array of length n_pseudo
        Initial guesses that the solution is pulled towards.
    lamb_ridge : float
        Ridge penalty forwarded to ``build_M``.
    lamb_trans : float
        Weight of the pull towards ``y_pseudo``.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_pseudo,)`` when ``y_train`` is 1-D (``(1,)`` for a single
        pseudo point).

    Notes
    -----
    Calls ``warnings.filterwarnings("ignore")`` as a side effect.
    """
    warnings.filterwarnings("ignore")

    n_train = len(X_train)

    M = build_M(X_train, X_pseudo, lamb_ridge)
    M_cross = M[:n_train, n_train:]
    M_pseudo = M[n_train:, n_train:]
    # Take the pseudo-point count from the block itself so it always matches M.
    n_pseudo = M_pseudo.shape[0]

    system = M_pseudo + lamb_trans * np.eye(n_pseudo)
    rhs = -M_cross.T @ y_train + lamb_trans * np.asarray(y_pseudo)

    # This must be a linear solve (matrix product with the inverse). An
    # element-wise `*` would broadcast to an (n_pseudo, n_pseudo) array and is
    # only correct by coincidence when n_pseudo == 1.
    return np.linalg.solve(system, rhs)

In [None]:
@ray.remote
def CV_vals(X_train, y_train, X_test, lamb_trans):
    """Ray task: 5-fold cross-validation error of the transductive fit for one ``lamb_trans``.

    For every fold a cross-validated ridge (no intercept) is fitted on the
    in-fold rows. Each held-out row is then predicted on its own with
    ``fit_TDVals``, using the ridge prediction as the pseudo label and the
    fold's selected ``alpha_`` as the ridge penalty. The error for a row is the
    norm of (prediction - true target).

    Parameters
    ----------
    X_train, y_train : arrays indexed by the fold index arrays.
    X_test : not used by this function.
    lamb_trans : float
        Transductive penalty being evaluated.

    Returns
    -------
    tuple ``(lamb_ridge, lamb_trans, mean_error)`` where ``lamb_ridge`` is the
    ridge penalty selected on the last fold.
    """
    # Keep each worker single-threaded in MKL.
    mkl.set_num_threads(1)

    fold_errors = []
    splitter = KFold(n_splits=5)
    for fit_idx, held_idx in splitter.split(X_train):
        X_fit, y_fit = X_train[fit_idx], y_train[fit_idx]
        ridge = fit_ridge((X_fit, y_fit), sigma=1.0, cv=True, fit_intercept=False, alpha_scaling=1.0)

        X_held = X_train[held_idx]
        ridge_guesses = ridge.predict(X_held)
        lamb_ridge = ridge.alpha_

        # One transductive solve per held-out point.
        for pos, guess in enumerate(ridge_guesses):
            point_pred = fit_TDVals(
                X_fit,
                y_fit,
                np.array(X_held[pos]),
                np.array(guess),
                lamb_ridge,
                lamb_trans,
            )
            residual = point_pred - y_train[held_idx[pos]]
            fold_errors.append(np.linalg.norm(residual))

    return (lamb_ridge, lamb_trans, np.mean(fold_errors))

In [None]:
def run_TDVals_expt(data, cv, fit_intercept, lamb_trans_grid):
    """Run one transductive-ridge experiment on a train/test split.

    Parameters
    ----------
    data : tuple
        ``(X_train, y_train, X_test, y_test)``.
    cv : bool
        Forwarded to ``fit_ridge`` for the preliminary fit. Both ``True``
        (``RidgeCV``) and ``False`` (plain ``Ridge``) are supported.
    fit_intercept : bool
        Forwarded to ``fit_ridge`` for the preliminary fit.
    lamb_trans_grid : iterable of float
        Candidate transductive penalties; each is scored by one ``CV_vals``
        Ray task.

    Returns
    -------
    tuple ``(predictions, y_test)``.
        With fewer than 5 training rows the predictions are the plain ridge
        predictions. Otherwise they come from ``fit_TDVals`` using the
        penalties of the lowest-error ``CV_vals`` result.

    Raises
    ------
    ValueError
        If there are at least 5 training rows and ``lamb_trans_grid`` is empty.
    """
    X_train, y_train, X_test, y_test = data

    # Preliminary ridge fit; its predictions are the pseudo labels for X_test.
    # The fitted estimator's alpha_ is deliberately not read here: plain Ridge
    # (cv=False) has no alpha_, and the value was never used.
    f_main = fit_ridge((X_train, y_train), sigma=1.0, cv=cv, fit_intercept=fit_intercept, alpha_scaling=1.0)

    # CV_vals uses 5 folds, so smaller training sets fall back to plain ridge.
    if X_train.shape[0] < 5:
        return f_main.predict(X_test), y_test

    report_every = 10
    pending = []
    for submitted, lamb_trans in enumerate(lamb_trans_grid, start=1):
        pending.append(CV_vals.remote(X_train, y_train, X_test, lamb_trans))
        # Counts submitted tasks, not finished ones.
        if submitted % report_every == 0:
            print(str(report_every) + " done")

    if not pending:
        raise ValueError("lamb_trans_grid is empty; nothing to cross-validate")

    cv_results = ray.get(pending)
    # Each result is (lamb_ridge, lamb_trans, mean_error); pick the smallest error.
    # NOTE(review): the ridge penalty used for the final fit is the one carried
    # by the winning CV result (taken from a CV fold), not the preliminary
    # fit's own penalty -- confirm this is intended.
    best_lam_ridge, best_lam_trans, _best_err = min(cv_results, key=lambda res: res[-1])

    print("Fitting Final")
    y_pseudo = f_main.predict(X_test)
    TDVals_preds = fit_TDVals(X_train, y_train, X_test, y_pseudo, best_lam_ridge, best_lam_trans)

    return TDVals_preds, y_test

In [None]:
def save_TDRegKernel_expts(dataset, fit_intercept, cv, lamb_trans_grid, folder_path):
    """Run the transductive-ridge experiment on one named dataset and pickle the result.

    Parameters
    ----------
    dataset : str
        One of ``"Triazines"``, ``"Wine"``, ``"Parkinson"``, ``"Fertility"``,
        ``"Fire"``.
    fit_intercept : bool
        Forwarded to ``run_TDVals_expt`` and recorded in the output.
    cv : bool
        Forwarded to ``run_TDVals_expt``.
    lamb_trans_grid : iterable of float
        Candidate transductive penalties; recorded in the output.
    folder_path : str
        Existing directory that receives
        ``TDVals_<dataset>_<YYYYmmdd-HHMMSS>.pickle``.

    Returns
    -------
    collections.OrderedDict
        The metadata plus ``"results" = [predictions, y_test]``. Predictions
        have the training-target mean added back.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # Resolve the dataset first so a typo fails fast with a clear message.
    if dataset == "Triazines":
        data = load_triazines(test_size=.20)
    elif dataset == "Wine":
        data = load_wine()
    elif dataset == "Parkinson":
        data = load_parkinson()
    elif dataset == "Fertility":
        data = load_fertility()
    elif dataset == "Fire":
        data = load_forest_fires()
    else:
        raise ValueError(
            "Unknown dataset {!r}; expected one of 'Triazines', 'Wine', "
            "'Parkinson', 'Fertility', 'Fire'".format(dataset)
        )

    save_data = OrderedDict()
    save_data["main_reg_params"] = "TDRidge"
    save_data["dataset"] = str(dataset)
    save_data["fit_intercept"] = str(fit_intercept)
    save_data["lamb_trans_grid"] = lamb_trans_grid
    # NOTE(review): this descriptor lists mu_y, but "results" below holds only
    # [main_preds, y_tests] -- confirm which one is intended.
    save_data["output"] = "main_preds, y_test, mu_y"

    X_train, y_train, X_test, y_test = data

    # Centre the training targets; the mean is added back to the predictions.
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    print("Starting")

    main_pred, y_tests = run_TDVals_expt(
        (X_train, y_train, X_test, y_test),
        cv=cv,
        fit_intercept=fit_intercept,
        lamb_trans_grid=lamb_trans_grid,
    )
    main_preds = [pred + mu_y for pred in main_pred]

    save_data["results"] = [main_preds, y_tests]

    print("Saving Data")
    # File names have one-second resolution.
    # NOTE(review): the pause presumably keeps back-to-back runs from colliding -- confirm.
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = "TDVals_" + str(dataset) + "_" + timestr + ".pickle"
    file_path = os.path.join(folder_path, file_name)

    # Context manager guarantees the handle is flushed and closed.
    with open(file_path, "wb") as handle:
        pickle.dump(save_data, handle)

    return save_data

In [None]:
# Output folder for this run, relative to the working directory:
# "<month>-<day>-<path_options>".
path_options = "TDValsSP_Real"
now = datetime.datetime.now()
folder_path = "-".join([str(now.month), str(now.day), path_options])
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the mkdir, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

In [None]:
# ---- Experiment configuration ----
# CPU count as reported by psutil (not referenced elsewhere in the visible cells).
computer_cpus = psutil.cpu_count()

# Datasets to run; each name must be one understood by save_TDRegKernel_expts.
datasets = ["Fertility", "Triazines", "Fire", "Wine", "Parkinson"]
cv = True              # preliminary ridge fit selects its penalty with RidgeCV
fit_intercept = False  # fit without an intercept
path_options = "TDValsSP_Real"

# 30 log-spaced candidate transductive penalties from 1e-6 to 1e3.
lamb_trans_grid = np.logspace(start=-6, stop=3, num=30)

# Output folder for this run: "<month>-<day>-<path_options>".
now = datetime.datetime.now()
folder_path = "-".join([str(now.month), str(now.day), path_options])
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the mkdir, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

In [None]:
# Run the experiment once per configured dataset. Each call returns its
# results, which are kept in `data_lin_reg` (overwritten every iteration, so it
# ends up holding the last dataset's results). `count` tallies the datasets
# started so far.
run_kwargs = dict(
    fit_intercept=fit_intercept,
    cv=cv,
    lamb_trans_grid=lamb_trans_grid,
    folder_path=folder_path,
)
count = 0
for dataset in datasets:
    count = count + 1
    data_lin_reg = save_TDRegKernel_expts(dataset=dataset, **run_kwargs)