In [None]:
import numpy as np
import math
import pickle
import itertools
import time
import datetime
import scipy
#import cvxpy as cvx
#import mosek
import copy

import mkl
import pickle
import os
import ray
import warnings
import psutil
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from numpy import transpose as trans
from collections import OrderedDict

import subprocess
subprocess.call("bash convert_files.sh", shell=True)
from auxiliary import is_pos_def, cond, rotate_matrix,  gen_train_data, gen_test_data
from datasets import load_parkinson, load_triazines,  load_wine, load_fertility, load_forest_fires

In [None]:
# Start the local Ray runtime used by the @ray.remote tasks defined in later cells.
# ignore_reinit_error=True lets this cell be re-run in a live kernel without raising
# because Ray is already initialised.
# The Redis password can be supplied through the RAY_REDIS_PASSWORD environment variable.
# NOTE(review): the fallback literal is a credential committed to source; rotate it and
# drop the fallback once the environment variable is set wherever this notebook runs.
# NOTE(review): num_cpus and object_store_memory are fixed values, presumably sized for
# one specific machine -- confirm before running elsewhere.
ray.init(
    object_store_memory=int(5e10),
    num_cpus=48,
    redis_password=os.environ.get("RAY_REDIS_PASSWORD", "password54322423"),
    ignore_reinit_error=True,
)

In [None]:
def label_y_kernel(X_train, X_test, y_train, sigma):
    """Pseudo-label test points with a Gaussian-kernel weighted average of ``y_train``.

    For every row ``x`` of ``X_test`` the pseudo-label is::

        sum_j w_j * y_train[j] / (sum_j w_j + eps)
        w_j = exp(-||x - X_train[j]||**2 / (2 * sigma**2))

    where the distance is the Euclidean distance computed by ``cdist``.

    Parameters
    ----------
    X_train : 2-D array, one training point per row.
    X_test : 2-D array, one point to label per row (same number of columns).
    y_train : array of training targets, one per row of ``X_train``.
    sigma : kernel bandwidth.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_test`` as an array, and the pseudo-labels (one per row of ``X_test``).

    Raises
    ------
    AssertionError
        If the pseudo-labels or ``X_test`` contain NaN.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial` has been loaded.
    from scipy.spatial.distance import cdist

    # Small constant (10e-10 == 1e-9) that keeps the denominator non-zero when
    # every kernel weight underflows to 0.
    eps = 10e-10
    dists = cdist(X_test, X_train)
    # np.exp replaces the deprecated `scipy.exp` alias, which newer SciPy
    # releases no longer provide.
    weights = np.exp(-dists**2 / (2 * sigma**2))

    sum_weights = np.sum(weights, axis=1) + eps
    y_pseudos = np.divide(weights @ y_train, sum_weights)

    assert not np.isnan(y_pseudos).any()
    assert not np.isnan(X_test).any()

    return np.array(X_test), np.array(y_pseudos)

In [None]:
def fit_ridge(train_data, sigma, cv, fit_intercept, alpha_scaling=1.0):
    """Fit a ridge regression on ``train_data`` and return the fitted estimator.

    Parameters
    ----------
    train_data : pair ``(X_train, y_train)``; ``X_train`` must expose a 2-D ``.shape``.
    sigma : accepted for signature compatibility; not used by this function.
    cv : if truthy, fit ``RidgeCV`` over 100 log-spaced penalties between 1e-6 and 1e1;
        otherwise fit a plain ``Ridge`` with ``alpha=alpha_scaling``.
    fit_intercept : forwarded to the estimator (required; there is no default).
    alpha_scaling : ridge penalty used only when ``cv`` is falsy.

    Returns
    -------
    The fitted ``Ridge`` or ``RidgeCV`` estimator.
    """
    X_train, y_train = train_data
    # Not used further; the unpacking fails early if X_train is not 2-D.
    _n_rows, _n_cols = X_train.shape

    from sklearn.linear_model import Ridge, RidgeCV

    warnings.filterwarnings("ignore")

    if cv:
        # Penalty chosen by RidgeCV's cross-validation over the grid.
        model = RidgeCV(alphas=np.logspace(-6, 1, num=100), fit_intercept=fit_intercept)
    else:
        # Fixed, caller-supplied penalty.
        model = Ridge(alpha=alpha_scaling, fit_intercept=fit_intercept)

    model.fit(X_train, y_train)
    return model

In [None]:
def fit_TD_reg(X_train, y_train, X_pseudo, y_pseudo, lamb, C):
    """Ridge coefficients for training data augmented with weighted pseudo-labelled rows.

    The pseudo rows and labels are scaled by ``sqrt(C)`` and stacked under the
    training data, so the returned ``beta`` minimises::

        ||y_train - X_train b||^2 + C * ||y_pseudo - X_pseudo b||^2 + lamb * ||b||^2

    Parameters
    ----------
    X_train : 2-D array, shape (n_train, p).
    y_train : training targets (flattened by ``np.append``).
    X_pseudo : a single row of length p, or a 2-D block with p columns.
    y_pseudo : pseudo-label(s) matching ``X_pseudo``.
    lamb : ridge penalty added to the diagonal of the Gram matrix.
    C : non-negative weight of the pseudo-labelled rows (``math.sqrt`` rejects negatives).

    Returns
    -------
    numpy.ndarray of shape (p,)
        The coefficient vector.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the penalised Gram matrix is singular.
    """
    warnings.filterwarnings("ignore")

    rC = math.sqrt(C)
    X_tot = np.vstack((X_train, rC * X_pseudo))
    y_tot = np.append(y_train, rC * y_pseudo)
    p = X_tot.shape[1]

    gram = X_tot.T @ X_tot + lamb * np.eye(p)
    moment = X_tot.T @ y_tot
    # Solve the normal equations directly instead of forming an explicit
    # inverse: same solution, better numerical behaviour and less work.
    return np.linalg.solve(gram, moment)

In [None]:
@ray.remote
def fit_TD_reg_par(X_train, y_train, X_pseudo, y_pseudo, lamb, C):
    """Ray task: fit the pseudo-label-augmented ridge and predict at ``X_pseudo``.

    Delegates the fit to ``fit_TD_reg`` so the serial and parallel paths share
    one solver, then returns ``X_pseudo @ beta``.

    Parameters match ``fit_TD_reg``: training data, the pseudo-labelled
    point(s) and label(s), ridge penalty ``lamb`` and pseudo-row weight ``C``.

    Returns
    -------
    The prediction(s) of the fitted model at ``X_pseudo``.
    """
    warnings.filterwarnings("ignore")
    # Restrict MKL to one thread inside this task.
    mkl.set_num_threads(1)

    beta = fit_TD_reg(X_train, y_train, X_pseudo, y_pseudo, lamb, C)
    return X_pseudo @ beta

In [None]:
@ray.remote
def CV_kernel_SP(X_train, y_train, X_test, sigma, C, best_lam):
    """Ray task: 5-fold cross-validation score for one ``(sigma, C)`` pair.

    In each fold the held-out rows are pseudo-labelled from the fitting rows
    with ``label_y_kernel``. Then, one held-out row at a time, a model is fitted
    with ``fit_TD_reg`` on the fitting rows plus that single pseudo-labelled row,
    and its absolute error against the true held-out target is recorded.

    ``X_test`` is accepted but not used here.

    Returns
    -------
    tuple ``(sigma, C, mean_error)`` where ``mean_error`` is the mean of the
    per-row errors over all folds.
    """
    mkl.set_num_threads(1)
    fold_errors = []
    for fit_idx, held_idx in KFold(n_splits=5).split(X_train):
        X_fit = X_train[fit_idx, :]
        y_fit = y_train[fit_idx]
        X_held, y_held_pseudo = label_y_kernel(X_train[fit_idx], X_train[held_idx], y_train[fit_idx], sigma)
        for row in range(X_held.shape[0]):
            point = X_held[row, :]
            beta = fit_TD_reg(X_fit, y_fit, point, y_held_pseudo[row], best_lam, C)
            residual = point @ beta - y_train[held_idx[row]]
            fold_errors.append(np.linalg.norm(residual))

    return (sigma, C, np.mean(fold_errors))

In [None]:
def run_TDRegKernel_expt(data, cv, fit_intercept, sigma_grid, C_grid):
    """Run one kernel pseudo-label ridge experiment on a train/test split.

    Steps:
      1. Fit a preliminary ridge with ``fit_ridge`` to obtain the penalty.
      2. Score every ``(sigma, C)`` pair with the ``CV_kernel_SP`` Ray task and
         keep the pair with the smallest error.
      3. Pseudo-label ``X_test`` with the best ``sigma``, then fit one model per
         test row with the ``fit_TD_reg_par`` Ray task and collect its prediction.

    If there are fewer training rows than CV folds, the preliminary ridge's
    predictions on ``X_test`` are returned instead.

    Parameters
    ----------
    data : tuple ``(X_train, y_train, X_test, y_test)``.
    cv : forwarded to ``fit_ridge`` (cross-validated vs fixed penalty).
    fit_intercept : forwarded to ``fit_ridge``.
    sigma_grid, C_grid : iterables of candidate bandwidths / pseudo-row weights.

    Returns
    -------
    (predictions, y_test)
        ``predictions`` is a list with one entry per test row on the main path,
        or the array returned by ``predict`` on the fallback path.
    """
    X_train, y_train, X_test, y_test = data

    # Preliminary fit: supplies the ridge penalty used by every later fit.
    f_main = fit_ridge((X_train, y_train), sigma=1.0, cv=cv, fit_intercept=fit_intercept, alpha_scaling=1.0)
    # RidgeCV exposes the selected penalty as `alpha_`; a plain Ridge
    # (cv=False) only carries the `alpha` it was constructed with.
    best_lam = f_main.alpha_ if hasattr(f_main, "alpha_") else f_main.alpha

    # Must match the number of folds used inside CV_kernel_SP.
    n_splits = 5
    if X_train.shape[0] < n_splits:
        return f_main.predict(X_test), y_test

    # Store the shared arrays in Ray's object store once and pass references,
    # instead of re-serialising them for every task submitted below.
    X_train_ref = ray.put(X_train)
    y_train_ref = ray.put(y_train)
    X_test_ref = ray.put(X_test)

    inter = 10
    pending = []
    for sigma in sigma_grid:
        for C in C_grid:
            pending.append(CV_kernel_SP.remote(X_train_ref, y_train_ref, X_test_ref, sigma, C, best_lam))
            # Printed once per `inter` submissions (tasks are queued here, not yet finished).
            if len(pending) % inter == 0:
                print(str(inter) + " done")

    scores = ray.get(pending)
    best_sigma, best_C, _ = min(scores, key=lambda x: x[-1])

    print("Labelling Final")
    X_pseudo, y_pseudo = label_y_kernel(X_train, X_test, y_train, best_sigma)

    print("Fitting Final")
    preds = [
        fit_TD_reg_par.remote(X_train_ref, y_train_ref, X_pseudo[i, :], y_pseudo[i], best_lam, best_C)
        for i in range(X_pseudo.shape[0])
    ]
    TD_main_preds = ray.get(preds)

    return TD_main_preds, y_test

In [None]:
def save_TDRegKernel_expts(dataset, fit_intercept, cv, sigma_grid, C_grid, folder_path):
    """Run the kernel pseudo-label ridge experiment on a named dataset and pickle the result.

    The training targets are centred by their mean before fitting and the mean
    is added back to the predictions before saving.

    Parameters
    ----------
    dataset : one of "Triazines", "Wine", "Parkinson", "Fertility", "Fire".
    fit_intercept, cv, sigma_grid, C_grid : forwarded to ``run_TDRegKernel_expt``.
    folder_path : existing directory in which the pickle file is written.

    Returns
    -------
    collections.OrderedDict
        The saved record: run metadata plus ``"results" = [main_preds, y_tests]``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    loaders = {
        "Triazines": lambda: load_triazines(test_size=.20),
        "Wine": load_wine,
        "Parkinson": load_parkinson,
        "Fertility": load_fertility,
        "Fire": load_forest_fires,
    }
    # Fail with a clear message instead of an UnboundLocalError further down.
    if dataset not in loaders:
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(dataset, sorted(loaders))
        )

    save_data = OrderedDict()
    save_data["main_reg_params"] = "TDLasso"
    save_data["dataset"] = str(dataset)
    save_data["fit_intercept"] = str(fit_intercept)
    save_data["sigma_grid"] = sigma_grid
    save_data["C_grid"] = C_grid
    # NOTE(review): this description lists mu_y, but "results" below stores only
    # [main_preds, y_tests] -- confirm which one downstream readers expect.
    save_data["output"] = "main_preds, y_test, mu_y"

    X_train, y_train, X_test, y_test = loaders[dataset]()

    # Centre the training targets; the mean is restored on the predictions below.
    mu_y = np.mean(y_train)
    y_train = y_train - mu_y

    print("Starting")

    main_pred, y_test = run_TDRegKernel_expt(
        (X_train, y_train, X_test, y_test),
        cv=cv,
        fit_intercept=fit_intercept,
        sigma_grid=sigma_grid,
        C_grid=C_grid,
    )
    main_preds = [i + mu_y for i in main_pred]
    y_tests = y_test

    save_data["results"] = [main_preds, y_tests]

    print("Saving Data")
    # Presumably keeps the second-resolution timestamps of back-to-back runs distinct.
    time.sleep(1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = "TDRegKernel_" + str(dataset) + "_" + timestr + ".pickle"

    file_path = os.path.join(folder_path, file_name)
    # Context manager guarantees the file is flushed and closed.
    with open(file_path, "wb") as out_file:
        pickle.dump(save_data, out_file)

    return save_data

In [None]:
# Results go in a "<month>-<day>-<path_options>" folder, relative to the working directory.
path_options = "TDRegKernelSP_Real"
now = datetime.datetime.now()
folder_path = "{}-{}-{}".format(now.month, now.day, path_options)
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [None]:
# Experiment configuration: datasets to run, ridge options and hyper-parameter grids.
computer_cpus = psutil.cpu_count()
datasets = ["Fertility", "Triazines", "Fire", "Wine", "Parkinson"]
cv = True
fit_intercept = False
path_options = "TDRegKernelSP_Real"

# 30 log-spaced kernel bandwidths between 1e-2 and 1e2.
sigma_grid = np.logspace(-2, 2, 30)
# Same log-spaced grid for the pseudo-row weight, with 1 appended as an extra candidate.
C_grid = np.concatenate((np.logspace(-2, 2, 30), [1]))

# Output folder named "<month>-<day>-<path_options>"; created if missing.
now = datetime.datetime.now()
folder_path = "{}-{}-{}".format(now.month, now.day, path_options)
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [None]:
# Run and save the experiment for every configured dataset.
# `count` ends as the number of datasets processed; `data_lin_reg` holds the last saved record.
count = 0
for count, dataset in enumerate(datasets, start=1):
    data_lin_reg = save_TDRegKernel_expts(
        dataset=dataset,
        fit_intercept=fit_intercept,
        cv=cv,
        sigma_grid=sigma_grid,
        C_grid=C_grid,
        folder_path=folder_path,
    )