In [1]:
import numpy as np
import ray
import scipy
import math
import datetime
#import mkl
import numpy.linalg as la
import os
from sklearn.linear_model import RidgeCV
import pickle

In [2]:
# Start the local Ray runtime used by the parallel experiment runs below.
# The CPU count and the Redis password can be overridden through the
# environment so the notebook is not tied to one 48-core machine or to a
# password committed in the source; the fallbacks keep the original settings.
# NOTE(review): "123456" is a trivially guessable fallback -- set
# RAY_REDIS_PASSWORD when running on a shared or networked host.
ray.init(
    num_cpus=int(os.environ.get("RAY_NUM_CPUS", "48")),
    redis_password=os.environ.get("RAY_REDIS_PASSWORD", "123456"),
)

2020-01-21 11:16:10,096	INFO node.py:469 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2020-01-21_11-16-10_19349/logs.
2020-01-21 11:16:10,206	INFO services.py:407 -- Waiting for redis server at 127.0.0.1:37404 to respond...
2020-01-21 11:16:10,325	INFO services.py:407 -- Waiting for redis server at 127.0.0.1:45969 to respond...
2020-01-21 11:16:10,330	INFO services.py:804 -- Starting Redis shard with 10.0 GB max memory.
2020-01-21 11:16:10,348	INFO node.py:483 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2020-01-21_11-16-10_19349/logs.
2020-01-21 11:16:10,353	INFO services.py:1427 -- Starting the Plasma object store with 20.0 GB memory using /dev/shm.


{'node_ip_address': '169.229.48.80',
 'redis_address': '169.229.48.80:37404',
 'object_store_address': '/tmp/ray/session_2020-01-21_11-16-10_19349/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-01-21_11-16-10_19349/sockets/raylet',
 'webui_url': None}

In [3]:
def eigs(M):
    """Eigendecomposition of a square matrix, sorted by decreasing eigenvalue.

    Symmetric (Hermitian) input is routed through ``la.eigh`` so that the
    eigenvalues come back real and the eigenvectors orthonormal; the general
    solver ``la.eig`` can return complex values with tiny imaginary parts for
    such matrices. Any other input still goes through ``la.eig``.

    Parameters
    ----------
    M : array_like
        Square matrix to decompose.

    Returns
    -------
    eigenValues : ndarray
        Eigenvalues in descending order.
    eigenVectors : ndarray
        Matrix whose columns are the eigenvectors, reordered to match
        ``eigenValues``.
    """
    M = np.asarray(M)

    is_square = M.ndim == 2 and M.shape[0] == M.shape[1] and M.size > 0
    symmetric = False
    if is_square:
        # Tolerance is relative to the largest entry so the check does not
        # depend on the overall scale of M.
        scale = np.abs(M).max()
        symmetric = bool(np.allclose(M, M.conj().T, rtol=0.0, atol=1e-12 * scale))

    if symmetric:
        eigenValues, eigenVectors = la.eigh(M)
    else:
        eigenValues, eigenVectors = la.eig(M)

    # argsort is ascending; reverse it to get the largest eigenvalue first.
    idx = eigenValues.argsort()[::-1]

    return eigenValues[idx], eigenVectors[:, idx]

In [4]:
def gen_train_model(d, r, T, train_n):
    """Sample a shared low-dimensional linear model and T training tasks.

    A d x r matrix ``B`` with orthonormal columns is taken from the left
    singular vectors of a Gaussian d x r matrix. Each task ``i`` gets its own
    coefficient vector ``alpha_i`` (Gaussian, standard deviation
    ``1/sqrt(r)``), standard-normal features ``X`` of shape ``(train_n, d)``
    and responses ``y = X @ B @ alpha_i + noise`` with standard-normal noise.

    Parameters
    ----------
    d : int
        Feature dimension.
    r : int
        Number of columns of ``B``.
    T : int
        Number of training tasks.
    train_n : int
        Number of samples per training task.

    Returns
    -------
    train_data : list of (X, y) tuples, one per task.
    B : ndarray of shape (d, r).
    train_alphas : list of T ndarrays of shape (r,).
    """
    # Thin SVD: only the first r left singular vectors are used, so there is
    # no need to build the full d x d factor.
    u, _, _ = la.svd(np.random.normal(size=(d, r)), full_matrices=False)
    B = u[:, :r]

    # All task coefficients are drawn before any task data.
    train_alphas = [np.random.normal(size=r, scale=1/math.sqrt(r)) for _ in range(T)]
    train_data = []
    for alpha in train_alphas:
        X = np.random.normal(size=(train_n, d))
        y = X @ B @ alpha + np.random.normal(size=train_n)
        train_data.append((X, y))

    return train_data, B, train_alphas

In [5]:
def gen_test_model(d, r, B, test_n):
    """Sample one held-out task that shares the representation ``B``.

    Draws a coefficient vector ``alpha`` (Gaussian, standard deviation
    ``1/sqrt(r)``), standard-normal features ``X`` of shape ``(test_n, d)``
    and responses ``y = X @ B @ alpha + noise`` with standard-normal noise.

    Returns ``((X, y), alpha)``.
    """
    # Draw order (alpha, then X, then noise) fixes how the global NumPy
    # random stream is consumed.
    coef_scale = 1/math.sqrt(r)
    alpha = np.random.normal(size=r, scale=coef_scale)
    X = np.random.normal(size=(test_n, d))
    signal = X @ B @ alpha
    noise = np.random.normal(size=test_n)
    y = signal + noise

    task = (X, y)
    return task, alpha

In [6]:
def MoM(train_data):
    """Pooled second-moment matrix ``(1/N) * sum_i y_i**2 * x_i x_i^T``.

    ``train_data`` is a sequence of ``(X, y)`` pairs; the sum runs over every
    sample of every task and ``N`` is the total number of samples. The result
    has shape ``(d, d)`` where ``d`` is the number of columns of the first
    task's ``X``.
    """
    d = train_data[0][0].shape[1]

    M = np.zeros(shape=(d, d))
    total_n = 0
    for X, y in train_data:
        total_n += y.shape[0]
        # Row i of `weighted` is y_i * x_i, so weighted.T @ weighted is the
        # sum of y_i**2 * x_i x_i^T over this task's samples.
        weighted = np.multiply(X.T, y).T
        M += weighted.T @ weighted

    return 1/float(total_n) * M

In [7]:
def rPCA(M, r):
    """Split the eigenvectors of ``M`` into the leading ``r`` and the rest.

    Eigenvectors are ordered by decreasing eigenvalue (see ``eigs``). Returns
    ``(top, rest)`` where ``top`` holds the first ``r`` columns and ``rest``
    the remaining ones.
    """
    _, basis = eigs(M)
    top = basis[:, :r]
    rest = basis[:, r:]

    return top, rest

In [8]:
def MetaLR(train_data, r, test_data):
    """Estimate a shared r-dimensional subspace, then regress inside it.

    The subspace ``B1`` is the span of the top ``r`` eigenvectors of the
    moment matrix computed from ``train_data``. The test features are
    projected onto ``B1``, and both least squares and cross-validated ridge
    regression are fitted in the projected space; the fitted coefficients are
    mapped back to the original space through ``B1``.

    Parameters
    ----------
    train_data : sequence of (X, y) pairs used to estimate the subspace.
    r : int
        Number of leading eigenvectors to keep.
    test_data : (X, y) pair for the task to fit.

    Returns
    -------
    B1 : the estimated basis (top ``r`` eigenvectors as columns).
    beta_LR : least-squares coefficients in the original space.
    beta_RR : ridge coefficients in the original space.
    """
    M_est = MoM(train_data)
    # The complementary eigenvectors are not needed here.
    B1, _ = rPCA(M_est, r)

    X, y = test_data
    X_low = X @ B1
    alpha_LR = LR((X_low, y))
    beta_LR = B1 @ alpha_LR

    alpha_RR = ridge_regression((X_low, y))
    beta_RR = B1 @ alpha_RR

    return B1, beta_LR, beta_RR

In [9]:
def LR(test_data):
    """Minimum-norm least-squares coefficients for ``y ~ X @ beta``.

    Parameters
    ----------
    test_data : (X, y) pair
        ``X`` is the design matrix and ``y`` the response vector.

    Returns
    -------
    beta_LR : ndarray
        ``pinv(X) @ y``. When ``X`` has fewer rows than columns this is the
        minimum-norm solution.
    """
    X, y = test_data
    # pinv(X) @ y equals pinv(X.T @ X) @ X.T @ y mathematically, but avoids
    # forming the Gram matrix, whose condition number is the square of X's.
    beta_LR = la.pinv(X) @ y

    return beta_LR

In [10]:
def ridge_regression(test_data):
    """Fit ridge regression without intercept, penalty chosen by ``RidgeCV``.

    ``test_data`` is an ``(X, y)`` pair. The penalty is selected from
    ``[1e-3, 1e-2, 1e-1, 1, 10]``; the fitted coefficient vector is returned.
    """
    features, targets = test_data
    penalty_grid = [1e-3, 1e-2, 1e-1, 1, 10]
    model = RidgeCV(alphas=penalty_grid, fit_intercept=False)
    model.fit(features, targets)

    return model.coef_

In [11]:
@ray.remote
def run_expt(d, r, T, train_n, test_n):
    """Run one repetition of the experiment as a Ray task.

    Generates ``T`` training tasks and one test task that share the same
    ``B``, fits the two subspace-based estimators (``MetaLR``) and the two
    baselines that only see the test task (``LR``, ``ridge_regression``), and
    returns the Euclidean distance of each estimate from the true coefficient
    vector, in the order (meta LR, meta ridge, LR, ridge).
    """
    #mkl.set_num_threads(1)
    train_data, B, train_alphas = gen_train_model(d=d, r=r, T=T, train_n=train_n)
    test_data, alpha_test = gen_test_model(d, r, B, test_n)

    B_meta, beta_meta_LR, beta_meta_RR = MetaLR(train_data, r, test_data)
    beta_LR = LR(test_data)
    beta_RR = ridge_regression(test_data)

    # Ground-truth coefficients of the test task in the original space.
    beta_true = B @ alpha_test

    estimates = (beta_meta_LR, beta_meta_RR, beta_LR, beta_RR)
    return tuple(np.linalg.norm(estimate - beta_true) for estimate in estimates)

In [12]:
def run_parallel_expt(d, r, T, train_n, test_n, reps):
    """Run ``reps`` independent repetitions of ``run_expt`` through Ray.

    Parameters
    ----------
    d, r, T, train_n, test_n :
        Forwarded unchanged to ``run_expt``.
    reps : int
        Number of repetitions to launch.

    Returns
    -------
    meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs : tuple
        Four tuples of length ``reps`` with the per-repetition errors of each
        estimator. For ``reps <= 0`` four empty tuples are returned.
    """
    # With no repetitions there is nothing to unzip; unpacking zip(*[]) into
    # four names would raise ValueError.
    if reps <= 0:
        return (), (), (), ()

    data = ray.get([run_expt.remote(d, r, T, train_n, test_n) for _ in range(reps)])
    # Transpose the list of 4-tuples into four per-estimator tuples.
    meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs = zip(*data)

    return meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs

In [None]:
# Settings for the sweep over the number of training tasks.
d = 250        # feature dimension
r = 5          # number of columns of the shared matrix B
train_n = 50   # samples per training task
test_n = 1000  # samples in the test task
reps = 50      # repetitions per setting

In [None]:
# Numbers of training tasks to sweep: 200, 400, ..., 25600 (doubling each step).
T_list = [200 * 2 ** k for k in range(8)]

In [None]:
def collect_data(d, r, T_list, train_n, test_n, reps):
    """Sweep the number of training tasks and summarise estimator errors.

    For every ``t`` in ``T_list`` the value is printed, ``run_parallel_expt``
    is called with ``T=t``, and the mean and standard deviation of each
    estimator's errors over the repetitions are recorded.

    Returns four ``(means, stds)`` pairs of lists, in the order
    (meta LR, meta ridge, LR, ridge); each list has one entry per element of
    ``T_list``.
    """
    # One (means, stds) accumulator per estimator, in return order.
    meta_LR_summary = ([], [])
    meta_RR_summary = ([], [])
    LR_summary = ([], [])
    ridge_summary = ([], [])
    summaries = (meta_LR_summary, meta_RR_summary, LR_summary, ridge_summary)

    for t in T_list:
        print(t)
        meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs = run_parallel_expt(d, r, t, train_n, test_n, reps)

        per_estimator = (meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs)
        for (means, stds), errs in zip(summaries, per_estimator):
            means.append(np.mean(errs))
            stds.append(np.std(errs))

    return summaries

In [None]:
# Each result is a (means, stds) pair with one entry per value in T_list.
(meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs) = collect_data(
    d, r, T_list, train_n, test_n, reps
)

In [None]:
# Bundle the four error summaries under the keys used in the saved pickle.
save_data = {
    "meta_LR_errs": meta_LR_errs,
    "meta_RR_errs": meta_RR_errs,
    "LR_errs": LR_errs,
    "ridge_errs": ridge_errs,
}

In [None]:
# Store the experiment settings next to the results.
save_data.update({
    "T_list": T_list,
    "d": d,
    "r": r,
    "train_n": train_n,
    "test_n": test_n,
    "reps": reps,
})

In [None]:
# Display the (means, stds) summary of the subspace-based least-squares errors.
meta_LR_errs

In [None]:
# Display the (means, stds) summary of the plain least-squares baseline errors.
LR_errs

In [None]:
# Parameter tag embedded in the output file name.
params = "d={},r={},train_n={},test_n={}".format(d, r, train_n, test_n)

In [None]:
# Persist the results under Data/, with the parameter tag in the file name.
file_name = "Meta,"+str(params)+".pickle"
folder_name = "Data"
file_path = os.path.join(folder_name, file_name)
# Create the output folder if it is missing so a finished run is not lost to
# FileNotFoundError, and close the file as soon as the dump completes.
os.makedirs(folder_name, exist_ok=True)
with open(file_path, "wb") as out_file:
    pickle.dump(save_data, out_file)

In [13]:
# Settings for the sweep over the number of samples per training task.
d = 250      # feature dimension
r = 5        # number of columns of the shared matrix B
T = 5        # number of training tasks
test_n = 50  # samples in the test task
reps = 50    # repetitions per setting

In [14]:
# Per-task sample sizes to sweep: 200, 400, ..., 102400 (doubling each step).
train_n_list = [200 * 2 ** k for k in range(10)]

In [15]:
def collect_data_two(d, r, T, train_n_list, test_n, reps):
    """Sweep the per-task training sample size and summarise estimator errors.

    For every ``train_n`` in ``train_n_list`` the value is printed,
    ``run_parallel_expt`` is called with that sample size, and the mean and
    standard deviation of each estimator's errors over the repetitions are
    recorded.

    Returns four ``(means, stds)`` pairs of lists, in the order
    (meta LR, meta ridge, LR, ridge); each list has one entry per element of
    ``train_n_list``.
    """
    # One (means, stds) accumulator per estimator, in return order.
    meta_LR_summary = ([], [])
    meta_RR_summary = ([], [])
    LR_summary = ([], [])
    ridge_summary = ([], [])
    summaries = (meta_LR_summary, meta_RR_summary, LR_summary, ridge_summary)

    for train_n in train_n_list:
        print(train_n)
        meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs = run_parallel_expt(d, r, T, train_n, test_n, reps)

        per_estimator = (meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs)
        for (means, stds), errs in zip(summaries, per_estimator):
            means.append(np.mean(errs))
            stds.append(np.std(errs))

    return summaries

In [16]:
# Each result is a (means, stds) pair with one entry per value in train_n_list.
(meta_LR_errs, meta_RR_errs, LR_errs, ridge_errs) = collect_data_two(
    d, r, T, train_n_list, test_n, reps
)

200
400
800
1600
3200
6400
12800
25600
51200
102400


In [17]:
# Bundle the four error summaries under the keys used in the saved pickle.
save_data = {
    "meta_LR_errs": meta_LR_errs,
    "meta_RR_errs": meta_RR_errs,
    "LR_errs": LR_errs,
    "ridge_errs": ridge_errs,
}

In [18]:
# Store the experiment settings next to the results.
save_data.update({
    "train_n_list": train_n_list,
    "d": d,
    "r": r,
    "T": T,
    "test_n": test_n,
    "reps": reps,
})

In [19]:
# Display the (means, stds) summary of the subspace-based least-squares errors,
# one entry per value in train_n_list.
meta_LR_errs

([1.0403261497819054,
  1.0322152738166275,
  1.0222502992773077,
  0.9857749422854022,
  0.8854153674462011,
  0.8710563454429616,
  0.8676156939695596,
  0.7956075758343925,
  0.7416193543967622,
  0.6799367561980745],
 [0.2809080890711224,
  0.29384769557973134,
  0.2963589364912514,
  0.29306850185057737,
  0.2223742413441995,
  0.2751128933854435,
  0.299820979095013,
  0.2458481880314302,
  0.20463032186848037,
  0.24034387922623493])

In [20]:
# Display the (means, stds) summary of the plain least-squares baseline errors,
# one entry per value in train_n_list.
LR_errs

([0.9943879282685006,
  0.9964168895640239,
  1.0168620572364815,
  0.993445348282015,
  0.9707447530941228,
  0.9926002155230571,
  1.0336180171220446,
  0.9778886590510113,
  0.97184120635318,
  0.992397619828775],
 [0.22677043311723694,
  0.2360985513835468,
  0.24266581846039484,
  0.2439514421771052,
  0.2132004688989472,
  0.22720370465828513,
  0.28824642849312837,
  0.23412770699383587,
  0.1835103607820434,
  0.22725914773773329])

In [21]:
# Parameter tag embedded in the output file name.
params = "d={},r={},T={},test_n={}".format(d, r, T, test_n)

In [22]:
# Persist the results under Data/, with the parameter tag in the file name.
file_name = "Meta,"+str(params)+".pickle"
folder_name = "Data"
file_path = os.path.join(folder_name, file_name)
# Create the output folder if it is missing so a finished run is not lost to
# FileNotFoundError, and close the file as soon as the dump completes.
os.makedirs(folder_name, exist_ok=True)
with open(file_path, "wb") as out_file:
    pickle.dump(save_data, out_file)