From cc8995741506b5d4e237cb5dc693659cf12c499f Mon Sep 17 00:00:00 2001 From: fabianp Date: Tue, 21 Mar 2017 15:20:04 +0100 Subject: [PATCH 1/5] started work on BCD --- copt/gradient_descent.py | 4 +- copt/stochastic.py | 80 +++++++++++++++++++++++++++++++++++++--- copt/utils.py | 2 +- 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/copt/gradient_descent.py b/copt/gradient_descent.py index 707a82fe..2e135676 100644 --- a/copt/gradient_descent.py +++ b/copt/gradient_descent.py @@ -1,7 +1,8 @@ import warnings import numpy as np -from scipy import optimize, linalg +from scipy import optimize, linalg, sparse from datetime import datetime +from numba import njit # .. local imports .. from .utils import DummyProx @@ -285,7 +286,6 @@ def minimize_APGD( trace_time=trace_time) - def minimize_DavisYin( fun, fun_deriv, g_prox, h_prox, y0, alpha=1.0, beta=1.0, tol=1e-6, max_iter=1000, g_prox_args=(), h_prox_args=(), diff --git a/copt/stochastic.py b/copt/stochastic.py index 8bcd5af7..6dffa0c2 100644 --- a/copt/stochastic.py +++ b/copt/stochastic.py @@ -193,7 +193,7 @@ def _factory_sparse_SAGA(f, g): d[~idx] = 0. @njit(nogil=True) - def epoch_iteration_template( + def _saga_algorithm( x, memory_gradient, gradient_average, step_size, max_iter, job_id, tol, stop_flag, trace, trace_x): @@ -216,19 +216,19 @@ def epoch_iteration_template( p += x[j_idx] * A_data[j] grad_i = partial_gradient(p, b[i]) - mem_i = memory_gradient[i] # .. update coefficients .. for j in range(A_indptr[i], A_indptr[i+1]): j_idx = A_indices[j] - incr = (grad_i - mem_i) * A_data[j] + d[j_idx] * ( - gradient_average[j_idx] + f_alpha * x[j_idx]) + incr = (grad_i - memory_gradient[i]) * A_data[j] + incr += d[j_idx] * (gradient_average[j_idx] + f_alpha * x[j_idx]) x[j_idx] = prox(x[j_idx] - step_size * incr, step_size * d[j_idx]) # .. update memory terms .. for j in range(A_indptr[i], A_indptr[i+1]): j_idx = A_indices[j] - gradient_average[j_idx] += (grad_i - memory_gradient[i]) * A_data[j] / n_samples + gradient_average[j_idx] += ( + grad_i - memory_gradient[i]) * A_data[j] / n_samples memory_gradient[i] = grad_i if job_id == 0: @@ -253,5 +253,73 @@ def epoch_iteration_template( break return it, cert - return epoch_iteration_template + return _saga_algorithm + +def minimize_BCD(f, g=None, x0=None, step_size=None, max_iter=100): + """Block Coordinate Descent + + Parameters + ---------- + f + g + x0 + + Returns + ------- + + """ + + if x0 is None: + xk = np.zeros(f.n_features) + else: + xk = np.array(x0, copy=True) + if g is None: + g = utils.DummyProx() + if step_size is None: + step_size = 1. / (3 * f.lipschitz_constant()) + + Ax = f.A.dot(xk) + f_alpha = f.alpha + n_features, n_samples = f.A.shape + + partial_gradient = f.partial_gradient_factory() + prox = g.prox_factory() + + @njit(nogil=True) + def _bcd_algorithm(x, A_csc_data, A_csc_indices, A_csc_indptr, b): + feature_indices = np.arange(n_features) + for it in range(max_iter): + np.random.shuffle(feature_indices) + for j in feature_indices: + grad_j = 0. 
+ for i_indptr in range(A_csc_indptr[j], A_csc_indptr[j+1]): + # get the current sample + i_idx = A_csc_indices[i_indptr] + grad_j += partial_gradient(Ax[i_idx], b[i_idx]) * A_csc_data[i_indptr] / n_samples + x_new = prox(x[j] - step_size * (grad_j + f_alpha * x[j]), step_size) + for i_indptr in range(A_csc_indptr[j], A_csc_indptr[j+1]): + i_idx = A_csc_indices[i_indptr] + Ax[i_indptr] += A_csc_data[i_indptr] * (x_new - x[j]) + x[j] = x_new + + X_csc = sparse.csc_matrix(f.A) + + trace_func = [] + start = datetime.now() + trace_time = [(start - datetime.now()).total_seconds()] + + counter = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [] + for job_idx in range(max_iter): + idx = np.random.randint(0, n_features, n_features) + futures.append(executor.submit( + _bcd_algorithm, xk, X_csc.data, X_csc.indices, X_csc.indptr, idx, f.b)) + for _ in concurrent.futures.as_completed(futures): + if counter % n_jobs == 0: + trace_time.append((datetime.now() - start).total_seconds()) + trace_func.append(full_loss(xk)) + print(counter, trace_func[-1]) + counter += 1 + return trace_func, trace_time, xk diff --git a/copt/utils.py b/copt/utils.py index 09052985..98915590 100644 --- a/copt/utils.py +++ b/copt/utils.py @@ -69,7 +69,7 @@ class SquaredLoss: def __init__(self, A, b, alpha='auto'): self.b = b - self.A = sparse.csr_matrix(A) + self.A = A self.n_features = self.A.shape[1] if alpha == 'auto': self.alpha = 1. / A.shape[0] From 90f15f4390c4ae4c4f1ec00c670e64dfbe31c6d5 Mon Sep 17 00:00:00 2001 From: fabianp Date: Tue, 21 Mar 2017 21:57:50 +0100 Subject: [PATCH 2/5] BCD --- copt/__init__.py | 2 +- copt/stochastic.py | 56 ++++++++++++++++++++++++++++------------ tests/test_stochastic.py | 7 ++--- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/copt/__init__.py b/copt/__init__.py index f6b5f661..c24d3c68 100644 --- a/copt/__init__.py +++ b/copt/__init__.py @@ -2,7 +2,7 @@ from .gradient_descent import minimize_PGD, minimize_DavisYin, minimize_APGD from .primal_dual import fmin_CondatVu -from .stochastic import minimize_SAGA +from .stochastic import minimize_SAGA, minimize_BCD from .utils import LogisticLoss, SquaredLoss, L1Norm, TotalVariation2D from . import datasets from . import tv_prox diff --git a/copt/stochastic.py b/copt/stochastic.py index 6dffa0c2..2bf92910 100644 --- a/copt/stochastic.py +++ b/copt/stochastic.py @@ -98,7 +98,6 @@ def minimize_SAGA( trace_func = [] trace_time = [] - # .. iterate on epochs .. with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] for job_id in range(n_jobs): @@ -256,7 +255,9 @@ def _saga_algorithm( return _saga_algorithm -def minimize_BCD(f, g=None, x0=None, step_size=None, max_iter=100): +def minimize_BCD( + f, g=None, x0=None, step_size=None, max_iter=500, trace=False, verbose=False, + n_jobs=1): """Block Coordinate Descent Parameters @@ -277,17 +278,28 @@ def minimize_BCD(f, g=None, x0=None, step_size=None, max_iter=100): if g is None: g = utils.DummyProx() if step_size is None: - step_size = 1. / (3 * f.lipschitz_constant()) + step_size = 2. 
/ f.lipschitz_constant() Ax = f.A.dot(xk) f_alpha = f.alpha - n_features, n_samples = f.A.shape + n_samples, n_features = f.A.shape + success = False partial_gradient = f.partial_gradient_factory() prox = g.prox_factory() + start_time = datetime.now() + if trace: + trace_x = np.zeros((max_iter, n_features)) + else: + trace_x = np.zeros((0, 0)) + stop_flag = np.zeros(1, dtype=np.bool) + trace_func = [] + trace_time = [] + + @njit(nogil=True) - def _bcd_algorithm(x, A_csc_data, A_csc_indices, A_csc_indptr, b): + def _bcd_algorithm(x, Ax, A_csc_data, A_csc_indices, A_csc_indptr, b): feature_indices = np.arange(n_features) for it in range(max_iter): np.random.shuffle(feature_indices) @@ -300,8 +312,9 @@ def _bcd_algorithm(x, A_csc_data, A_csc_indices, A_csc_indptr, b): x_new = prox(x[j] - step_size * (grad_j + f_alpha * x[j]), step_size) for i_indptr in range(A_csc_indptr[j], A_csc_indptr[j+1]): i_idx = A_csc_indices[i_indptr] - Ax[i_indptr] += A_csc_data[i_indptr] * (x_new - x[j]) + Ax[i_idx] += A_csc_data[i_indptr] * (x_new - x[j]) x[j] = x_new + return it, None X_csc = sparse.csc_matrix(f.A) @@ -309,17 +322,26 @@ def _bcd_algorithm(x, A_csc_data, A_csc_indices, A_csc_indptr, b): start = datetime.now() trace_time = [(start - datetime.now()).total_seconds()] - counter = 0 with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] - for job_idx in range(max_iter): - idx = np.random.randint(0, n_features, n_features) + for job_id in range(n_jobs): futures.append(executor.submit( - _bcd_algorithm, xk, X_csc.data, X_csc.indices, X_csc.indptr, idx, f.b)) - for _ in concurrent.futures.as_completed(futures): - if counter % n_jobs == 0: - trace_time.append((datetime.now() - start).total_seconds()) - trace_func.append(full_loss(xk)) - print(counter, trace_func[-1]) - counter += 1 - return trace_func, trace_time, xk + _bcd_algorithm, + xk, Ax, X_csc.data, X_csc.indices, X_csc.indptr, f.b)) + concurrent.futures.wait(futures) + + n_iter, certificate = futures[0].result() + if trace: + delta = (datetime.now() - start_time).total_seconds() + trace_time = np.linspace(0, delta, n_iter) + if verbose: + print('Computing trace') + # .. compute function values .. + trace_func = [] + for i in range(n_iter): + # TODO: could be parallelized + trace_func.append(f(trace_x[i]) + g(trace_x[i])) + return optimize.OptimizeResult( + x=xk, success=success, nit=n_iter, trace_func=trace_func, trace_time=trace_time, + certificate=certificate) + diff --git a/tests/test_stochastic.py b/tests/test_stochastic.py index 36a2ae1c..92a40b4c 100644 --- a/tests/test_stochastic.py +++ b/tests/test_stochastic.py @@ -12,9 +12,10 @@ def test_optimize(): for alpha in np.logspace(-1, 3, 3): for X in (X_dense, X_sparse): - f = cp.LogisticLoss(X, y, alpha) - opt = cp.minimize_SAGA(f) - assert np.linalg.norm(f.gradient(opt.x)) < 1e-5 + for alg in (cp.minimize_SAGA, cp.minimize_BCD): + f = cp.LogisticLoss(X, y, alpha) + opt = alg(f) + assert np.linalg.norm(f.gradient(opt.x)) < 1e-5 def test_optimize_prox(): From eef74c2769ae1ffe221b634f8caebcaa86bbfaa0 Mon Sep 17 00:00:00 2001 From: Fabian Pedregosa Date: Tue, 21 Mar 2017 22:03:03 +0100 Subject: [PATCH 3/5] Update README.rst --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 84be225c..95644ab8 100644 --- a/README.rst +++ b/README.rst @@ -14,9 +14,9 @@ Project implementing some optimization routines in Python. Philosophy ========== - * No compiled code. State of the art performance. 
- * Automate as much as possible (line-search whenever possible). - * Optimization-centric API. + * Ø compiled code. State of the art performance. + * Modular, extensible, general-purpose optimization library. + * Emphasis on large-scale problems. License From ce93cd3b63661093eeacb0d83c4c98b4c220f1d5 Mon Sep 17 00:00:00 2001 From: Fabian Pedregosa Date: Tue, 21 Mar 2017 22:04:00 +0100 Subject: [PATCH 4/5] Update README.rst --- README.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/README.rst b/README.rst index 95644ab8..eb1b2a15 100644 --- a/README.rst +++ b/README.rst @@ -16,7 +16,6 @@ Philosophy * Ø compiled code. State of the art performance. * Modular, extensible, general-purpose optimization library. - * Emphasis on large-scale problems. License From 55566335168b506edf78611cf24f5b5e4798a4dd Mon Sep 17 00:00:00 2001 From: fabianp Date: Tue, 21 Mar 2017 22:33:04 +0100 Subject: [PATCH 5/5] misc --- copt/stochastic.py | 10 +++++++--- tests/test_stochastic.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/copt/stochastic.py b/copt/stochastic.py index 2bf92910..2f373efd 100644 --- a/copt/stochastic.py +++ b/copt/stochastic.py @@ -299,9 +299,10 @@ def minimize_BCD( @njit(nogil=True) - def _bcd_algorithm(x, Ax, A_csc_data, A_csc_indices, A_csc_indptr, b): + def _bcd_algorithm( + x, Ax, A_csc_data, A_csc_indices, A_csc_indptr, b, trace_x): feature_indices = np.arange(n_features) - for it in range(max_iter): + for it in range(1, max_iter): np.random.shuffle(feature_indices) for j in feature_indices: grad_j = 0. @@ -314,6 +315,8 @@ def _bcd_algorithm(x, Ax, A_csc_data, A_csc_indices, A_csc_indptr, b): i_idx = A_csc_indices[i_indptr] Ax[i_idx] += A_csc_data[i_indptr] * (x_new - x[j]) x[j] = x_new + if trace: + trace_x[it, :] = x return it, None X_csc = sparse.csc_matrix(f.A) @@ -327,7 +330,8 @@ def _bcd_algorithm(x, Ax, A_csc_data, A_csc_indices, A_csc_indptr, b): for job_id in range(n_jobs): futures.append(executor.submit( _bcd_algorithm, - xk, Ax, X_csc.data, X_csc.indices, X_csc.indptr, f.b)) + xk, Ax, X_csc.data, X_csc.indices, X_csc.indptr, f.b, + trace_x)) concurrent.futures.wait(futures) n_iter, certificate = futures[0].result() diff --git a/tests/test_stochastic.py b/tests/test_stochastic.py index 92a40b4c..c56c8a21 100644 --- a/tests/test_stochastic.py +++ b/tests/test_stochastic.py @@ -14,21 +14,22 @@ def test_optimize(): for X in (X_dense, X_sparse): for alg in (cp.minimize_SAGA, cp.minimize_BCD): f = cp.LogisticLoss(X, y, alpha) - opt = alg(f) + opt = alg(f, trace=True) assert np.linalg.norm(f.gradient(opt.x)) < 1e-5 def test_optimize_prox(): for alpha in np.logspace(-1, 3, 3): for X in (X_dense, X_sparse): - f = cp.LogisticLoss(X, y) - g = cp.L1Norm(alpha) - opt = cp.minimize_SAGA(f, g) - gamma = 1. / f.lipschitz_constant() - gmap = (opt.x - g.prox(opt.x - gamma * f.gradient(opt.x), gamma)) / gamma - assert np.linalg.norm(gmap) < 1e-5 + for alg in (cp.minimize_SAGA, cp.minimize_BCD): + f = cp.LogisticLoss(X, y) + g = cp.L1Norm(alpha) + opt = alg(f, g) + gamma = 1. / f.lipschitz_constant() + gmap = (opt.x - g.prox(opt.x - gamma * f.gradient(opt.x), gamma)) / gamma + assert np.linalg.norm(gmap) < 1e-5 + -# # # def test_prox_groups(): # """Test sparse problems with group structure
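
Usage sketch (not part of the patch series): the snippet below exercises the new `minimize_BCD` entry point the same way the updated `tests/test_stochastic.py` does. It assumes a copt build with these patches applied; the synthetic data and the printed check are illustrative assumptions, while `LogisticLoss`, `L1Norm`, `minimize_BCD`, `lipschitz_constant`, `gradient` and `prox` are the calls introduced or touched above, e.g.::

    # Illustrative sketch only -- assumes copt with the patches above applied.
    # The synthetic data is an assumption for this example; the copt calls
    # mirror the updated tests/test_stochastic.py.
    import numpy as np
    import copt as cp

    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    y = (rng.rand(100) > 0.5).astype(np.float64)  # assumed 0/1 labels

    f = cp.LogisticLoss(X, y, 1.0)   # smooth term
    g = cp.L1Norm(0.1)               # proximable term

    opt = cp.minimize_BCD(f, g, trace=True)

    # proximal-gradient mapping check, as in test_optimize_prox
    gamma = 1. / f.lipschitz_constant()
    gmap = (opt.x - g.prox(opt.x - gamma * f.gradient(opt.x), gamma)) / gamma
    print(np.linalg.norm(gmap))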