ENH add ElasticNet (#230)

Co-authored-by: mathurinm <mathurin.massias@gmail.com>
mathurinm · May 13, 2022 · 4230db1 · 4230db1
1 parent a4263f3
commit 4230db1
Show file tree

Hide file tree

Showing 13 changed files with 657 additions and 106 deletions.
diff --git a/README.rst b/README.rst
@@ -9,6 +9,8 @@ Currently, the package handles the following problems:
 
 - Lasso
 - Weighted Lasso
+- ElasticNet
+- Weighted ElasticNet
 - Sparse Logistic regression
 - Weighted Group Lasso
 - Multitask Lasso

diff --git a/celer/PN_logreg.pyx b/celer/PN_logreg.pyx
@@ -13,8 +13,8 @@ from libc.math cimport fabs, sqrt, exp
 from sklearn.exceptions import ConvergenceWarning
 
 from .cython_utils cimport fdot, faxpy, fcopy, fposv, fscal, fnrm2
-from .cython_utils cimport (primal, dual, create_dual_pt, create_accel_pt,
-                            sigmoid, ST, LOGREG, dnorm_l1,
+from .cython_utils cimport (primal, dual, create_dual_pt,
+                            sigmoid, ST, LOGREG, dnorm_enet,
                             compute_Xw, compute_norms_X_col, set_prios)
 
 cdef:
@@ -35,6 +35,10 @@ def newton_celer(
     else:
         dtype = np.float32
 
+    # Enet not supported for Logreg
+    cdef floating l1_ratio = 1.0
+    cdef floating norm_w2 = 0.
+
     cdef int verbose_in = max(0, verbose - 1)
     cdef int n_samples = y.shape[0]
     cdef int n_features = w.shape[0]
@@ -102,19 +106,19 @@ def newton_celer(
     cdef bint positive = 0
 
     for t in range(max_iter):
-        p_obj = primal(LOGREG, alpha, Xw, y, w, weights_pen)
+        p_obj = primal(LOGREG, alpha, l1_ratio, Xw, y, w, weights_pen)
 
         # theta = y * sigmoid(-y * Xw)
         create_dual_pt(LOGREG, n_samples, &theta[0], &Xw[0], &y[0])
-        norm_Xtheta = dnorm_l1(
-            is_sparse, theta, X, X_data, X_indices, X_indptr,
-            screened, X_mean, weights_pen, center, positive)
+        norm_Xtheta = dnorm_enet(
+            is_sparse, theta, w, X, X_data, X_indices, X_indptr,
+            screened, X_mean, weights_pen, center, positive, alpha, l1_ratio)
 
         if norm_Xtheta > alpha:
             tmp = alpha / norm_Xtheta
             fscal(&n_samples, &tmp, &theta[0], &inc)
 
-        d_obj = dual(LOGREG, n_samples, 0., &theta[0], &y[0])
+        d_obj = dual(LOGREG, n_samples, alpha, l1_ratio, 0., norm_w2, &theta[0], &y[0])
         gap = p_obj - d_obj
 
         if t != 0 and use_accel:
@@ -165,15 +169,15 @@ def newton_celer(
             for i in range(n_samples):
                 exp_Xw[i] = exp(Xw[i])
 
-            norm_Xtheta_acc = dnorm_l1(
-                is_sparse, theta_acc, X, X_data, X_indices, X_indptr,
-                screened, X_mean, weights_pen, center, positive)
+            norm_Xtheta_acc = dnorm_enet(
+                is_sparse, theta_acc, w, X, X_data, X_indices, X_indptr,
+                screened, X_mean, weights_pen, center, positive, alpha, l1_ratio)
 
             if norm_Xtheta_acc > alpha:
                 tmp = alpha / norm_Xtheta_acc
                 fscal(&n_samples, &tmp, &theta_acc[0], &inc)
 
-            d_obj_acc = dual(LOGREG, n_samples, 0., &theta_acc[0], &y[0])
+            d_obj_acc = dual(LOGREG, n_samples, alpha, l1_ratio, 0., norm_w2, &theta_acc[0], &y[0])
             if d_obj_acc > d_obj:
                 fcopy(&n_samples, &theta_acc[0], &inc, &theta[0], &inc)
                 gap = p_obj - d_obj_acc
@@ -188,7 +192,7 @@ def newton_celer(
             break
 
 
-        set_prios(is_sparse, theta, alpha, X, X_data, X_indices, X_indptr,
+        set_prios(is_sparse, theta, w, alpha, l1_ratio, X, X_data, X_indices, X_indptr,
                   norms_X_col, weights_pen, prios, screened, radius,
                   &n_screened, 0)
 
@@ -249,6 +253,10 @@ cpdef int PN_logreg(
     cdef int ws_size = WS.shape[0]
     cdef int n_features = w.shape[0]
 
+    # Enet not supported for Logreg
+    cdef floating l1_ratio = 1.0
+    cdef floating norm_w2 = 0.
+
     if floating is double:
         dtype = np.float64
     else:
@@ -369,15 +377,15 @@ cpdef int PN_logreg(
 
         else:
             # rescale aux to create dual point
-            norm_Xaux = dnorm_l1(
-                is_sparse, aux, X, X_data, X_indices, X_indptr,
-                notin_WS, X_mean, weights_pen, center, 0)
+            norm_Xaux = dnorm_enet(
+                is_sparse, aux, w, X, X_data, X_indices, X_indptr,
+                notin_WS, X_mean, weights_pen, center, 0, alpha, l1_ratio)
 
         for i in range(n_samples):
             aux[i] /= max(1, norm_Xaux / alpha)
 
-        d_obj = dual(LOGREG, n_samples, 0, &aux[0], &y[0])
-        p_obj = primal(LOGREG, alpha, Xw, y, w, weights_pen)
+        d_obj = dual(LOGREG, n_samples, alpha, l1_ratio, 0., norm_w2, &aux[0], &y[0])
+        p_obj = primal(LOGREG, alpha, l1_ratio, Xw, y, w, weights_pen)
 
         gap = p_obj - d_obj
         if verbose_in:

diff --git a/celer/__init__.py b/celer/__init__.py
@@ -1,8 +1,10 @@
 """Celer algorithm to solve L1-type regularized problems."""
 
 from .homotopy import celer_path
-from .dropin_sklearn import (Lasso, LassoCV, LogisticRegression, GroupLasso,
-                             GroupLassoCV, MultiTaskLasso, MultiTaskLassoCV)
+from .dropin_sklearn import (ElasticNet, ElasticNetCV,
+                             GroupLasso, GroupLassoCV,
+                             Lasso, LassoCV, LogisticRegression,
+                             MultiTaskLasso, MultiTaskLassoCV)
 
 
 __version__ = '0.7dev'
diff --git a/celer/cython_utils.pxd b/celer/cython_utils.pxd
@@ -8,8 +8,10 @@ cdef int LOGREG
 
 cdef floating ST(floating, floating) nogil
 
-cdef floating dual(int, int, floating, floating *, floating *) nogil
-cdef floating primal(int, floating, floating[:], floating [:],
+cdef floating fweighted_norm_w2(floating[:], floating[:]) nogil
+
+cdef floating dual(int, int, floating, floating, floating, floating, floating *, floating *) nogil
+cdef floating primal(int, floating, floating, floating[:], floating [:],
                      floating [:], floating[:]) nogil
 cdef void create_dual_pt(int, int, floating *, floating *, floating *) nogil
 
@@ -27,7 +29,7 @@ cdef void fposv(char *, int *, int *, floating *,
                 int *, floating *, int *, int *) nogil
 
 cdef int create_accel_pt(
-    int, int, int, int, floating, floating *, floating *,
+    int, int, int, int, floating *, floating *,
     floating *, floating[:, :], floating[:, :], floating[:], floating[:])
 
 
@@ -42,11 +44,11 @@ cpdef void compute_norms_X_col(
     floating[:], int[:], int[:], floating[:])
 
 
-cpdef floating dnorm_l1(
-        bint, floating[:], floating[::1, :], floating[:],
-        int[:], int[:], int[:], floating[:], floating[:], bint, bint) nogil
+cpdef floating dnorm_enet(
+        bint, floating[:], floating[:], floating[::1, :], floating[:],
+        int[:], int[:], int[:], floating[:], floating[:], bint, bint, floating, floating) nogil
 
 
 cdef void set_prios(
-    bint, floating[:], floating, floating[::1, :], floating[:], int[:],
+    bint, floating[:], floating[:], floating, floating, floating[::1, :], floating[:], int[:],
     int[:], floating[:], floating[:], floating[:], int[:], floating, int *, bint) nogil
diff --git a/celer/cython_utils.pyx b/celer/cython_utils.pyx
@@ -109,6 +109,20 @@ cdef inline floating Nh(floating x) nogil:
         return INFINITY  # not - INFINITY
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef floating fweighted_norm_w2(floating[:] w, floating[:] weights) nogil: 
+    cdef floating weighted_norm = 0.
+    cdef int n_features = w.shape[0]
+    cdef int j
+
+    for j in range(n_features):
+        if weights[j] == INFINITY:
+            continue
+        weighted_norm += weights[j] * w[j] ** 2
+    return weighted_norm
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
@@ -141,7 +155,7 @@ cdef floating primal_logreg(
 @cython.wraparound(False)
 @cython.cdivision(True)
 cdef floating primal_lasso(
-        floating alpha, floating[:] R, floating[:] w,
+        floating alpha, floating l1_ratio, floating[:] R, floating[:] w,
         floating[:] weights) nogil:
     cdef int n_samples = R.shape[0]
     cdef int n_features = w.shape[0]
@@ -152,24 +166,27 @@ cdef floating primal_lasso(
     for j in range(n_features):
         # avoid nan when weights[j] is INFINITY
         if w[j]:
-            p_obj += alpha * weights[j] * fabs(w[j])
+            p_obj += alpha * weights[j] * (
+                     l1_ratio * fabs(w[j]) +
+                     0.5 * (1. - l1_ratio) * w[j] ** 2)
     return p_obj
 
 
 cdef floating primal(
-    int pb, floating alpha, floating[:] R, floating[:] y,
+    int pb, floating alpha, floating l1_ratio, floating[:] R, floating[:] y,
     floating[:] w, floating[:] weights) nogil:
     if pb == LASSO:
-        return primal_lasso(alpha, R, w, weights)
+        return primal_lasso(alpha, l1_ratio, R, w, weights)
     else:
         return primal_logreg(alpha, R, y, w, weights)
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-cdef floating dual_lasso(int n_samples, floating norm_y2,
-                         floating * theta, floating * y) nogil:
+cdef floating dual_enet(int n_samples, floating alpha, floating l1_ratio,
+                         floating norm_y2, floating norm_w2, floating * theta,
+                         floating * y) nogil:
     """Theta must be feasible"""
     cdef int i
     cdef floating d_obj = 0.
@@ -178,6 +195,8 @@ cdef floating dual_lasso(int n_samples, floating norm_y2,
         d_obj -= (y[i] - n_samples * theta[i]) ** 2
     d_obj *= 0.5 / n_samples
     d_obj += norm_y2 / (2. * n_samples)
+    if l1_ratio != 1.0:
+        d_obj -= 0.5 * alpha * (1 - l1_ratio) * norm_w2
     return d_obj
 
 
@@ -195,11 +214,10 @@ cdef floating dual_logreg(int n_samples, floating * theta,
     return d_obj
 
 
-cdef floating dual(int pb, int n_samples, floating norm_y2,
-                   floating * theta, floating * y) nogil:
-
+cdef floating dual(int pb, int n_samples, floating alpha, floating l1_ratio,
+                   floating norm_y2, floating norm_w2, floating * theta, floating * y) nogil:
     if pb == LASSO:
-        return dual_lasso(n_samples, norm_y2, &theta[0], &y[0])
+        return dual_enet(n_samples, alpha, l1_ratio, norm_y2, norm_w2, &theta[0], &y[0])
     else:
         return dual_logreg(n_samples, &theta[0], &y[0])
 
@@ -226,7 +244,6 @@ cdef void create_dual_pt(
 @cython.cdivision(True)
 cdef int create_accel_pt(
     int pb, int n_samples, int epoch, int gap_freq,
-    floating alpha,
     floating * R, floating * out, floating * last_K_R, floating[:, :] U,
     floating[:, :] UtU, floating[:] onesK, floating[:] y):
 
@@ -365,11 +382,11 @@ cpdef void compute_Xw(
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-cpdef floating dnorm_l1(
-        bint is_sparse, floating[:] theta, floating[::1, :] X,
+cpdef floating dnorm_enet(
+        bint is_sparse, floating[:] theta, floating[:] w, floating[::1, :] X,
         floating[:] X_data, int[:] X_indices, int[:] X_indptr, int[:] skip,
         floating[:] X_mean, floating[:] weights, bint center,
-        bint positive) nogil:
+        bint positive, floating alpha, floating l1_ratio) nogil:
     """compute norm(X[:, ~skip].T.dot(theta), ord=inf)"""
     cdef int n_samples = theta.shape[0]
     cdef int n_features = skip.shape[0]
@@ -399,6 +416,10 @@ cpdef floating dnorm_l1(
         else:
             Xj_theta = fdot(&n_samples, &theta[0], &inc, &X[0, j], &inc)
 
+        # minus sign to consider the choice theta = y - Xw and not theta = Xw -y
+        if l1_ratio != 1:
+            Xj_theta -= alpha * (1 - l1_ratio) * weights[j] * w[j]
+
         if not positive:
             Xj_theta = fabs(Xj_theta)
         scal = max(scal, Xj_theta / weights[j])
@@ -409,14 +430,15 @@ cpdef floating dnorm_l1(
 @cython.wraparound(False)
 @cython.cdivision(True)
 cdef void set_prios(
-    bint is_sparse, floating[:] theta, floating alpha,
+    bint is_sparse, floating[:] theta, floating[:] w, floating alpha, floating l1_ratio,
     floating[::1, :] X, floating[:] X_data, int[:] X_indices, int[:] X_indptr,
     floating[:] norms_X_col, floating[:] weights, floating[:] prios,
     int[:] screened, floating radius, int * n_screened, bint positive) nogil:
     cdef int i, j, startptr, endptr
     cdef floating Xj_theta
     cdef int n_samples = theta.shape[0]
     cdef int n_features = prios.shape[0]
+    cdef floating norms_X_col_j = 0.
 
     # TODO we do not substract theta_sum, which seems to indicate that theta
     # is always centered...
@@ -433,11 +455,17 @@ cdef void set_prios(
         else:
             Xj_theta = fdot(&n_samples, &theta[0], &inc, &X[0, j], &inc)
 
+        norms_X_col_j = norms_X_col[j]
+        if l1_ratio != 1:
+            Xj_theta -= alpha * (1 - l1_ratio) * weights[j] * w[j] 
+
+            norms_X_col_j = norms_X_col_j ** 2 
+            norms_X_col_j += sqrt(norms_X_col_j + alpha * (1 - l1_ratio) * weights[j])
 
         if positive:
-            prios[j] = fabs(Xj_theta - alpha * weights[j]) / norms_X_col[j]
+            prios[j] = fabs(Xj_theta - alpha * l1_ratio * weights[j]) / norms_X_col_j
         else:
-            prios[j] = (alpha * weights[j] - fabs(Xj_theta)) / norms_X_col[j]
+            prios[j] = (alpha * l1_ratio * weights[j] - fabs(Xj_theta)) / norms_X_col_j
 
         if prios[j] > radius:
             screened[j] = True