In [1]:
from math import fabs
import numpy as np
from numpy.linalg import norm
from sklearn.datasets import make_regression

In [2]:
SAMPLES = 100
DIVERGENCE_VALUE = 10
MAX_ITERATIONS = 10000
STOP_THRESHOLD = 0.0001
DIMENSIONS = 7
LAMBDA = 1
MU = 0.01

In [3]:
np.random.seed(0)

In [4]:
(X, y, coef) = make_regression(n_samples = SAMPLES, 
                               n_features = DIMENSIONS - 1, 
                               n_informative = DIMENSIONS - 1, 
                               effective_rank = 2,
                               n_targets = 1, 
                               coef = True,
                               bias = 3,
                               tail_strength = 0)

In [5]:
coef

array([47.93845494, 60.57119573, 63.74622774, 72.78881584, 81.19385617,
       11.56618719])

$\displaystyle \min_{\beta}\frac{1}{n}||y-X\beta||^2_{2}+\lambda||\beta||_1 + \mu||\beta||_2^2$

$L=\frac{1}{n}\displaystyle\sum_{i=1}^{n}\left(y_i - \left(\beta_0 + \displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right)\right)^2 + \lambda\displaystyle\sum_{j=0}^{6}|\beta_j| + \mu\displaystyle\sum_{j=0}^{6} \beta_j^2$

$\frac{\partial L}{\partial \beta_0} = -\frac{2}{n}\displaystyle\sum_{i=1}^{n}\left(y_i - \beta_0 -\displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right) + \lambda\frac{\beta_0}{|\beta_0|} + 2\mu\beta_0$

And for $\beta_{k\neq 0}$:

$\frac{\partial L}{\partial \beta_{k}} = -\frac{2}{n}\displaystyle\sum_{i=1}^{n}x_{ik}\left(y_i - \beta_0 -\displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right) + \lambda\frac{\beta_k}{|\beta_k|} + 2\mu\beta_k$

In [6]:
SAMPLES = 100
K = 6

In [7]:
np.random.seed(0)

In [8]:
(X, y, coef) = make_regression(n_samples = SAMPLES, 
                               n_features = 6, 
                               n_informative = 6, 
                               effective_rank = 2,
                               n_targets = 1, 
                               coef = True,
                               bias = 3,
                               tail_strength = 0)

In [9]:
X = np.hstack((np.ones((SAMPLES, 1)), X))

In [10]:
# Auxiliary function for backtracking line search
def line_search(current_params, gradient, beta):
    current_params = np.array(current_params)
    
    def _loss(params):
        y_pred = np.matmul(X, np.array(current_params))
        return np.mean(np.power(y - y_pred, 2), axis = 0)
        
    t = 1.0
    while _loss(current_params - t * gradient) > _loss(current_params) - \
                                                 t/2.0 * norm(gradient, ord = 2) ** 2:
        t = t * beta
        
    return t

In [11]:
# Auxiliary function to calculate gradient
# l = lambda
# m = mu
def calculate_gradient(x, y, l, m, current_params):
    db = np.zeros(7)    
    
    # Common term
    common = (y - 
              current_params[0] - 
              np.sum(np.multiply(np.tile(current_params[1:], (SAMPLES, 1)), x[:, 1:]), axis=1))
    
    # Function for the regularisation factor
    def regularisation(param, l, m):
        return l * param / fabs(param) + 2 * m * param
    
    # db_0
    db[0] = - 2 / float(SAMPLES) * np.sum(common) + regularisation(current_params[0], l, m)

    # db_k, k != 0
    for k in range(1, 7):
        db[k] = -2 / float(SAMPLES) * np.sum(np.multiply(x[:, k], common)) + regularisation(current_params[k], l, m)
    return db

In [12]:
calculate_gradient(X, y, 1, 0.01, [1, 2, 3, 4, 5, 6, 7])

array([-2.90698589,  0.86882003,  0.87470091,  1.01822005,  1.388318  ,
        0.79615466,  0.80103953])

In [13]:
# We shouldn't initialise to zero to prevent divisions by zero
# when computing gradients
current_params = [1, 1, 1, 1, 1, 1, 1]

In [14]:
# Coordinate gradient descent implementation
# Cycling through the dimensions
w = [np.array(current_params[:])]
it = 0
while it == 0 or (np.sum(np.abs(np.array(current_params) - np.array(w[-1]))) > STOP_THRESHOLD and \
                  np.all(np.array(current_params) - np.array(w[-1]) < DIVERGENCE_VALUE) and \
                  it < MAX_ITERATIONS):
    w.append(current_params)
    # Select the gradient for the next dimension
    d = it % DIMENSIONS
    g = np.zeros(DIMENSIONS)
    g[d] = calculate_gradient(X, y, LAMBDA, MU, current_params)[d]
    # Linear search in the direction of the selected dimension
    t = line_search(current_params, g, 0.9)
    # Update parameters
    current_params = list(np.array(w[-1]) - t * np.array(g))[:]
    print(current_params)
    # Wrapping up the iteration
    it = it + 1
w.append(current_params)

[1.000000000000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
