In [1]:
from math import fabs
import numpy as np
from numpy.linalg import norm
from sklearn.datasets import make_regression

In [2]:
SAMPLES = 100
DIVERGENCE_VALUE = 10
MAX_ITERATIONS = 10000
STOP_THRESHOLD = 0.0001
DIMENSIONS = 7
LAMBDA = 1
MU = 0.01

In [3]:
np.random.seed(0)

In [4]:
(X, y, coef) = make_regression(n_samples = SAMPLES, 
                               n_features = DIMENSIONS - 1, 
                               n_informative = DIMENSIONS - 1, 
                               effective_rank = 2,
                               n_targets = 1, 
                               coef = True,
                               bias = 3,
                               tail_strength = 0)

In [5]:
coef

array([47.93845494, 60.57119573, 63.74622774, 72.78881584, 81.19385617,
       11.56618719])

$\displaystyle \min_{\beta}\frac{1}{n}||y-X\beta||^2_{2}+\lambda||\beta||_1 + \mu||\beta||_2^2$

$L=\frac{1}{n}\displaystyle\sum_{i=1}^{n}\left(y_i - \left(\beta_0 + \displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right)\right)^2 + \lambda\displaystyle\sum_{j=0}^{6}|\beta_j| + \mu\displaystyle\sum_{j=0}^{6} \beta_j^2$

$\frac{\partial L}{\partial \beta_0} = -\frac{2}{n}\displaystyle\sum_{i=1}^{n}\left(y_i - \beta_0 -\displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right) + \lambda\frac{\beta_0}{|\beta_0|} + 2\mu\beta_0$

And for $\beta_{k\neq 0}$:

$\frac{\partial L}{\partial \beta_{k}} = -\frac{2}{n}\displaystyle\sum_{i=1}^{n}x_{ik}\left(y_i - \beta_0 -\displaystyle\sum_{j=1}^{6} \beta_j x_{ij}\right) + \lambda\frac{\beta_k}{|\beta_k|} + 2\mu\beta_k$

In [6]:
np.random.seed(0)

In [7]:
(X, y, coef) = make_regression(n_samples = SAMPLES, 
                               n_features = 6, 
                               n_informative = 6, 
                               effective_rank = 2,
                               n_targets = 1, 
                               coef = True,
                               bias = 3,
                               tail_strength = 0)

In [8]:
X = np.hstack((np.ones((SAMPLES, 1)), X))

In [9]:
# Auxiliary function to calculate gradient
# l = lambda
# m = mu
def calculate_gradient(x, y, l, m, current_params):
    db = np.zeros(DIMENSIONS)    
    
    # Common term
    common = (y - 
              current_params[0] - 
              np.sum(np.multiply(np.tile(current_params[1:], (SAMPLES, 1)), x[:, 1:]), axis=1))
    
    # Function for the regularisation factor
    def regularisation(param, l, m):
        return l * param / fabs(param) + 2 * m * param
    
    # db_0
    db[0] = - 2 / float(SAMPLES) * np.sum(common) + regularisation(current_params[0], l, m)

    # db_k, k != 0
    for k in range(1, DIMENSIONS):
        db[k] = -2 / float(SAMPLES) * np.sum(np.multiply(x[:, k], common)) + regularisation(current_params[k], l, m)
    return db

In [10]:
# Auxiliary function for backtracking line search
def line_search(current_params, gradient, beta, l, m):
    current_params = np.array(current_params)
    
    def _loss(params, l, m):
        y_pred = np.matmul(X, np.array(current_params))
        L = np.mean(np.power(y - y_pred, 2), axis = 0)
        L = L + l * np.sum(np.fabs(params)) + m * np.sum(np.power(params, 2))
        return L
        
    t = 1.0
    while _loss(current_params - t * gradient, l, m) > _loss(current_params, l, m) - \
                                                        t/2.0 * np.matmul(gradient.T, gradient):
        print((_loss(current_params - t * gradient, l, m), _loss(current_params, l, m) - t/2.0 * norm(gradient, ord = 2) ** 2), t)
        t = t * beta
        
    return t

In [11]:
calculate_gradient(X, y, 1, 0.01, [1, 2, 3, 4, 5, 6, 7])

array([-2.90698589,  0.86882003,  0.87470091,  1.01822005,  1.388318  ,
        0.79615466,  0.80103953])

In [12]:
# We shouldn't initialise to zero to prevent divisions by zero
# when computing gradients
current_params = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

In [13]:
# Coordinate gradient descent implementation
# Cycling through the dimensions
w = [np.array(current_params[:])]
it = 0
while it == 0 or (np.sum(np.abs(np.array(current_params) - np.array(w[-1]))) > STOP_THRESHOLD and \
                  np.all(np.array(current_params) - np.array(w[-1]) < DIVERGENCE_VALUE) and \
                  it < MAX_ITERATIONS):
    w.append(current_params)
    # Select the gradient for the next dimension
    d = it % DIMENSIONS
    g = np.zeros(DIMENSIONS)
    g[d] = calculate_gradient(X, y, LAMBDA, MU, current_params)[d]
    # Linear search in the direction of the selected dimension
    t = line_search(current_params, g, 0.9, LAMBDA, MU)
    print(g)
    print(t)
    # Update parameters
    current_params = list(np.array(w[-1]) - t * np.array(g))[:]
    print(current_params)
    print('---------------')
    # Wrapping up the iteration
    it = it + 1
w.append(current_params)

(32.38025789465598, 16.56687633650281) 1.0
(31.870651704369617, 17.658253083477298) 0.9
(31.415738641586533, 18.640492155754337) 0.81
(31.009340216946235, 19.52450732080367) 0.7290000000000001
(30.64603053358018, 20.320120969348075) 0.6561000000000001
(30.32103542658701, 21.036173253038037) 0.5904900000000002
(30.030146552802545, 21.680620308359003) 0.5314410000000002
(29.769648011629123, 22.26062265814787) 0.47829690000000014
(29.536253495211454, 22.782624772957853) 0.43046721000000016
(29.32705230865266, 23.252426676286838) 0.38742048900000015
(29.139462882105608, 23.675248389282924) 0.34867844010000015
(28.971192627711506, 24.0557879309794) 0.31381059609000017
(28.820203184650392, 24.39827351850623) 0.28242953648100017
(28.68468025246919, 24.706510547280377) 0.25418658283290013
(28.563007342430883, 24.98392387317711) 0.22876792454961012
(28.453742883825477, 25.233595866484166) 0.2058911320946491
(28.35560021102816, 25.45830066046052) 0.1853020188851842
(28.267430030868088, 25.660534

In [14]:
y

array([ 3.23107784e+00,  5.57248637e+00,  3.98049002e+00,  4.65304810e+00,
        9.73648501e+00,  9.03136498e-03, -1.17493866e+00,  6.77555529e+00,
        3.83997861e+00, -1.29486692e+00,  1.22982188e+00, -2.71529988e-01,
        3.76706916e+00, -2.83126721e+00,  1.25123898e+00,  4.64621926e+00,
        2.97776653e-01,  7.02508335e+00,  6.28964156e-01,  7.42215544e-01,
       -6.06038611e+00,  9.67766369e+00, -3.79968777e+00, -2.81180075e+00,
       -1.30220493e+00,  1.44606617e+01,  5.81662544e+00, -1.12786472e+00,
        1.69793584e+00,  3.23562676e+00, -3.81989179e+00,  7.24090437e+00,
        3.16259494e+00,  1.38047507e+01, -5.28828147e-01,  1.54552823e+00,
        8.80122856e+00,  2.39490138e+00,  4.20068365e+00,  3.22401915e+00,
       -9.41208139e-01,  4.76850397e+00,  1.13055370e-01, -1.74108615e+00,
        3.14439394e+00,  4.53824982e+00,  1.49203288e+00,  1.03037261e+01,
        4.91960921e+00, -4.32038498e+00, -4.16031400e+00,  5.79678577e+00,
        2.26707086e+00,  

In [15]:
np.matmul(X, [3] + list(coef))

array([ 3.23107784e+00,  5.57248637e+00,  3.98049002e+00,  4.65304810e+00,
        9.73648501e+00,  9.03136498e-03, -1.17493866e+00,  6.77555529e+00,
        3.83997861e+00, -1.29486692e+00,  1.22982188e+00, -2.71529988e-01,
        3.76706916e+00, -2.83126721e+00,  1.25123898e+00,  4.64621926e+00,
        2.97776653e-01,  7.02508335e+00,  6.28964156e-01,  7.42215544e-01,
       -6.06038611e+00,  9.67766369e+00, -3.79968777e+00, -2.81180075e+00,
       -1.30220493e+00,  1.44606617e+01,  5.81662544e+00, -1.12786472e+00,
        1.69793584e+00,  3.23562676e+00, -3.81989179e+00,  7.24090437e+00,
        3.16259494e+00,  1.38047507e+01, -5.28828147e-01,  1.54552823e+00,
        8.80122856e+00,  2.39490138e+00,  4.20068365e+00,  3.22401915e+00,
       -9.41208139e-01,  4.76850397e+00,  1.13055370e-01, -1.74108615e+00,
        3.14439394e+00,  4.53824982e+00,  1.49203288e+00,  1.03037261e+01,
        4.91960921e+00, -4.32038498e+00, -4.16031400e+00,  5.79678577e+00,
        2.26707086e+00,  

In [16]:
np.matmul(X, current_params)

array([0.09801332, 0.10509224, 0.10243451, 0.10269071, 0.11504197,
       0.09381996, 0.08891944, 0.11102937, 0.10155333, 0.0893784 ,
       0.0958946 , 0.09292256, 0.10088765, 0.08447396, 0.09657415,
       0.10486563, 0.09507919, 0.11071559, 0.09165156, 0.09401253,
       0.07666386, 0.11767071, 0.08553818, 0.08687082, 0.0916072 ,
       0.12766189, 0.10490275, 0.08891647, 0.09671899, 0.10117669,
       0.08369181, 0.11036652, 0.10073351, 0.12713829, 0.09153164,
       0.09470935, 0.11398235, 0.10137776, 0.1023353 , 0.09894002,
       0.08932772, 0.10593548, 0.09583878, 0.08854993, 0.10103604,
       0.10451343, 0.09749397, 0.11724731, 0.10329098, 0.08238581,
       0.08297386, 0.1048192 , 0.09979677, 0.11525008, 0.10209868,
       0.12624393, 0.10141795, 0.10353149, 0.08991643, 0.08187882,
       0.08818007, 0.09758354, 0.0994831 , 0.09775395, 0.12821506,
       0.09530839, 0.10764839, 0.10234   , 0.09994087, 0.09864211,
       0.10255396, 0.10766883, 0.090384  , 0.08893378, 0.11109

In [17]:
X

array([[ 1.00000000e+00,  2.53136050e-02, -7.05871533e-03,
        -2.47362825e-02,  8.95329814e-03,  9.02883088e-03,
        -3.13675812e-02],
       [ 1.00000000e+00,  1.67985995e-02,  6.09992171e-02,
        -4.37332422e-02, -1.94736823e-02,  2.66771801e-02,
         9.65437522e-03],
       [ 1.00000000e+00, -2.11169069e-02, -7.64044519e-02,
         8.26933043e-02,  7.08427722e-03,  6.64279587e-03,
         2.54460991e-02],
       [ 1.00000000e+00,  3.68546254e-02,  4.16222298e-02,
        -4.78873569e-02, -1.74137723e-02,  2.19242286e-02,
        -8.19287813e-03],
       [ 1.00000000e+00,  4.10025986e-02, -1.76455769e-02,
         4.51013144e-02, -5.12748329e-02,  7.40489870e-02,
         5.91872133e-02],
       [ 1.00000000e+00,  6.91724671e-04,  1.52283310e-01,
        -1.44441667e-01, -5.97313162e-04, -3.14592869e-02,
        -3.82771873e-02],
       [ 1.00000000e+00, -3.04781348e-02, -7.95822858e-02,
         3.48402205e-02,  5.17786554e-02, -4.12600392e-02,
        -4.6104018