In [0]:
# ---
# main libraries
# ---
import time
import random
import numpy as np
# ---
# display libraries
# ---
from tqdm import tqdm_notebook as tqdm
import plotly.graph_objects as go

# Studies of gradient descent optimization in a big data context.

Our main optimization problem is :

$$ min_{W \in \mathbb{R^d}} \frac{1}{2} \|XW - Y\|^2 + \lambda R(W) $$

where :

* $X \in \mathbb{R^{pd}} \qquad \text{the matrix containing the coefficients of the equations system}$
* $Y \in \mathbb{R^p} \qquad \text{the target vector}$
* $d \qquad\text{the data dimension (big there!)}$
* $R \qquad \text{regularization term}$
* $\lambda \qquad \text{the term to manage regularization importance}$

## Generate gaussian data

* Generative function + plot

In [0]:
def gaussian_data(n_samples, n_features, variance):
    W = np.random.random((n_features, 1))
    X = np.random.normal(0, variance, (n_samples, n_features))
    Y = X @ W

    # let's add some noise to our labels
    # Y += np.random.random(Y.shape)

    return X, Y, W

# ---
# generate big data !
# ---
N_SAMPLES, N_FEATURES, VARIANCE = 10_000, 11, 0.1
X, Y, W = gaussian_data(N_SAMPLES, N_FEATURES, VARIANCE)

# ---
# let's visualize our data (only 1 by one feature can be explored) in 2D
# ---
fig = go.Figure(
    data=[
        # markers
        go.Scatter(
            x=X[:, 0].squeeze(),
            y=Y.squeeze(),
            mode='markers',
            name=f'gaussian data [{N_SAMPLES}, {VARIANCE}]'
        )
    ],
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

# A method for stochastic optimization : Adam

* https://arxiv.org/pdf/1412.6980.pdf

In [0]:
# ---
# Algorithm 1: Adam.
# ---
# Require: α: Stepsize
# Require: β1, β2 ∈ [0, 1): Exponential decay rates for the moment estimates
# Require: f(θ): Stochastic objective function with parameters θ
# Require: θ0: Initial parameter vector
# m0 ← 0 (Initialize 1st moment vector)
# v0 ← 0 (Initialize 2nd moment vector)
# t ← 0 (Initialize timestep)
# while θt not converged do
# t ← t + 1
# gt ← ∇θft(θt−1) (Get gradients w.r.t. stochastic objective at timestep t)
# mt ← β1 · mt−1 + (1 − β1) · gt (Update biased first moment estimate)
# vt ← β2 · vt−1 + (1 − β2) · g_2_t (Update biased second raw moment estimate)
# mb t ← mt/(1 − β_t_1) (Compute bias-corrected first moment estimate)
# vbt ← vt/(1 − β_t_2) (Compute bias-corrected second raw moment estimate)
# θt ← θt−1 − α · mb t/(√vbt + eps) (Update parameters)
# end while
# return θt (Resulting parameters)

class MyAdam:
    def __init__(self,
                 learning_rate=1e-03,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-07,
                 verbose=True):
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.verbose = verbose

    @staticmethod
    def gradient_function(X, Y, W, regularization, alpha, l1_ratio):
        if regularization is None:
            return 2 * X.T @ (X @ W - Y)

        if regularization is 'l2':
            return 2 * X.T @ (X @ W - Y) + alpha * W

        # ---
        # define the huber function as a lambda
        # ---
        f_huber = lambda w, alpha: w if abs(w) < alpha else alpha * np.sign(w)
        f_huber = np.vectorize(f_huber)

        if regularization is 'l1':
            return 2 * X.T @ (X @ W - Y) + alpha * f_huber(W, alpha)
        
        if regularization is 'elasticnet':
            # `l1_ratio=1` means it's the same as `regularization=l1`
            l1_reg = l1_ratio * alpha * f_huber(W, alpha)
            l2_reg = (1 - l1_ratio) * alpha * W

            return 2 * X.T @ (X @ W - Y) + l1_reg + l2_reg
        
        raise ValueError(f"Regularization `{regularization}` isn't yet implemented. Please choose between [l1, l2, elasticnet].")

    def fit(self, X, Y,
            regularization=None,
            alpha=None,
            l1_ratio=0.5,
            batch_size=64,
            n_epochs=5_000,
            stop_criterion=1e-05):
        
        start_time = time.time()

        # momentum vectors initialisation
        m = np.zeros(X.shape[1])
        v = np.zeros(X.shape[1])

        # `weights` holds the vector we are trying to optimize
        weights = np.zeros((X.shape[1], 1))

        # list to hold Mean Squared Error through the algorithm journey
        mse = []
        # mse.append(float('inf'))

        # iterate till convergence
        for epoch in tqdm(range(n_epochs)):
            # getting next batch
            idx = np.random.choice(X.shape[0], batch_size)
            batch_x = X[idx]
            batch_y = Y[idx]

            # calculate gradient with or without regularization
            g = self.gradient_function(batch_x, batch_y, weights,
                                       regularization, alpha, l1_ratio)

            # let's do the maths !
            m = self.beta_1 * m + (1 - self.beta_1) * g
            v = self.beta_2 * v + (1 - self.beta_2) * g ** 2
            m_bias_corr = m / (1 - self.beta_1 ** (epoch+1))
            v_bias_corr = v / (1 - self.beta_2 ** (epoch+1))

            # update weights with gradient descent
            weights = weights - self.learning_rate * (m_bias_corr / (np.sqrt(v_bias_corr) + self.epsilon))

            # calculate the actual distance to labels
            mse.append(np.linalg.norm(X @ weights - Y))

            # print progression
            # if self.verbose and epoch % 100 == 0:
            #    print(f'MSE distance {mse[-1]:6f}')

            # did we found an acceptable solution ?
            if mse[-1] <= stop_criterion:
                print(f'Found an acceptable solution at epoch {epoch+1}')
                break
            
            """
            removed these methods to do a benchmark of all methods !
            ---
            # did we met a local minimum ?
            if mse[-1] >= mse[-2]:
                print('-' * 20)
                print(f'Did not converge to optimal solution.')
                print(f'Solution found in {time.time() - start_time} at epoch {epoch}')
                print(f'MSE distance {mse[-1]}')
                print('-' * 20)
                break
            """
        
        if self.verbose:
            print(f'Execution time : {time.time() - start_time} seconds')
            print(f'MSE distance   : {mse[-1]}')
            print('-' * 20)

        return weights, mse, time.time() - start_time

We'll do benchmark of different executions for a big matrix.

* Raw gradient descent with Adam (our base).
* Stochastic (batch) gradient descent with Adam.
* Regularization with Huber function (l1).
* Tikhonov regularization (l2).

In [0]:
# ---
# compute results
# ---
my_adam = MyAdam(learning_rate=1e-03,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-07,
                 verbose=True)

names = [
    f'batch_size={N_SAMPLES}',
    'batch_size=64',
    'batch_size=64, regularization=l1, alpha=0.01',
    'batch_size=64, regularization=l2, alpha=0.01',
    'batch_size=64, regularization=elasticnet, alpha=0.01, l1_ratio=0.5'
]

results = [
    my_adam.fit(X, Y, batch_size=N_SAMPLES, regularization=None),
    my_adam.fit(X, Y, regularization=None),
    my_adam.fit(X, Y, regularization='l1', alpha=0.01),
    my_adam.fit(X, Y, regularization='l2', alpha=0.01),
    my_adam.fit(X, Y, regularization='elasticnet', alpha=0.01, l1_ratio=0.5),
]

# ---
# plot results
# ---
traces = []
for name, result in zip(names, results):
    traces.append(
        go.Scatter(
            y=result[1],
            mode='lines',
            name=f'{name} --- exec_time={round(result[2], 5)} seconds'
        )
    )

fig = go.Figure(
    data=traces,
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3450
Execution time : 13.202727317810059 seconds
MSE distance   : 9.981131635029921e-06
--------------------


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3542
Execution time : 4.053577661514282 seconds
MSE distance   : 9.925784849972477e-06
--------------------


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Execution time : 8.098379611968994 seconds
MSE distance   : 0.008695183593532085
--------------------


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Execution time : 5.893715858459473 seconds
MSE distance   : 0.5458764558162387
--------------------


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Execution time : 8.131827592849731 seconds
MSE distance   : 0.25091972460211653
--------------------


* Let's check if our results are good with cross-validation
    * To be faster, we'll only try the fastest method, which is Adam without regularization with `batch_size=64`

In [0]:
my_adam = MyAdam(learning_rate=1e-03,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-07,
                 verbose=False)

n_times = 10

mse = np.zeros(n_times)
exec_time = np.zeros(n_times)

for i in range(n_times):
    result = my_adam.fit(X, Y, regularization=None)
    mse[i] = result[1][-1]
    exec_time[i] = result[2]

print(f'Mean MSE distance   : {mse.mean()}')
print(f'Mean execution time : {exec_time.mean()}')

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3559


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3569


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3528


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3515


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3562


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3570


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3543


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3561


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3552


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Found an acceptable solution at epoch 3534
Mean MSE distance   : 9.953520303237026e-06
Mean execution time : 4.124321818351746


# Efficient greedy coordinate descent for composite problems

* https://arxiv.org/pdf/1810.06999v1.pdf

* I will implement GS-s coordinate decision rule because I found it easier.


In [0]:
# ---
# Algorithm 1: L1 Steepest Coordinate Descent
# ---
# Initialize: α0 := 0 ∈ Rn.
# for t = 0, 1, . . . , until convergence do
#   Select coordinate it as in GS-s, GS-r, or GS-q.
#   Find α+_it via gradient (10) or line-search (11).
#   Compute α(t+1)_it as in (12).
# end for

class MySCD:
    def __init__(self,
                lr=1e-03,
                epsilon=0.001,
                verbose=True):
        self.lr = lr
        self.epsilon = epsilon
        self.verbose = verbose

    def fit(self, X, Y,
            delta=0.01,
            n_epochs=5_000,
            stop_criterion=1e-05):
        
        start_time = time.time()

        # `weights` holds the vector we are trying to optimize
        weights = np.zeros(X.shape[1])
        
        # list to hold Mean Squared Error through the algorithm journey
        mse = []
        
        # iterate till convergence
        for epoch in tqdm(range(n_epochs)):
            # do the maths !
            idx = self.gs_s(X, Y, weights, delta)
            grad = 2 * X[:, idx].T @ (X[:, idx] * weights[idx] - Y.squeeze())
            weights_plus = self.S(weights[idx] - self.lr * grad, delta * self.lr)
            weights[idx] = weights_plus if weights[idx] * weights_plus >= 0 else 0

            # calculate the actual distance to labels
            mse.append(np.linalg.norm(X @ weights - Y))

            # print progression
            # if self.verbose and epoch % 100 == 0:
            #    print(f'Epoch {epoch:6d} | MSE distance {mse[-1]:6f}')

            # did we found an acceptable solution ?
            if mse[-1] <= stop_criterion:
                print(f'Found an acceptable solution at epoch {epoch+1}')
                break
        
        print(f'Execution time : {time.time() - start_time} seconds')
        print(f'MSE distance   : {mse[-1]}')
        print('-' * 20)

        return weights, mse, time.time() - start_time
        
    def gs_s(self, X, Y, weights, delta):
        best_idx = 0
        best_val = -1
        
        for current_idx in range(X.shape[1]):
            current_val = np.abs(self.s(X, Y, weights, current_idx, delta))

            if current_val > best_val:
                best_val = current_val
                best_idx = current_idx
        
        return best_idx
    
    def S(self, w, delta):
        #f = lambda w, delta: w - delta * np.sign(w) if np.abs(w) >= delta else 0
        #f = np.vectorize(f)
        #return f(W, delta)
        return w - delta * np.sign(w) if np.abs(w) >= delta else 0
    
    def s(self, X, Y, weights, idx, delta):
        grad = 2 * X[:, idx].T @ (X[:, idx] * weights[idx] - Y.squeeze())

        if weights[idx] == 0:
            return self.S(grad, delta) 
        else:
            return grad + delta * np.sign(weights[idx])

* Run the algorithm and plot MSE distance through the journey !

In [0]:
# ---
# compute results
# ---

my_scd = MySCD(lr=1e-03, epsilon=1e-07, verbose=True)

names = [
    'delta=0.01'
]

results = [
    my_scd.fit(X, Y, delta=0.01, n_epochs=100)
]

# ---
# plot results
# ---
traces = []
for name, result in zip(names, results):
    traces.append(
        go.Scatter(
            y=result[1],
            mode='lines',
            name=f'{name} --- exec_time={round(result[2], 5)} seconds'
        )
    )

fig = go.Figure(
    data=traces,
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

HBox(children=(IntProgress(value=0), HTML(value='')))

Execution time : 27.012022256851196 seconds
MSE distance   : 2622.457318103276
--------------------


## Conclusions

* On doit choisir les paramètres `alpha` de chaque régularisation par validation croisée pour chaque jeu de données.

* Adam est particulièrement intéressant dans ce cas d'usage. On s'apperçoit que les courbes se superposent au niveau des performances, ce qui offre un gain de temps considérable, itérer sur un batch de 64 données donne les mêmes résultats qu'itérer sur la totalité du jeu de données (10_000 ici).

* Les régularisations n'apportent pas grand chose au niveau des performances (j'ai utilisé un `alpha=0.01`) et alonge le temps de calcul. L'avantage de ces régularisations réside dans le fait qu'elles 'fixent' les coefficients de la régression pour qu'ils soient distribués de façon moins anarchique.

* L'algorithme de descente de gradient par coordonnées n'offre pas les mêmes résultats (mon implémentation est bancale).