# AdaGrad 
Presented during ML reading group, 2019-11-5.

Author: Ioana Plajer, ioana.plajer@unitbv.ro
    
Reviewed: Lucian Sasu

In [24]:
#%matplotlib notebook
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


print(f'Numpy version: {np.__version__}')

Numpy version: 1.16.4


# AdaGrad

The [AdaGrad paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) comes with the idea of using different learning rates for each feature. Hence, instead of:
$$w_{t+1} = w_{t} - \eta \nabla J_{w}(w_t)$$
AdaGrad comes with:
$$w_{t+1}^{(j)} = w_{t}^{(j)} - \frac{\eta}{\sqrt{\varepsilon + \sum_{\tau=1}^{t}{(g_{\tau}^{(j)})^2}}} \nabla J_{w}(w_t^{(j)})$$
where $g_{\tau}$ is the gradient of error function at iteration $\tau$, $g_{\tau}^{(j)}$ is the partial derivative of the 
error function in direction of the $j$ - th feature, at iteration $\tau$, $m$ - is the number of features, i.e. 
$$g_{\tau}^{(j)} = \nabla J_{w}(w_\tau^{(j)})$$

AdaGrad specifies the update as:
$$w_{t+1} = w_{t} - \frac{\eta}{\sqrt{\varepsilon I + diag(G_t)}} \nabla J_{w}(w_t)$$
where:
* $\eta$ is the initial learning rate (hyperparameter)
* $n$ is the number of items in (mini)batch
* $G_t = \sum\limits_{\tau=1}^t \mathbf{g}_\tau \mathbf{g}_\tau^T$
* $diag(A)$ is the diagonal form of the square matrix $A$
* $\varepsilon > 0$ is used to avoid division by 0
* $I$ is the unit matrix of size $m$
* $G_t^{(j,j)} = \sum\limits_{\tau = 1}^{t}{(g_\tau^{(j)})^2}$ is the sum of the squared partial derivatives in direction 
of the $j$ - th feature from the first iteration up to the current iteration
 


In a more detailed form, the update of the weights through AdaGrad is done by:
$$\left[\begin{array}{c} 
         w_{t+1}^{(1)}\\
         w_{t+1}^{(2)}\\
         \vdots\\
         w_{t+1}^{(m)}
         \end{array}\right] = \left[\begin{array}{c}
         w_{t}^{(1)}\\
         w_{t}^{(2)}\\
         \vdots\\
         w_{t}^{(m)}\end{array}\right] - \eta\left(\left[\begin{array}{cccc} \varepsilon & 0 & \ldots & 0\\
                                                   0 & \varepsilon & \ldots & 0\\
                                                   \vdots & \vdots & \ddots & \vdots \\
                                                   0 & 0 & \ldots & 0\end{array}\right]+
                                                   \left[\begin{array}{cccc}
                                                   G_{t}^{(1,1)} & 0 & \ldots & 0\\
                                                   0 & G_{t}^{(2,2)} & \ldots & 0\\
                                                   \vdots & \vdots & \ddots & \vdots\\
                                                   0 & 0 & \ldots & G_{t}^{(m,m)}\end{array}\right]\right)^{-1/2}
                                                    \left[\begin{array}{c}
                                                    g_t^{(1)}\\
                                                    g_t^{(2)}\\
                                                    \vdots\\
                                                    g_t^{(m)}\end{array}\right]$$
which simplifies to:
$$\left[\begin{array}{c} 
         w_{t+1}^{(1)}\\
         w_{t+1}^{(2)}\\
         \vdots\\
         w_{t+1}^{(m)}
         \end{array}\right] = \left[\begin{array}{c}
         w_{t}^{(1)}\\
         w_{t}^{(2)}\\
         \vdots\\
         w_{t}^{(m)}\end{array}\right] - \left[\begin{array}{c}
                                               \frac{\eta}{\sqrt{\varepsilon+G_{t}^{(1,1)}}}g_t^{(1)}\\
                                               \frac{\eta}{\sqrt{\varepsilon+G_{t}^{(2,2)}}}g_t^{(2)}\\
                                               \vdots\\
                                               \frac{\eta}{\sqrt{\varepsilon+G_{t}^{(m,m)}}}g_t^{(m)}
                                               \end{array}\right]$$

## Generate data

In [5]:
from scipy.sparse import random #to generate sparse data

np.random.seed(10) # for reproducibility
m_data = 100
n_data = 4 #number of features of the data
_scales = np.array([1,10, 10,1]) # play with these... 


_parameters = np.array([3, 0.5, 1, 7]) 

def gen_data(m, n, scales, parameters, add_noise=True):
    # Adagrad is designed especially for sparse data.
    # produce: X, a 2d tensor with m lines and n columns
    # and X[:, k] uniformly distributed in [-scale_k, scale_k] with the first and the last column containing sparse data 
    #(approx 75% of the elements are 0)
    #
    # To generate a sparse data matrix with m rows and n columns
    # and random values use S = random(m, n, density=0.25).A, where density = density of the data. S will be the 
    # resulting matrix 
    # more information at https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.random.html
    #
    # To obtain X - generate a random matrix with X[:, k] uniformly distributed in [-scale_k, scale_k]
    # set X[:, 0] and X[:, -1] to 0 and add matrix S with the sparse data.
    #
    # let y be X@parameters.T + epsilon, with epsilon ~ N(0, 1); y is a vector with m elements
    # parameters - the ideal weights, used to produce output values y
    #
    
   

    return X, y

In [6]:
X, y = gen_data(m_data, n_data, _scales, _parameters)

## Define error function, gradient, inference

In [7]:
def model_estimate(X, w):
    '''Computes the linear regression estimation on the dataset X, using coefficients w
    :param X: a 2d tensor with m_data lines and n_data columns
    :param w: a 1d tensor with n_data coefficients (no intercept)
    :return: a 1d tensor with m_data elements y_hat = w @X.T
    '''
    
    return y_hat 

In [8]:
def J(X, y, w):
    """Computes the mean squared error of model. See the picture from last week's sheet.
    :param X: input values, of shape m_data x n_data
    :param y: ground truth, column vector with m_data values
    :param w: column with n_data coeffieicnts for the linear form 
    :return: a scalar value >= 0
    :use the same formula as in the exercise from last week
    """
   
    return err

In [9]:
def gradient(X, y, w):
    '''Commputes the gradients to be used for gradient descent. 
    :param X: 2d tensor with training data
    :param y: 1d tensor with y.shape[0] == W.shape[0]
    :param w: 1d tensor with current values of the coefficients
    :return: gradients to be used for gradient descent. 
    :use the same formula as in the exercise from last week
    '''
    
    return grad## implement

In [10]:
#The function from last week for comparison
def gd_no_momentum(X, y, w_init, eta=1e-1, thresh = 0.001):
    '''Iterates with gradient descent algorithm
    :param X: 2d tensor with data
    :param y: 1d tensor, ground truth 
    :param w_init: 1d tensor with the X.shape[1] initial coefficients
    :param eta: the learning rate hyperparameter
    :param thresh: the threshold for the gradient norm (to stop iterations)
    :return: the list of succesive errors and the found w* vector 
    '''
    w = w_init
    w_err=[]
    
    while True:
        grad = gradient(X, y, w)
        err = J(X, y, w)
        w_err.append(err)
        w = w - eta * grad
        
        if np.linalg.norm(grad) < thresh:
            break;
    
    return w_err, w

In [12]:
w_init = np.array([0, 0, 0, 0])
errors, w_best = gd_no_momentum(X, y, w_init, 0.0001)

In [401]:
print(f'How many iterations were made: {len(errors)}')

How many iterations were made: 1001


In [14]:
w_best

In [16]:
fig, axes = plt.subplots()
axes.plot(list(range(len(errors))), errors)
axes.set_xlabel('Epochs')
axes.set_ylabel('Error')
axes.set_title('Optimization without momentum')

## Momentum algorithm

In [404]:
#The function from last week for comparison
def gd_with_momentum(X, y, w_init, eta=1e-1, gamma = 0.9, thresh = 0.001):
    """Applies gradient descent with momentum coefficient
    :params: as in gd_no_momentum
    :param gamma: momentum coefficient
    :param thresh: the threshold for the gradient norm (to stop iterations)
    :return: the list of succesive errors and the found w* vector 
    """
    w = w_init
    w_err=[]
   
    delta = np.zeros_like(w)
    while True:
        grad = gradient(X, y, w)
        err = J(X, y, w)
        w_err.append(err)
        w_new = w + gamma * delta - eta * grad
        delta = w_new - w
        w = w_new
        
        if np.linalg.norm(grad) < thresh :
            break;
    return w_err, w

In [405]:
w_init = np.array([0, 0, 0, 0])
errors_momentum, w_best = gd_with_momentum(X, y, w_init,0.0001, 0.9)

In [17]:
print(f'How many iterations were made: {len(errors_momentum)}')

In [18]:
w_best

In [19]:
fig, axes = plt.subplots()
axes.plot(list(range(len(errors_momentum))), errors_momentum)
axes.set_xlabel('Epochs')
axes.set_ylabel('Error')
axes.set_title('Optimization with momentum')

## Apply AdaGrad and report resulting $\eta$'s

In [20]:
def ada_grad(X, y, w_init, eta_init=1e-1, eps = 0.001,thresh = 0.001):
    '''Iterates with gradient descent algorithm
    :param X: 2d tensor with data
    :param y: 1d tensor, ground truth 
    :param w_init: 1d tensor with the X.shape[1] initial coefficients
    :param eps: the epsilon value from the AdaGrad formula
    :param thresh: the threshold for the gradient norm (to stop iterations)
    :return: the list of succesive errors w_err, the found w - the estimated feature vector 
    :and rates the learning rates after the final iteration 
    '''
     
   
    return w_err, w, rates

In [409]:
w_init = np.array([0,0,0,0])
adaGerr, w_ada_best, rates = ada_grad(X, y, w_init)


In [21]:
print(f'How many iterations were made: {len(adaGerr)}')

In [22]:
w_ada_best

In [23]:
fig, axes = plt.subplots()
axes.plot(list(range(len(adaGerr))),adaGerr)
axes.set_xlabel('Epochs')
axes.set_ylabel('Error')
axes.set_title('Optimization with AdaGrad')