# Introduction

In [1]:
import numpy as np
import tensorflow as tf

np.set_printoptions(precision=2, suppress=True)

In [17]:
tf.__version__

'2.3.0'

<div><img src="../../assets/01_ML_Concepts/LR_GD_1.png", width=500, height=100></div>

<div>
<img src="../../assets/01_ML_Concepts/LR_GD_2.png", width=500, height=100>
    </div>

<div>
<img src="../../assets/01_ML_Concepts/LR_GD_3.png", width=500, height=100>
    </div>

# Implementation

Let's implement Gradient Descent using the derived gradients

## Synthetic Data

In [18]:
nb_samples = 100 # n in the whiteboard
nb_features = 3 # d in the whiteboard

# generate three random vectors 
x_1 = np.random.normal(0, 0.1, size=nb_samples)
x_2 = np.random.normal(0, 0.3, size=nb_samples)
x_3 = np.random.normal(0, 0.2, size=nb_samples)

# take their linear combination according to vector theta to generate y
noise = np.random.normal(0, 0.01, size=nb_samples) # some gaussian noise
y = 0.3*x_1 + 0.2*x_2 + 0.1*x_3 + noise # number of parameters corresponidng to nb_features
y = y.reshape(-1, 1) # this is important for tf

# create the design matrix X
X = np.stack([x_1, x_2, x_3], axis=1)

# sanity check the dimensions of our matrices
print(f'X shape = {X.shape}, y shape = {y.shape}')

X shape = (100, 3), y shape = (100, 1)


## Numpy

In [19]:
def compute_gradient(y, theta, X):
    '''
    Implement the gradient as shown from the derivation
    '''
    gradient = (-y.T @ X) + (theta.T @ X.T @ X)
    return gradient.T


def compute_loss(y_pred, y_true):
    '''
    Compute the loss function
    '''
    return np.mean(np.square(y_true - y_pred))

In [20]:
# constants and parameters
learning_rate = 0.01
epochs = 500

# initialize theta
theta = np.random.normal(0, 1, size=nb_features).reshape(-1, 1)

for epoch in range(epochs):
    # compute gradients
    gradient = compute_gradient(y, theta, X)
    
    # update variables
    theta -= learning_rate * gradient
    
    # compute current loss
    y_pred = X @ theta
    loss = compute_loss(y_pred, y)
    if epoch % 100 == 0:
        print(f'Epoch = {epoch} loss = {loss:.2f}')
        
print(f'Final Theta: {theta.squeeze()}')

Epoch = 0 loss = 0.09
Epoch = 100 loss = 0.00
Epoch = 200 loss = 0.00
Epoch = 300 loss = 0.00
Epoch = 400 loss = 0.00
Final Theta: [0.32 0.2  0.09]


Nice!

## Tensorflow 2.0


In [21]:
def train_epoch(X, y, model, loss_function):
    '''
    One Training Epoch
    '''
    with tf.GradientTape() as tape:
        # apply the model
        y_pred = model.predict(X)
        
        # compute current loss
        current_loss = loss_function(y_pred, y, model.variables)
        
    # automatic differentiation step
    gradients = tape.gradient(current_loss, model.variables)

    # apply gradients
    grads_and_vars = zip(gradients, model.variables)
    optimizer.apply_gradients(grads_and_vars)

    return current_loss


def train(data, target, model, loss_function, epochs, verbose=True):
    '''
    Full training pipeline
    '''
    for epoch in range(epochs):
        current_loss = train_epoch(data, target, model, loss_function)
        if epoch % 100 == 0:
            print(f'Epoch {epoch}, current loss {current_loss:.2f}')

In [29]:
# constants and parameters
learning_rate = 0.01
epochs = 500

# define the model
class LinearModel:
    
    def __init__(self, nb_features):
        self.nb_features = nb_features
        self._init_variables()
        
        
    def _init_variables(self):
        theta = tf.random.normal(shape=(self.nb_features, 1))
        self.theta = tf.Variable(theta, trainable=True, dtype=tf.float32)
        self.variables = [self.theta]
        
    
    def predict(self, X):
        theta = self.variables[0]
        return tf.matmul(X, theta)


# define the loss function
def unregularized_loss_function(y_pred, y_true, variables=None):
    squared_error = tf.square(y_true - y_pred)
    sum_squared_error = tf.reduce_sum(squared_error)
    return tf.reduce_mean(sum_squared_error)


# define the data
data = tf.constant(X, dtype=tf.float32)
target = tf.constant(y.reshape(-1, 1), dtype=tf.float32)


# initialize the model
model = LinearModel(nb_features)


# choose an optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)


# train
train(data, target, model, unregularized_loss_function, epochs, verbose=True)

# print results
theta_tensor = model.variables[0]
print(f'Final Theta: {theta_tensor.numpy().squeeze()}')

Epoch 0, current loss 51.57
Epoch 100, current loss 0.04
Epoch 200, current loss 0.01
Epoch 300, current loss 0.01
Epoch 400, current loss 0.01
Final Theta: [0.3 0.2 0.1]


# Loss Function Experiments

Now that we have a nice framework written in Tensorflow, let's run some expirements to get a feel of how different loss functions work. The main idea here is to introduce regularization with the $l_1$ and $l_2$ norms (lasso and ridge) and see how this choice impacts the results.

First let's show how the $l_1$ norm encourages sparse solutions and can work as a feature selector. I will create a bunch of useless features, then regularize my loss function with the $l_1$ norm to see them gone

In [30]:
nb_samples = 500
nb_features = 20 # three real and two useless

X = np.random.rand(nb_samples, nb_features)

noise = np.random.normal(0, 0.1, size=nb_samples)

true_theta = np.zeros(shape=(nb_features, 1))
true_theta[:10] = 0.5
true_theta[10:] = 0.001

y = X @ true_theta   #notice how little x_4 and x_5 contribute to y
y = y.reshape(-1, 1)

# sanity check the dimensions of our matrices
print(f'X shape = {X.shape}, y shape = {y.shape}')

X shape = (500, 20), y shape = (500, 1)


In [31]:
# constants and parameters
learning_rate = 0.0001
epochs = 700


# write the l1 regularized loss function
def lasso_loss_function(y_pred, y_true, variables, lmbda=1):
    theta = variables[0]
    return tf.reduce_mean(tf.reduce_sum(tf.square(y_pred - y_true))) + lmbda * tf.reduce_sum(tf.abs(theta))


# write the l2 regularized loss function
def ridge_loss_function(y_pred, y_true, variables, lmbda=1):
    theta = variables[0]
    return tf.reduce_mean(tf.reduce_sum(tf.square(y_pred - y_true))) + lmbda * tf.reduce_sum(tf.square(tf.abs(theta)))


# define the data
data = tf.constant(X, dtype=tf.float32)
target = tf.constant(y.reshape(-1, 1), dtype=tf.float32)


# initialize the model
model = LinearModel(nb_features)


# choose an optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

# train unregularized
model = LinearModel(nb_features)
train(data, target, model, unregularized_loss_function, epochs, verbose=True)
theta_unreg = model.variables[0]
print('====')

# train lasso
model = LinearModel(nb_features)
train(data, target, model, lasso_loss_function, epochs, verbose=True)
theta_lasso = model.variables[0]
print('====')

# train ridge
model = LinearModel(nb_features)
train(data, target, model, ridge_loss_function, epochs, verbose=True)
theta_ridge = model.variables[0]

Epoch 0, current loss 1704.86
Epoch 100, current loss 274.93
Epoch 200, current loss 51.32
Epoch 300, current loss 10.46
Epoch 400, current loss 2.30
Epoch 500, current loss 0.54
Epoch 600, current loss 0.13
====
Epoch 0, current loss 6573.56
Epoch 100, current loss 160.38
Epoch 200, current loss 37.41
Epoch 300, current loss 12.30
Epoch 400, current loss 6.72
Epoch 500, current loss 5.45
Epoch 600, current loss 5.13
====
Epoch 0, current loss 2463.77
Epoch 100, current loss 209.32
Epoch 200, current loss 42.64
Epoch 300, current loss 10.85
Epoch 400, current loss 4.34
Epoch 500, current loss 2.91
Epoch 600, current loss 2.58


In [32]:
# print results
theta_tensor = model.variables[0]
print(f'Final Theta (Unregularized): {theta_unreg.numpy().squeeze()}')
print(f'Final Theta (Lasso): {theta_lasso.numpy().squeeze()}')
print(f'Final Theta (Ridge): {theta_ridge.numpy().squeeze()}')

Final Theta (Unregularized): [ 0.49  0.51  0.5   0.49  0.48  0.49  0.5   0.51  0.5   0.49  0.    0.
  0.01  0.01  0.    0.01 -0.   -0.    0.01  0.01]
Final Theta (Lasso): [0.49 0.5  0.5  0.49 0.49 0.49 0.5  0.51 0.5  0.49 0.   0.01 0.02 0.
 0.01 0.   0.   0.   0.   0.  ]
Final Theta (Ridge): [0.5  0.49 0.5  0.48 0.49 0.49 0.49 0.48 0.48 0.5  0.01 0.   0.   0.
 0.02 0.01 0.   0.01 0.02 0.01]


As you can see, when we crank up the regularization parameter $\lambda$ to very large values, the lasso regularizer keeps the most important features and suppresses the less important ones