In [3]:
import pandas as pd
import numpy as np

# Stochastic gradient descent

In [50]:
def stochastic_gradient_descent(X, y, initial_solution, calculate_gradient, learning_rate=0.01, max_num_epoch=1000, verbose=True):
    """
    Performs stochastic gradient descent optimization.

    In every epoch the samples are reshuffled and each (sample, target) pair is
    visited exactly once; after each visit the solution is updated as
    ``solution - learning_rate * calculate_gradient(sample, target, solution)``.
    Shuffling uses NumPy's global random state (``np.random.permutation``), so
    call ``np.random.seed(...)`` beforehand for reproducible runs.

    Parameters:
    - X: Input data, one sample per row (NumPy array, pandas DataFrame/Series,
      or anything ``np.asarray`` accepts).
    - y: Target labels, one entry per row of X (NumPy array, pandas
      DataFrame/Series, or array-like). A DataFrame may hold the targets
      either as rows (N, k) or, as in earlier versions, as columns (k, N).
    - initial_solution: Initial solution for optimization.
    - calculate_gradient: Function ``f(sample, target, solution)`` returning
      the gradient for a single sample.
    - learning_rate: Learning rate for updating the solution (default: 0.01).
    - max_num_epoch: Maximum number of epochs, i.e. full passes over the data
      (default: 1000).
    - verbose: If True (default), print the current solution after each epoch.

    Returns:
    - The optimized solution.

    Raises:
    - ValueError: If X and y do not contain the same number of samples.
    """

    # initialization: normalise the inputs to NumPy arrays so that the
    # shuffling below is always positional (never pandas label-based).
    if isinstance(X, (pd.DataFrame, pd.Series)):
        samples = X.to_numpy()
    else:
        samples = np.asarray(X)
    num_samples = samples.shape[0]

    if isinstance(y, pd.DataFrame):
        targets = y.to_numpy()
        # Earlier versions always transposed the frame, which only works when
        # the targets are laid out as columns (k, N). Keep that orientation
        # fix-up, but leave the usual (N, k) layout untouched.
        if targets.shape[1] == num_samples:
            targets = targets.T
    elif isinstance(y, pd.Series):
        targets = y.to_numpy()
    else:
        targets = np.asarray(y)

    if targets.ndim == 0 or targets.shape[0] != num_samples:
        raise ValueError(
            "X and y must contain the same number of samples: "
            f"X has {num_samples}, y has shape {targets.shape}"
        )

    current_solution = initial_solution

    for _ in range(max_num_epoch):
        # Reshuffle at the start of every epoch; the same permutation is
        # applied to both arrays so each sample stays paired with its target.
        shuffled_idx = np.random.permutation(num_samples)
        samples, targets = samples[shuffled_idx], targets[shuffled_idx]
        for X_selected, y_selected in zip(samples, targets):
            gradient = calculate_gradient(X_selected, y_selected, current_solution)
            # Builds a new object each step, so initial_solution is never mutated.
            current_solution = current_solution - learning_rate * gradient
        if verbose:
            print('Epoch:', current_solution)
    return current_solution

# TODO

1. Implement a stopping condition, controlled by an additional `loss_tolerance` parameter.
2. Process the data in batches.
3. Shuffle the data at the beginning of each epoch.
4. Iterate over the shuffled array.
5. Later, re-implement stochastic GD as a special case of mini-batch GD.
6. Mini-batch GD needs two additional parameters: the batch size, or the fraction of the initial data to use per batch.

In [55]:
#TEST
# Smoke test of the update loop using a placeholder gradient.
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([5, 13, 25, 39, 63])
initial_solution = np.array([10, -10])


def calculate_gradient(sample, target, solution):
    """Placeholder gradient: returns the sample itself, ignoring target and solution."""
    return sample


# The placeholder gradient depends on neither the targets nor the current
# solution, so every epoch subtracts learning_rate * X.sum(axis=0)
# = 0.001 * [15, 20] regardless of the shuffle order. After 100 epochs:
# [10, -10] - 100 * [0.015, 0.020] = [8.5, -12.0] (up to float rounding).
# This checks the loop mechanics only; it does not fit y.
print(stochastic_gradient_descent(X, y, initial_solution, calculate_gradient, max_num_epoch=100, learning_rate=0.001)) # Expected output: approximately [8.5, -12.0]

Gradient: [3 4]
Gradient: [1 2]
Gradient: [4 5]
Gradient: [2 3]
Gradient: [5 6]
Epoch: [  9.985 -10.02 ]
Gradient: [3 4]
Gradient: [5 6]
Gradient: [2 3]
Gradient: [4 5]
Gradient: [1 2]
Epoch: [  9.97 -10.04]
Gradient: [2 3]
Gradient: [3 4]
Gradient: [1 2]
Gradient: [5 6]
Gradient: [4 5]
Epoch: [  9.955 -10.06 ]
Gradient: [5 6]
Gradient: [1 2]
Gradient: [4 5]
Gradient: [3 4]
Gradient: [2 3]
Epoch: [  9.94 -10.08]
Gradient: [4 5]
Gradient: [1 2]
Gradient: [3 4]
Gradient: [5 6]
Gradient: [2 3]
Epoch: [  9.925 -10.1  ]
Gradient: [1 2]
Gradient: [3 4]
Gradient: [5 6]
Gradient: [4 5]
Gradient: [2 3]
Epoch: [  9.91 -10.12]
Gradient: [5 6]
Gradient: [2 3]
Gradient: [1 2]
Gradient: [4 5]
Gradient: [3 4]
Epoch: [  9.895 -10.14 ]
Gradient: [5 6]
Gradient: [3 4]
Gradient: [4 5]
Gradient: [1 2]
Gradient: [2 3]
Epoch: [  9.88 -10.16]
Gradient: [3 4]
Gradient: [2 3]
Gradient: [5 6]
Gradient: [1 2]
Gradient: [4 5]
Epoch: [  9.865 -10.18 ]
Gradient: [1 2]
Gradient: [2 3]
Gradient: [3 4]
Gradient: [4 5]

# Mini batch gradient descent

# Mini batch gradient descent with momentum 

# AdaGrad

# RMSProp

# Adam