In [81]:
%load_ext autoreload
%autoreload 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[autoreload of minitorch.optimizers.optim failed: Traceback (most recent call last):
  File "c:\Users\User\Desktop\babytorch\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 274, in check
    superreload(m, reload, self.old_objects, self.shell)
  File "c:\Users\User\Desktop\babytorch\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "c:\Users\User\Desktop\babytorch\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 397, in update_generic
    update(a, b)
  File "c:\Users\User\Desktop\babytorch\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 330, in update_class
    old_obj = getattr(old, key)
AttributeError: 'types.GenericAlias' object has no attribute '__copy__'. Did you mean: '__bool__'?
]


In [82]:
import numpy as np
from minitorch.tensor.tensor import Tensor
from minitorch.optimizers.optim import SGD
from minitorch.nn.layers import Linear
from minitorch.losses.losses import MSE

In [83]:
def sgd_unit_test():
    """Test SGD optimizer implemetation"""
    print('Unit Test: SGD Optimizer ....')
    
    #* basic optimizer test
    param = Tensor(np.array([1.0,2.0], dtype=np.float32), requires_grad=True)
    optimizer = SGD([param], lr=0.1)
    param.grad = Tensor(np.array([2.0, 1.0], dtype=np.float32))
    original_data = param.data.copy()
    grad = param.grad.data
    optimizer.step()
    
    expected = original_data - optimizer.learning_rate * grad
    print(expected, param)
    assert np.allclose(expected, param.data)
    assert optimizer.step_count == 1
    print('Basic SGD optimizer works correctly')
    
    # optimizer with momentum test
    param2 = Tensor(np.array([1.0, 2.0]), requires_grad=True)
    optimizer_momentum = SGD([param2], lr=0.1, momentum=0.9)
    param2.grad = Tensor(np.array([2.0, 1.0]))
    original_data = param2.data.copy()
    grad = param2.grad.data
    optimizer_momentum.step()
    
    expected = original_data - optimizer_momentum.learning_rate * grad
    assert np.allclose(expected.data, param2.data)
    assert optimizer_momentum.step_count == 1, f'step count expected to be 1 got {optimizer_momentum.step_count}'
    print('SGD Oprimizer with momentum works correctly ')
    
    # test weight decay
    param3 = Tensor(np.array([1.0, 2.0]), requires_grad=True)
    optimizer_weight_decay = SGD([param3], weight_decay=0.1)
    param3.grad = Tensor(np.array([3.0, 4.0]))
    
    optimizer_weight_decay.step()

    
    expected = param3.data - optimizer_weight_decay.learning_rate * (param3.grad.data + optimizer_weight_decay.weight_decay * param3.data)
    assert np.allclose(expected, param3.data, rtol=0.05)
    print('SGD Optimizer with weight decay works correctly')
    
    print("SGD optimizer works correctly!")

    
if __name__ == '__main__':
    sgd_unit_test()

Unit Test: SGD Optimizer ....
[0.8 1.9] Tensor(data=[0.8 1.9])
Basic SGD optimizer works correctly
SGD Oprimizer with momentum works correctly 
SGD Optimizer with weight decay works correctly
SGD optimizer works correctly!


In [84]:
x = Tensor(np.array([[2.0, 3.0, 4.6,7.0],
                    [4.0,5.0,8.0,10.0],
                    [5.6,7.0, 11.1,1.0],
                    [2.0, 3.0,0.0,-1.0],
                    [4.0,5.0,-2.0, -10.0],
                    [5.6,7.0, 11.9,12.0]]), requires_grad=True)
y = Tensor(np.array([1.0, 2.0, 3.0, 3.0, 4.0,5.0]), requires_grad=True)

weight = Tensor(np.array([0.1, 0.2, 0.3,0.4]), requires_grad=True)
bias = Tensor(np.array([0.0,0.0,0.0,0.0,0.0,0.0]), requires_grad=True)

loss_fn = MSE()
optimizer = SGD([weight, bias],lr=0.001, momentum=0.9, weight_decay=0.01)

for i in range(100):
    y_hat = x @ weight.transpose() + bias
    loss = loss_fn(y_hat, y)
    if i % 10 ==0:
        print(f'Iteration {i}, loss {loss.data}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Iteration 0, loss 23.954233333333338
Iteration 10, loss 5.652310933047676
Iteration 20, loss 0.7633906755512504
Iteration 30, loss 1.008738277237916
Iteration 40, loss 0.7229959892219938
Iteration 50, loss 0.46474383421076865
Iteration 60, loss 0.4137194474611481
Iteration 70, loss 0.3795707359642195
Iteration 80, loss 0.34471157095081856
Iteration 90, loss 0.3204017834229631


In [85]:
linear = Linear(x.shape[1], y.size, bias=True)
loss_fn = MSE()
optimizer = SGD(linear.parameters(), lr=0.001, momentum=0.0, weight_decay=0.0)
# y_hat = x @ weight + bias

for i in range(100):
    y_hat = linear(x)
    loss = loss_fn(y_hat, y)
    if i % 10 == 0:
        print(f'Iteration {i}, Loss: {loss.data}')
        
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Iteration 0, Loss: 87.98768337839843
Iteration 10, Loss: 47.13314110006033
Iteration 20, Loss: 28.82966828586299
Iteration 30, Loss: 19.901986869317334
Iteration 40, Loss: 15.023126775449306
Iteration 50, Loss: 12.009344011760586
Iteration 60, Loss: 9.94159845000162
Iteration 70, Loss: 8.414411566179476
Iteration 80, Loss: 7.2346019998700655
Iteration 90, Loss: 6.299720787474553


In [86]:
y_hat = linear(x)
y_hat, y

(Tensor(data=[[ 0.99982871  1.45375876  1.47368904  1.61246175  2.19824459  2.04353783]
  [ 0.98780671  2.7617519   2.36157125  2.72688993  3.39094325  3.65937865]
  [-0.09930527 -0.0367062   2.43131109  2.80896348  2.901384    6.77009805]
  [-1.16326663  1.37666317 -0.79306292  1.16934284 -0.34530165  1.15323477]
  [-4.00648175  1.80063086 -2.73407138  1.56039148 -2.50270349  2.06285023]
  [ 1.35207708  2.89950754  3.44403773  3.63446804  4.68520392  5.67660373]], shape=(6, 6), grad_info= requires_grad=True),
 Tensor(data=[1. 2. 3. 3. 4. 5.], shape=(6,), grad_info= requires_grad=True))