In [7]:
#* get autorelod for the notesbook
%load_ext autoreload
%autoreload 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
import numpy as np
from minitorch.tensor.tensor import Tensor
from minitorch.train.training import CosineSchedule, clip_grad_norm,Trainer
from minitorch.nn.layers import Linear
from minitorch.optimizers.optim import SGD
from minitorch.losses.losses import MSE
from minitorch.dataloaders.dataloader import TensorDataset,DataLoader

In [2]:
def unit_tests_cosine_scheduler():
    print('Unit Tests: Testing the cosine scheduler ....')
    scheduler = CosineSchedule()
    tolerance = 1e-6
    
    #* Test basic schedule
    lr_start = scheduler.get_lr(0)
    lr_quarter = scheduler.get_lr(25)
    lr_middle = scheduler.get_lr(50)
    lr_end = scheduler.get_lr(100)
    
    print(f"Learning rate at epoch 0: {lr_start:.4f}")
    print(f"Learning rate at epoch 25: {lr_middle:.4f}")
    print(f"Learning rate at epoch 50: {lr_middle:.4f}")
    print(f"Learning rate at epoch 100: {lr_end:.4f}")
    
    #* validate behavior
    assert abs(lr_start - 0.1) < tolerance, f'Expected 0.1 at start, got {lr_start}'
    assert abs(lr_end - 0.01) < tolerance, f'Expected 0.01 at the end, got {lr_end}'
    assert 0.01 < lr_middle < 0.1, f'Expected middle lr to be between 0.01 and 0.1, got {lr_middle}'
    
    #* monotonic test
    assert lr_quarter > lr_middle, 'Lr should decrease monotonically in first half'
    
    print('Cosine Scheduler works perfectly')
    
unit_tests_cosine_scheduler()
    

Unit Tests: Testing the cosine scheduler ....
Learning rate at epoch 0: 0.1000
Learning rate at epoch 25: 0.0550
Learning rate at epoch 50: 0.0550
Learning rate at epoch 100: 0.0100
Cosine Scheduler works perfectly


In [40]:
def unit_tests_clip_grad_norm():
    print('Unit Tests: Testing the clip_grad_norm function ....')
    x = Tensor(np.array([[2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [5.6,7.0, 11.9,12.0]], dtype='float32'), requires_grad=True)
    y = Tensor(np.array([1.0, 2.0, 3.0, 3.0, 4.0,5.0], dtype='float32'), requires_grad=True)
    
    
    x.grad = np.random.randint(x.shape[0], size=x.shape, dtype='int32').astype('float32')
    y.grad = np.random.randint(y.shape[0], size=y.shape, dtype='int32').astype('float32')
    
    #* clip manually to verify
    x_grad = np.sum(x.grad ** 2)
    y_grad = np.sum(y.grad ** 2)
    total_norm = np.sqrt(x_grad + y_grad)
    
    if total_norm > 1.0:
        clip_coef = 1.0 / (total_norm + 1e-6)
        x.grad *= clip_coef
        y.grad *= clip_coef
        
    #* now use the function to verify it does the same thing
    x_copy = x.copy()
    y_copy = y.copy()
    x_copy.grad = x.grad.copy()
    y_copy.grad = y.grad.copy()
    
    norm = clip_grad_norm([x_copy,y_copy], max_norm=1.0)
    
    
    #* compare manual clipping with function clipping
    assert np.allclose(x_copy.grad, x.grad), "x grad should be the same"
    assert np.allclose(y_copy.grad, y.grad), "y grad should be the same"
    # assert norm > 1.0, f"Total norm should be greater than 1.0 before clipping, got {norm:.4f}"
    print('clip_grad_norm works perfectly')
    
    
unit_tests_clip_grad_norm()

Unit Tests: Testing the clip_grad_norm function ....
clip_grad_norm works perfectly


In [82]:
x = Tensor(np.array([[2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [5.6,7.0, 11.9,12.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [5.6,7.0, 11.9,12.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [5.6,7.0, 11.9,12.0]],
                    dtype='float32'),
                    requires_grad=True)

y = Tensor(np.array([1.0, 2.0, 3.0, 4.0,5.0,1.0,
                    2.0, 3.0, 4.0,5.0,1.0, 2.0,
                    3.0, 4.0,5.0,1.0, 2.0, 3.0,
                    3.0, 4.0,5.0,1.0, 2.0, 3.0, 4.0,5.0,1.0,
                    2.0, 3.0, 4.0,5.0,1.0, 2.0,
                    3.0, 4.0,5.0,1.0, 2.0, 3.0,
                    3.0, 4.0,5.0,1.0, 2.0, 3.0, 4.0,5.0,1.0,
                    2.0, 3.0, 4.0,5.0,1.0, 2.0,
                    3.0, 4.0,5.0,1.0, 2.0, 3.0,
                    3.0, 4.0,5.0],
                    dtype='float32'),
                    requires_grad=True)

#* get the model, scheduler and the loss fn
model = Linear(in_features=x.shape[1], out_features=1)
scheduler = CosineSchedule()
loss_fn = MSE()
optimizer = SGD(model.parameters())

#* get the dataloader
ds = TensorDataset(x,y)
dataloader = DataLoader(dataset=ds, batch_size=2)

#* call the trainer class
train = Trainer(
    model = model,
    loss_fn= loss_fn,
    scheduler= scheduler,
    clip_gradients=True,
    optimizer= optimizer
)

#* train the model
for epoch in range(10000):
    loss = train.train_epoch(dataloader, 3)
    if (epoch +1) % 1000 == 0:
        print(f'Training Epoch: {epoch + 1}, loss: {loss}')

Training Epoch: 1000, loss: 1.560002081986237
Training Epoch: 2000, loss: 1.3832915037229556
Training Epoch: 3000, loss: 1.7599050579085003
Training Epoch: 4000, loss: 1.5163144629383793
Training Epoch: 5000, loss: 1.579355283036478
Training Epoch: 6000, loss: 1.467424816016217
Training Epoch: 7000, loss: 1.5350172343365485
Training Epoch: 8000, loss: 1.5638368632579684
Training Epoch: 9000, loss: 1.43937591998689
Training Epoch: 10000, loss: 1.6988214809914917


In [81]:
train.history['learning_rates']

[0.1,
 0.09997779521645793,
 0.09991120277927222,
 0.0998002884071386,
 0.09964516155915151,
 0.0994459753267812,
 0.09920292628279101,
 0.09891625428724364,
 0.09858624225078841,
 0.09821321585546244,
 0.09779754323328192,
 0.09733963460294015,
 0.09683994186497132,
 0.09629895815577916,
 0.09571721736097089,
 0.09509529358847656,
 0.09443380060197386,
 0.09373339121517747,
 0.09299475664759069,
 0.09221862584235528,
 0.09140576474687263,
 0.09055697555690607,
 0.08967309592491052,
 0.08875499813337069,
 0.08780358823396352,
 0.08681980515339464,
 0.08580461976679099,
 0.08475903393956434,
 0.08368407953869104,
 0.08258081741438394,
 0.0814503363531613,
 0.08029375200334589,
 0.07911220577405485,
 0.07790686370876672,
 0.07667891533457719,
 0.07542957248827961,
 0.07416006812042827,
 0.07287165507856512,
 0.07156560487081053,
 0.07024320641103812,
 0.06890576474687264,
 0.06755459977176533,
 0.06619104492241847,
 0.06481644586284442,
 0.06343215915635762,
 0.0620395509268104,
 0.06063

In [46]:
out,y

(Tensor(data=[[-1.71782212]
  [-1.82353539]
  [ 2.89883118]
  [ 1.89663584]
  [ 7.12683731]
  [-1.75512082]], shape=(6, 1), grad_info= requires_grad=True),
 Tensor(data=[1. 2. 3. 3. 4. 5.], shape=(6,), grad_info= requires_grad=True))