In [8]:
#* get autorelod for the notesbook
%load_ext autoreload
%autoreload 3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import numpy as np
from minitorch.tensor.tensor import Tensor
from minitorch.train.training import CosineSchedule, clip_grad_norm,Trainer
from minitorch.nn.layers import Linear
from minitorch.optimizers.optim import SGD, Adam, AdamW
from minitorch.losses.losses import MSE
from minitorch.dataloaders.dataloader import TensorDataset,DataLoader

In [10]:
def unit_tests_cosine_scheduler():
    print('Unit Tests: Testing the cosine scheduler ....')
    scheduler = CosineSchedule()
    tolerance = 1e-6
    
    #* Test basic schedule
    lr_start = scheduler.get_lr(0)
    lr_quarter = scheduler.get_lr(25)
    lr_middle = scheduler.get_lr(50)
    lr_end = scheduler.get_lr(100)
    
    print(f"Learning rate at epoch 0: {lr_start:.4f}")
    print(f"Learning rate at epoch 25: {lr_middle:.4f}")
    print(f"Learning rate at epoch 50: {lr_middle:.4f}")
    print(f"Learning rate at epoch 100: {lr_end:.4f}")
    
    #* validate behavior
    assert abs(lr_start - 0.1) < tolerance, f'Expected 0.1 at start, got {lr_start}'
    assert abs(lr_end - 0.01) < tolerance, f'Expected 0.01 at the end, got {lr_end}'
    assert 0.01 < lr_middle < 0.1, f'Expected middle lr to be between 0.01 and 0.1, got {lr_middle}'
    
    #* monotonic test
    assert lr_quarter > lr_middle, 'Lr should decrease monotonically in first half'
    
    print('Cosine Scheduler works perfectly')
    
unit_tests_cosine_scheduler()
    

Unit Tests: Testing the cosine scheduler ....
Learning rate at epoch 0: 0.1000
Learning rate at epoch 25: 0.0550
Learning rate at epoch 50: 0.0550
Learning rate at epoch 100: 0.0100
Cosine Scheduler works perfectly


In [11]:
def unit_tests_clip_grad_norm():
    print('Unit Tests: Testing the clip_grad_norm function ....')
    x = Tensor(np.array([[2.0, 3.0, 4.6,7.0],
                        [4.0,5.0,8.0,10.0],
                        [5.6,7.0, 11.1,1.0],
                        [2.0, 3.0,0.0,-1.0],
                        [4.0,5.0,-2.0, -10.0],
                        [5.6,7.0, 11.9,12.0]], dtype='float32'), requires_grad=True)
    y = Tensor(np.array([1.0, 2.0, 3.0, 3.0, 4.0,5.0], dtype='float32'), requires_grad=True)
    
    
    x.grad = np.random.randint(x.shape[0], size=x.shape, dtype='int32').astype('float32')
    y.grad = np.random.randint(y.shape[0], size=y.shape, dtype='int32').astype('float32')
    
    #* clip manually to verify
    x_grad = np.sum(x.grad ** 2)
    y_grad = np.sum(y.grad ** 2)
    total_norm = np.sqrt(x_grad + y_grad)
    
    if total_norm > 1.0:
        clip_coef = 1.0 / (total_norm + 1e-6)
        x.grad *= clip_coef
        y.grad *= clip_coef
        
    #* now use the function to verify it does the same thing
    x_copy = x.copy()
    y_copy = y.copy()
    x_copy.grad = x.grad.copy()
    y_copy.grad = y.grad.copy()
    
    norm = clip_grad_norm([x_copy,y_copy], max_norm=1.0)
    
    
    #* compare manual clipping with function clipping
    assert np.allclose(x_copy.grad, x.grad), "x grad should be the same"
    assert np.allclose(y_copy.grad, y.grad), "y grad should be the same"
    # assert norm > 1.0, f"Total norm should be greater than 1.0 before clipping, got {norm:.4f}"
    print('clip_grad_norm works perfectly')
    
    
unit_tests_clip_grad_norm()

Unit Tests: Testing the clip_grad_norm function ....
clip_grad_norm works perfectly


In [102]:
x = np.random.rand(2, 4).astype('float32')
w = np.random.rand(4, 1).astype('float32')
b =  np.random.rand(1).astype('float32')
x @ w + b

array([[1.0929275],
       [1.5605981]], dtype=float32)

In [91]:
def generate_linear_data(num_samples=500, in_features=4, out_features=1):
    np.random.seed(42)
    
    X = np.random.rand(num_samples, in_features).astype('float32')
    true_weights = np.random.rand(in_features, out_features).astype('float32')
    true_bias = np.random.rand(out_features).astype('float32')
    y = X @ true_weights + true_bias 
    noise = np.random.normal(0, 0.1, size=(num_samples, out_features)).astype('float32') 
    y += noise
    
    #* convert to tensors
    X = Tensor(X, requires_grad=False)
    y = Tensor(y, requires_grad=False) 
    return X, y

X,y = generate_linear_data()

In [92]:
X.shape, y.shape

((500, 4), (500, 1))

In [105]:
#* define the hyperparameters
MAX_EPOCHS = 1000
MOMENTUM = 0.0
WEIGHT_DECAY = 0.0
MAX_LR = 0.01
MIN_LR = 0.001
BATCH_SIZE = 32
STEPS = MAX_EPOCHS / 10
ACCUMULATION_STEPS = 1

def train_model(Optimizerclass,
                optimizer_name,
                max_iters=200,
                lr=0.05,
                num_samples=500,
                in_features=4,
                out_features=1):
    print(f'\nTraining with {optimizer_name}')
    STEPS = max_iters / 10
    
    # get the data
    X, y = generate_linear_data(num_samples= num_samples, in_features=in_features)
    
    # instantiate the model and the optimizer
    model = Linear(in_features= in_features, out_features= out_features)
    optimizer = Optimizerclass(model.parameters(), lr=lr)
    loss_fn = MSE()
    
    #* get the dataloader
    ds = TensorDataset(X,y)
    dataloader = DataLoader(dataset=ds, batch_size=8, shuffle=True)
    
    #* get the trainer
    train = Trainer(
        model=model,
        optimizer= optimizer,
        loss_fn= loss_fn,
        clip_gradients= True
    )
    
    # training loop
    for iteration in range(max_iters):
        loss = train.train_epoch(dataloader)
        
        if (iteration + 1) % STEPS == 0:
            print(f'Training Epoch: {iteration + 1} | loss: {loss}')
            

In [107]:
train_model(SGD,optimizer_name='SGD Optimizer', lr= 0.01, max_iters=20)


Training with SGD Optimizer
Training Epoch: 2 | loss: 0.16034837803546995
Training Epoch: 4 | loss: 0.04895071983956229
Training Epoch: 6 | loss: 0.03679352021414672
Training Epoch: 8 | loss: 0.029322069325587254
Training Epoch: 10 | loss: 0.02453618439956383
Training Epoch: 12 | loss: 0.02144237129418736
Training Epoch: 14 | loss: 0.01977832196577257
Training Epoch: 16 | loss: 0.01804137023251128
Training Epoch: 18 | loss: 0.017512256983210718
Training Epoch: 20 | loss: 0.01684939786655444


In [88]:
state

{'optimizer_state': {}, 'scheduler_state': None}