In [1]:
import torch
import numpy as np
import random
from torch import nn,tensor
import matplotlib.pyplot as plt
from datasets import load_dataset
from torchmetrics.classification import MulticlassAccuracy 
import torchvision.transforms.functional as TF

import fastcore.all as fc

from lib import *


KeyboardInterrupt



In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    PyTorch is additionally told to use deterministic algorithms only, and
    then PyTorch, Python's `random` module and NumPy's global generator are
    all seeded with the same `seed`.
    """
    torch.use_deterministic_algorithms(True)
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
    
    
@inplace
def transformi(b):
    """Replace column `x` of batch `b` with its `TF.to_tensor` conversion.

    `x` is the module-level column name. The function returns nothing and
    mutates `b`; presumably `@inplace` (from `lib`) hands the mutated batch
    back to the caller -- confirm against `lib`.
    """
    images = b[x]
    b[x] = list(map(TF.to_tensor, images))
    

class SingleBatchCB(Callback):
    """Callback that aborts the fit right after the first batch.

    Later cells use it to run a single batch so that the learner's optimizer
    gets created and can be inspected.
    """
    order = 1

    def after_batch(self, learn):
        # Raising here ends the fit as soon as one batch has been processed.
        raise CancelFitException()

In [None]:
def conv(ni, nf, ks=3, stride=2, act=nn.ReLU, norm=None, bias=None):
    """Build a `conv -> [norm] -> [activation]` block as an `nn.Sequential`.

    ni, nf: number of input / output channels of the convolution
    ks: kernel size; padding is `ks // 2`
    stride: stride of the convolution
    act: activation factory called with no arguments, or a falsy value for none
    norm: normalization-layer factory called as `norm(nf)` (a class such as
        `nn.BatchNorm2d`, or a `functools.partial` of one), or a falsy value
        for none
    bias: whether the convolution has a bias; when None it is enabled unless
        `norm` is a batch-norm layer (whose own shift makes a conv bias
        redundant)
    """
    if bias is None:
        # `norm` is a factory, not an instance, so `isinstance(norm, BatchNorm)`
        # would always be False. Check the class instead, unwrapping a
        # functools.partial if that is what was passed.
        norm_cls = getattr(norm, "func", norm)
        is_batchnorm = isinstance(norm_cls, type) and issubclass(
            norm_cls, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
        )
        bias = not is_batchnorm
    layers = [nn.Conv2d(ni, nf, stride=stride, kernel_size=ks, padding=ks//2, bias=bias)]
    if norm:
        layers.append(norm(nf))
    if act:
        layers.append(act())
    return nn.Sequential(*layers)
# conv->LayerNorm->act

In [None]:
def get_model(act=nn.ReLU, nfs=None, norm=None):
    """Assemble the convolutional classifier and move it to `device`.

    act: activation factory forwarded to every hidden `conv` block
    nfs: chain of channel counts; consecutive entries are the in/out channels
        of one hidden block (defaults to `[1, 8, 16, 32, 64]`)
    norm: normalization factory forwarded to every hidden `conv` block

    The head is a `conv` to 10 channels with no activation, no norm and an
    explicit bias, followed by `nn.Flatten`.
    """
    channels = [1, 8, 16, 32, 64] if nfs is None else nfs
    body = []
    for idx in range(len(channels) - 1):
        body.append(conv(channels[idx], channels[idx + 1], act=act, norm=norm))
    head = conv(channels[-1], 10, act=None, norm=False, bias=True)
    model = nn.Sequential(*body, head, nn.Flatten())
    return model.to(device)

In [None]:
# Prefer Apple's MPS backend, then CUDA, and fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
set_seed(42)
# Accuracy metric configured for 10 classes, wrapped in the project's MetricCB callback.
metrics = MetricCB(MulticlassAccuracy(10))

In [None]:
# Column names of the dataset; `transformi` reads the global `x`.
x,y = 'image','label'
dsd = load_dataset("fashion_mnist")
bs = 1024  # batch size handed to the DataLoaders
# Register `transformi` as the transform of the dataset dict.
tds = dsd.with_transform(transformi)
dls = DataLoaders.from_datasetDict(tds, bs)
dt = dls.train
# Grab one training batch; the next cell computes its mean/std for normalization.
xb,yb = next(iter(dt))
xb.shape,yb[:10]

In [None]:
xmean, xstd = xb.mean(), xb.std()
xmean, xstd # no normally distributed!

def normalize(b):
    """Standardize the inputs of a batch with the global `xmean` / `xstd`.

    b: indexable batch whose item 0 holds the inputs and item 1 the targets
    Returns the tuple `(normalized_inputs, targets)`; targets pass through
    untouched.
    """
    inputs, targets = b[0], b[1]
    return (inputs - xmean) / xstd, targets

norm = BatchTransformCB(normalize) 

In [None]:
leak = 0.1  # shared by the activation (`leak=`) and the weight init (`leaky=`) below
general_leaky_relu = partial(GeneralRelu, leak=leak, subtract=0.4)
astats = ActivationStats(append_stats, fc.risinstance(GeneralRelu)) # the filter keeps only modules that are GeneralRelu instances
cbs = [DeviceCB(), ProgressCB(plot=True), metrics, astats, norm] 
f_init_weights = partial(init_weights, leaky=leak)

In [None]:
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = MomentumLearner(model, dls, F.cross_entropy, lr=0.4, cbs=cbs)

In [None]:
learn.fit(3)

In [None]:
astats.plot_all()

# SGD
Now we implement SGD from scratch

In [None]:
model.parameters()

In [None]:
class SGD:
    '''
    Minimal stochastic gradient descent optimizer with optional weight decay.

    Subclasses change the update rule by overriding `opt_step` (and, if
    needed, `regularizer_step`); `step` and `zero_grad` are shared.
    '''
    def __init__(self, params, lr, weight_decay=0.):
        '''
        params: iterable of the tensors to optimise, e.g. the generator
            returned by `model.parameters()`
        lr: learning rate
        weight_decay: weight decay coefficient; 0 disables the decay
        '''
        params = list(params) # materialise: a generator could be iterated only once
        fc.store_attr()
        self.batch_number = 0 # number of completed steps; used by Adam for bias correction

    def step(self):
        '''
        Update every parameter once: first the regularizer step, then the
        optimizer step. Parameters without a gradient (frozen, or not part of
        the last backward pass) are left untouched.
        '''
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                self.regularizer_step(p)
                self.opt_step(p)
        self.batch_number += 1

    def opt_step(self, p):
        # modify the weights directly instead of modifying the grad
        p -= p.grad * self.lr

    def regularizer_step(self, p):
        '''
        Weight decay: shrink the weights a little at every step, which keeps
        their magnitude small (small weights tend to generalise better).

            p' = p - wd*lr*p
            p' = p * (1 - wd*lr)

        Note that this update is scaled by the learning rate too. It acts on
        the weights directly and does not touch the gradient.
        '''
        if self.weight_decay:
            p *= 1 - (self.lr * self.weight_decay)

    def zero_grad(self):
        '''Reset in place the gradient of every parameter that has one.'''
        for p in self.params:
            if p.grad is None:
                continue
            # going through .data keeps the reset out of autograd's tracking
            p.grad.data.zero_()

Consider the difference between weight decay and L2 regularization:
    
    weights -= lr*wd*weights
    
vs

    weights.grad += wd*weights

L2 regularization penalizes large weights to prevent overfitting. 

2 approaches (the first was in my mind and wrong):

> params -= wd * lr * params (Commonly used in some implementations):

Adjusts the weights (params) by subtracting a fraction (wd * lr) of the weights themselves. It scales the amount of wd by the lr -> the larger the lr or wd, the larger the adjustment to the weights. This combined effect helps to regulate the magnitude of the weights during optimization/more effective regularization.

> params -= wd * params (Alternative formulation without considering the learning rate):

Without considering the learning rate, it directly penalizes the weights by subtracting a fraction of the weights themselves. However, it doesn't account for the scale at which these adjustments are made based on the learning rate.

Scaling the decay by the learning rate (the first formulation) allows finer control over how much the weights are penalized in each update step.

By including the learning rate in the weight decay calculation, it provides a mechanism to scale the regularization effect relative to the step size taken during gradient descent. This combined formulation helps prevent the model from overfitting by appropriately penalizing large weights while ensuring a stable and effective optimization process.

In [None]:
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=0.4, cbs=cbs, opt_func=SGD)

In [None]:
learn.fit(3)

# Momentum
Momentum belongs in the optimizer rather than in the learner, which is where we put it before (`MomentumLearner`).  
It lets each update follow a moving average of the directions of the recent SGD steps on the loss surface. The higher the momentum, the more slowly it reacts to changes of direction.

In [None]:
class Momentum(SGD):
    '''
    SGD that steps along a moving average of the gradients.

    Idea: take only a small fraction of the current batch gradient and keep
    going toward what the gradient average has stored over time. This smooths
    the trajectory instead of being bumpy at each batch: it follows the
    general trend.
    mom too small = too bumpy
    mom too high = not reactive to changes, late response to changes of the
        loss surface (acts on the information of many batches ago)
    The right value leads to a smoother, faster, more stable convergence.
    '''
    def __init__(self, params, lr, wd=0., mom=0.9):
        '''
        params, lr, wd: forwarded to `SGD` (wd is its weight decay)
        mom: weight of the previous average in the moving average
        '''
        super().__init__(params, lr, wd)
        self.mom = mom

    def opt_step(self, p):
        '''
        Update rule called by `SGD.step` for every parameter `p`.
        It has to be named `opt_step` and receive `p`, otherwise `SGD.step`
        never reaches it and the class falls back to plain SGD.
        '''
        if not hasattr(p, "grad_avg"): # the average is stored as an attribute on the tensor itself
            p.grad_avg = torch.zeros_like(p.grad)
        # what the trend was + this batch's gradient
        p.grad_avg = p.grad_avg*self.mom + p.grad * (1 - self.mom)
        p -= self.lr * p.grad_avg

In [None]:
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=1.5, cbs=cbs, opt_func=Momentum)

In [None]:
learn.fit(3)

In [None]:
astats.plot_all()

- We can afford a huge bump in lr because momentum lets us follow the loss surface in a smoother way.

- You want a small batch size because it gives more opportunities to update the weights.

- Momentum can be too aggressive for complex architectures; RMSProp is better for them (this would also make Adam a poor fit for complex architectures, since Adam uses momentum too).

# RMSProp

In [None]:
class RMSProp(SGD):
    '''
    SGD whose step is divided by a running measure of gradient size.

    A moving average of `p.grad**2` is kept for every weight. Its square root
    brings that measure back to the scale of the gradient, and the step is
        step = lr * (grad of current batch) / (sqrt(average) + eps)
    -> large average: small step, move slower
    -> small average: large step, move faster
    '''
    def __init__(self, params, lr, wd=0., mom=0.99, eps=1e-5):
        '''
        params, lr, wd: forwarded to `SGD`
        mom: weight of the previous value in the moving average of grad**2
        eps: added to the denominator so that it can never be exactly zero
        '''
        super().__init__(params, lr, wd)
        self.mom = mom
        self.eps = eps

    def opt_step(self, p):
        grad_sq = p.grad**2
        if not hasattr(p, "square_avg"):
            # Start from the first squared gradient rather than from zeros:
            # the average ends up in the denominator, and a zero start would
            # make the first updates very large.
            p.square_avg = grad_sq

        p.square_avg = p.square_avg*self.mom + grad_sq * (1 - self.mom)

        # The denominator can be a small number, which makes the ratio large;
        # this is why the learning rate has to be lower than for plain SGD,
        # otherwise the steps get too big and training diverges.
        denom = p.square_avg.sqrt() + self.eps
        p -= self.lr * p.grad / denom

In [None]:
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=cbs, opt_func=RMSProp)

In [None]:
learn.fit(3)

In [None]:
astats.plot_all()

# ADAM
RMSProp with momentum combined

In [None]:
class Adam(SGD):
    '''
    Momentum and RMSProp combined: a moving average of the gradient is
    divided by the square root of a moving average of the squared gradient,
    both corrected for their zero initialisation.
    '''
    def __init__(self, params, lr, wd=0., beta1=0.9, beta2=0.99, eps=1e-5):
        '''
        params, lr, wd: forwarded to `SGD`
        beta1: momentum of the gradient average
        beta2: momentum of the squared-gradient average (as in RMSProp)
        eps: added under the square root of the denominator
        '''
        super().__init__(params, lr, wd)
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

    def opt_step(self, p):
        # Both running averages live on the tensor and start at zero.
        for buffer_name in ('avg', 'sqr_avg'):
            if not hasattr(p, buffer_name):
                setattr(p, buffer_name, torch.zeros_like(p.grad.data))

        # 1-based count of the step in progress, used by the bias correction
        step_no = self.batch_number + 1

        # moving average of the gradient, then undo the bias toward zero
        p.avg = self.beta1*p.avg + (1-self.beta1)*p.grad
        unbias_avg = p.avg / (1 - (self.beta1**step_no))

        # moving average of the squared gradient, same correction
        p.sqr_avg = self.beta2*p.sqr_avg + (1-self.beta2)*(p.grad**2)
        unbias_sqr_avg = p.sqr_avg / (1 - (self.beta2**step_no))

        # scale the averaged gradient by the square root of the squared average
        denom = (unbias_sqr_avg + self.eps).sqrt()
        p -= self.lr * unbias_avg / denom

In [None]:
set_seed(2)
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=cbs, opt_func=Adam)

In [None]:
learn.fit(3)

In [None]:
astats.plot_all()

# Let's look at the content of a whole pytorch module: lr_scheduler

In [None]:
# Type `torch.optim.lr_scheduler.` and hit Tab to list the members of the module.
# Kept as a comment: the dangling attribute access is a SyntaxError that would stop "Run All".

In [None]:
torch.optim.lr_scheduler

In [None]:
dir(torch.optim.lr_scheduler)

In [None]:
# all things in torch.optim.lr_scheduler: valid attributes and methods belonging to an object
' '.join(o for o in dir(torch.optim.lr_scheduler) if o[0].isupper() and o[1].islower())

In [None]:
from IPython.display import Image
Image(filename="./schedulers.jpg")

These schedulers work with pytorch optimizers, so we have to use those since their API is a little different from the optimizers that we implemented.

# Pytorch optimizers API

In [None]:
# get optimizer by instanciating the learner using pyt.optim.sgd
set_seed(2)
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=[SingleBatchCB()])
learn.fit(1)

In [None]:
optimizer = learn.opt # fit model for 1 batch s.t. just to get its opt 
' '.join(o for o in dir(optimizer) if o[0]!='_') # amazing

In [None]:
attributes = [attr for attr in dir(optimizer) if not attr.startswith('__')]

# Differentiating between methods and attributes
methods = [attr for attr in attributes if callable(getattr(optimizer, attr)) and not attr.startswith('_')]
attributes = [attr for attr in attributes if not callable(getattr(optimizer, attr)) and not attr.startswith('_')]

print("optimizer Attributes:", attributes)
print("\n")
print("optimizer Methods:", methods)

In [None]:
def print_obj_API(obj):
    """Print the public API of `obj`, split into data attributes and methods.

    Names starting with an underscore are left out. A name counts as a method
    when the value `getattr` returns for it is callable.
    """
    methods, attributes = [], []
    for name in dir(obj):
        if name.startswith('__'):
            continue
        is_callable = callable(getattr(obj, name))
        if name.startswith('_'):
            continue
        (methods if is_callable else attributes).append(name)

    owner = type(obj).__name__
    print(f"{owner} Attributes:", attributes)
    print("\n")
    print(f"{owner} Methods:", methods)

In [None]:
optimizer

It contains "parameter groups"; in our case there is only one group, made up of all the parameters of our model.
Let's look at the parameter groups.

In [None]:
param = next(iter(learn.model.parameters())) # get param
state = optimizer.state[param] # get state of the param 
# the state of the param is contained in a dict k:param_vect, v:state

Now let's see this weird thing: a dict with as keys -> parameter tensors!

In [None]:
state # sgd momentum buffer

State is a dictionary that stores information related to a tensor; here, for example, it holds the moving average used for momentum. Conceptually it plays the role of a class data member, but PyTorch keeps it in this dictionary instead.

Optimizers handle parameters as parameter groups cuz u can change lr of particular groups ad-hoc.

In [None]:
print("number of param groups:", len(optimizer.param_groups))
pg = optimizer.param_groups[0] # the retrieved obj is a dict

we have only 1 param group

In [None]:
pg.keys() 

In [None]:
list(pg) #<- same as keys()

# Schedulers
We already implemented a scheduler in lecture 8a, so now we load the PyTorch ones and test them. A scheduler changes the learning rate of an optimizer during training.

In [None]:
import math

In [None]:
def getLr(lr_start, steps, i):
    """Cosine-annealed learning rate at step `i` out of `steps`.

    Equals `lr_start` at `i == 0` and reaches 0 at `i == steps`, following
    half a cosine wave in between.
    """
    cosine = math.cos(i*math.pi/steps)
    return lr_start/2 * (1 + cosine)


steps = 100
init_lr = 0.2
step = partial(getLr, init_lr, steps)

# Evaluate the schedule at i = 0..steps inclusive, so the curve starts at
# init_lr (once, not duplicated) and ends exactly at 0.
list_lrs = [step(i) for i in range(steps + 1)]

plt.plot(list_lrs)


In [None]:
class CosAnnealerScheduler():
    '''
    Cosine annealing of the learning rate of an optimizer.

    Every param group starts from the lr it has when the scheduler is built
    and follows half a cosine wave down to 0 over `max_steps` calls of
    `step()`:

        lr_i = lr_start/2 * (1 + cos(i*pi/max_steps))

    Steps beyond `max_steps` keep the lr at its final value.
    '''

    def __init__(self, optimizer, max_steps):
        '''
        optimizer: object exposing `param_groups`, a list of dicts with an
            "lr" entry (the PyTorch optimizer API)
        max_steps: number of `step()` calls over which the lr is annealed
        '''
        if max_steps <= 0:
            raise ValueError(f"max_steps must be positive, got {max_steps}")
        self.optimizer = optimizer
        self.max_steps = max_steps
        self.step_counter = 0
        # starting lr of every group; a list because each group can have its own
        self.base_lrs = [group["lr"] for group in optimizer.param_groups]

    def get_lr(self):
        '''Learning rates (one per param group) for the current step count.'''
        scale = (1 + math.cos(self.step_counter*math.pi/self.max_steps)) / 2
        return [base_lr * scale for base_lr in self.base_lrs]

    def get_last_lr(self):
        '''Learning rates currently set in the optimizer, one per param group.'''
        return [group["lr"] for group in self.optimizer.param_groups]

    def step(self):
        '''Advance the schedule by one step and write the new lrs into the optimizer.'''
        self.step_counter = min(self.step_counter + 1, self.max_steps)
        for group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            group["lr"] = lr

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR
sched = CosineAnnealingLR(optimizer, 100)

In [None]:
sched.base_lrs # got from the optimizer
#starting lr, list cuz different for each group

In [None]:
print_obj_API(sched)

In [None]:
sched.get_last_lr()

In [None]:
# let's visualize the annealing of the lr
def sched_lrs(sched, steps):
    """Step `sched` (and its optimizer) `steps` times and plot the learning rates.

    The value of `sched.get_last_lr()` is recorded before the first step and
    after every step; the last five recorded values are printed.
    """
    history = [sched.get_last_lr()]
    for _ in range(steps):
        # step the optimizer before the scheduler, mirroring a training loop
        sched.optimizer.step()
        sched.step()
        history.append(sched.get_last_lr())
    print("last 5 lrs: ", history[-5:])
    plt.plot(history)
    
sched_lrs(sched, 110) # goes up after 100 steps cuz cosine curve

CosineAnnealingLR is a scheduling technique that starts with a very large learning rate and then aggressively decreases it to a value near 0 before increasing the learning rate again.

Each time the “restart” occurs, we take the good weights from the previous “cycle” as the starting point. Thus, with each restart, the algorithm approaches the minimal loss closer.

In [None]:
sched.get_last_lr()

# Scheduler CB

In [None]:
class BaseSchedCB(Callback):
    '''
    Base callback that attaches a learning-rate scheduler to the learner's
    optimizer.

    scheduler: callable that receives the optimizer and returns the scheduler,
        typically a partial of a scheduler class with its own arguments bound

    eg: BaseSchedCB(partial(CosineAnnealingLR, T_max=val))

    The scheduler drives the optimizer; the optimizer knows nothing about the
    scheduler. Subclasses decide when `step` gets called.
    '''

    def __init__(self, scheduler):
        self.scheduler = scheduler

    def before_fit(self, learn):
        # The optimizer is read from the learner here, at the start of fit,
        # and used to instantiate the scheduler.
        self.scheduler_optimizer = self.scheduler(learn.opt)

    def step(self, learn):
        # Only advance the schedule while training.
        if not learn.training:
            return
        self.scheduler_optimizer.step()

In [None]:
class BatchSchedCB(BaseSchedCB):
    '''Scheduler callback that advances the schedule after every batch.'''

    def after_batch(self, learn):
        self.step(learn)

Since `BatchSchedCB` does not define its own `__init__`, it implicitly uses the `__init__()` of its superclass: the argument given at instantiation (`scheduler`) is forwarded to the base-class constructor. The same holds for any subclass that does not define `__init__`.  
On the other hand, if you provide your own `__init__()` method you __MUST__ call `super().__init__()` with the right arguments.

To see what the scheduler is doing we need access to the internals of the learner.
In particular, we need to record some values (for example the learning rate) after every batch.

In [None]:
# test
def f(**a):
    """Demo of `**kwargs`: show the dict, its keys, then each key/value pair."""
    print(a)  # the keyword arguments arrive as a dict

    keys = list(a)  # iterating a dict yields its keys
    print(keys)

    for pair in a.items():
        print(*pair)
        
f(asd="asd", foo="foo", lr=0.5)

In [None]:
class RecorderCB(Callback):
    '''
    Records values after every training batch.

    It takes keyword arguments where
        keyword = name of the thing we want to record
        value   = function that receives this callback and returns the
                  value to record
    '''
    def __init__(self, **d): # d: name -> getter function
        self.d = d

    def before_fit(self, learn):
        # start every fit with empty records
        self.recs = {name: [] for name in self.d}
        # only the first param group of the optimizer is looked at
        self.param_group = learn.opt.param_groups[0]

    def after_batch(self, learn):
        if learn.training:
            for name, getter in self.d.items():
                self.recs[name].append(getter(self))

    def plot(self):
        # one figure per recorded quantity
        for name, values in self.recs.items():
            plt.plot(values, label=name)
            plt.legend()
            plt.show()
        

In [None]:
def _lr(cb):
    """Getter for RecorderCB: the learning rate of the recorded param group."""
    group = cb.param_group  # cb is the RecorderCB instance
    return group["lr"]

We need to tell the scheduler the maximum number of `opt.step()` calls we are going to perform, so we take n_epochs * number_of_mini_batches_in_train_data.

In [None]:
n_epochs = 3
tmax = n_epochs * len(dls.train)
print(tmax)
scheduler = partial(CosineAnnealingLR, T_max=tmax)

In [None]:
set_seed(2)
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)

rec = RecorderCB(lr=_lr)
batchSched = BatchSchedCB(scheduler)

cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats, batchSched, rec]

learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=cbs, opt_func=optim.Adam)
learn.fit(3)

In [None]:
rec.plot()

In [None]:
pg = learn.opt.param_groups[0]
pg['lr']

In [None]:
batchSched.scheduler_optimizer.get_last_lr()

In [None]:
class EpochSchedCB(BaseSchedCB):
    '''
    Scheduler callback that advances the schedule after every epoch.

    print_lr: when true, print the scheduler's last learning rate(s) after
        each epoch
    '''
    def __init__(self, scheduler, print_lr=False):
        super().__init__(scheduler)
        self.print_lr = print_lr

    def after_epoch(self, learn):
        self.step(learn)
        if not self.print_lr:
            return
        print(self.scheduler_optimizer.get_last_lr())

In [None]:
n_epochs = 3
scheduler = partial(CosineAnnealingLR, T_max=n_epochs)

In [None]:
set_seed(2)
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)

rec = RecorderCB(lr=_lr)
epochSched = EpochSchedCB(scheduler, True)
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats, epochSched, rec]

learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=cbs, opt_func=optim.Adam)
learn.fit(n_epochs)

In [None]:
rec.plot()

In [None]:
epochSched.scheduler_optimizer.get_last_lr()

In [None]:
pg = learn.opt.param_groups[0]
pg['lr']

In [None]:
rec.recs

## CosineAnnealingLR visualization test

In [None]:
set_seed(2)
model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)
learn = TrainLearner(model, dls, F.cross_entropy, lr=3e-3, cbs=[SingleBatchCB()])
learn.fit(1)
optimizer = learn.opt # fit model for 1 batch s.t. just to get its opt 
sched = CosineAnnealingLR(optimizer, 3)
sched_lrs(sched, 3) 
sched.get_last_lr()

CosineAnnealingLR takes a T_max: the number of steps the scheduler expects to be performed.
Therefore CosineAnnealingLR also has an implicit "range" (even though it is implemented recursively): going from lr_start to lr=0 takes exactly T_max steps -> once the training is over the scheduler performs its "last step" and sets lr=0.0 because the annealer is exhausted.

# 1-cycle training

In [None]:
def _beta1(cb):
    """Getter for RecorderCB: Adam's momentum, the first entry of `betas`."""
    betas = cb.param_group['betas']
    return betas[0]

In [None]:
rec = RecorderCB(lr=_lr, mom=_beta1) 
# 2 things are being tracked -> 2 things will be plotted at rec.plot() 

## Idea behind the OneCycleLR:
The lr starts low, then rises, and finally goes back down.
The lr must start low because at the beginning the model is not perfectly initialized, so a low learning rate lets us start training without "derailing" off track.
While the lr is low, the main push comes from momentum -> if the weights keep moving in the same direction, even with a low lr, momentum pushes us toward the right "underlying" direction.
Then, once we get to the "right part" of the weight space, we can use a high lr; but with a high lr we must have very low momentum, otherwise we would jump around too much.
Finally, when we are close to convergence, we need to reduce the lr for fine-tuning.


The momentum does the opposite: it starts high, drops while the lr is high, and then rises again.

In [None]:
set_seed(42)
lr = 6e-2
n_epochs = 5
tmax = n_epochs * len(dls.train)

from torch.optim.lr_scheduler import OneCycleLR
scheduler = partial(OneCycleLR, total_steps=tmax, max_lr=lr)

model = get_model(general_leaky_relu, norm=nn.BatchNorm2d).apply(f_init_weights)

epochSched = BatchSchedCB(scheduler)
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats, epochSched, rec]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs, opt_func=optim.Adam)
learn.fit(n_epochs)

In [None]:
rec.plot()

In [None]:
# 2 cool things from https://github.com/danielegrattarola/GINR/blob/master/src/models/graph_inr.py
# The original snippet lives inside a module and reads `self.parameters()`,
# `self.lr` and `self.lr_patience`; here it is wired to this notebook's
# `model` and `lr` so that the cell actually runs.
LR_PATIENCE = 10  # epochs without improvement before the lr is halved (PyTorch's default patience)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# `verbose=True` from the original snippet is left out: the flag is deprecated
# in recent PyTorch releases; read the current lr with `scheduler.get_last_lr()`.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.5, patience=LR_PATIENCE
)