In [4]:
## Containers

## Modules

import torch 
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1  =nn.Conv2d(1,20,5)
        self.conv2 = nn.Conv2d(20,20,5)

    def forward(self,x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv1(x))
        return x
    
model = Model()

print(model)

dummy_input = torch.randn(1,1,28,28)

# output = model(dummy_input)

# print("Output shape:", output)

Model(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
)


In [9]:
## add_module

import torch
import torch.nn as nn

class DynamicModel(nn.Module):
    def __init__(self,input_size,hidden_sizes,output_size):
        super().__init__()

        self.add_module('input_layer',nn.Linear(input_size,hidden_sizes[0]))

        #Hidden Layers
        for i, hidden_size in enumerate(hidden_sizes):
            if i > 0:
                layer_name = f'hidden_layer_{i}'
                self.add_module(layer_name, nn.Linear(hidden_sizes[i-1], hidden_size))

        #output layer
        self.add_module('output_layer',nn.Linear(hidden_sizes[-1], output_size))

    def forward(self,x):
        for name, module in self.named_children():
            x = torch.relu(module(x))
        return x
    

input_size = 10

hidden_sizes = [20,30,40]

output_sizes = 5

model = DynamicModel(input_size, hidden_sizes, output_sizes)
print(model)

DynamicModel(
  (input_layer): Linear(in_features=10, out_features=20, bias=True)
  (hidden_layer_1): Linear(in_features=20, out_features=30, bias=True)
  (hidden_layer_2): Linear(in_features=30, out_features=40, bias=True)
  (output_layer): Linear(in_features=40, out_features=5, bias=True)
)


In [11]:
# add_module, adding predefined modules

import torch 
import torch.nn as nn

class CustomModel(nn.Module):
    def __init__(self):
        
        super().__init__()

        self.add_module('conv1', nn.Conv2d(1,32,kernel_size=3, stride=1, padding=1))

        sequential_layers = nn.Sequential(
            nn.Linear(32 * 28 , 128),
            nn.ReLU(),
            nn.Linear(128,10)
        )

        self.add_module('fc_layers', sequential_layers)

    def forward(self, x):
        x = torch.relu(self.conv1(x)) #apply ReLU after convolution
        x = x.view(x.size(0),-1)        #flatter the tensor
        x = self.fc_layers(x)           # pass through the sequential layers

#example usage
model = CustomModel()
print(model)

#test the model with dummy input
dummy_input = torch.randn(1,1,28,28)
# output = model(dummy_input)
# print("Output shape",output.shape)

CustomModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc_layers): Sequential(
    (0): Linear(in_features=896, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=10, bias=True)
  )
)


In [14]:
# apply(fn) Weight Initializatins

# use to initialize the weights of all layers in a neural network using a custom initialization function

import torch
import torch.nn as nn

#define a custom weight initialization function
def init_weights(m):
    if isinstance(m,nn.Linear): # check if the module is a LInear layer
        nn.init.xavier_uniform_(m.weight) #Xavier initializaiton for weights
        if m.bias is not None:
            nn.init.zeros_(m.bias) #zero-initializae biases

class FeedForwardNet(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        self.input_layer = nn.Linear(input_size, hidden_sizes[0])
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_sizes[i],hidden_sizes[i + 1]) for i in range(len(hidden_sizes) - 1)
        ])
        self.output_layer = nn.Linear(hidden_sizes[-1],output_size)
    def forward(self,x):
        x = torch.relu(self.input_layers(x))
        for layer in self.hidden_layers:
            x = torch.relu(layer(x))
        return self.output_layer(x)
    
input_size = 10
hidden_sizes = [20,30,40]
output_size = 5
model = FeedForwardNet(input_size, hidden_sizes, output_size)

#apply the weight initialization function to all layers
model.apply(init_weights)

#print the initialized weights of the first layer 
print("Initialized weights of input_layer:")
print(model.input_layer.weight)



Initialized weights of input_layer:
Parameter containing:
tensor([[-0.3189, -0.1001,  0.0435, -0.0555, -0.2206,  0.3877,  0.2900, -0.4394,
         -0.0062,  0.2300],
        [-0.3620, -0.1434,  0.2258, -0.1466,  0.2818, -0.2664,  0.2420, -0.1008,
          0.3838,  0.3238],
        [-0.3508, -0.4227, -0.0360,  0.0292, -0.4013,  0.2699, -0.1715,  0.1462,
          0.0721,  0.4111],
        [ 0.0515,  0.2634, -0.2497,  0.3960, -0.1172, -0.2831,  0.0980,  0.0502,
         -0.0728, -0.2099],
        [-0.4028, -0.2895, -0.2639, -0.4414,  0.3371, -0.0018,  0.3418, -0.2585,
         -0.0889,  0.2634],
        [-0.0484,  0.2885,  0.3568,  0.1521,  0.0667,  0.3111, -0.3108,  0.1520,
          0.2754,  0.2477],
        [ 0.1167, -0.2664, -0.2696, -0.3815, -0.0566,  0.2637,  0.2939, -0.2938,
          0.2794,  0.3037],
        [ 0.4436,  0.2564,  0.2436, -0.3032, -0.0301,  0.3401, -0.1926, -0.1543,
          0.1444, -0.1874],
        [-0.2768,  0.1304, -0.4417, -0.2970, -0.2713, -0.1505,  0.3054

In [16]:
#apply(fn) Freezing Layers

# use to freeze specific layers (disable gradient computation) in a model

import torch 
import torch.nn as nn

def freeze_layers(m):
    if isinstance(m, nn.Linear): #Freeze only Linear layers
        for param in m.parameters():
            param.requires_grad = False


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)
        self.fc3 = nn.Linear(30,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    

#create the model
model = SimpleModel()

#apply the freeze function to all layers
model.apply(freeze_layers)

for name,param in model.named_parameters():
    print(f"{name}: requires_grad = {param.requires_grad}")

fc1.weight: requires_grad = False
fc1.bias: requires_grad = False
fc2.weight: requires_grad = False
fc2.bias: requires_grad = False
fc3.weight: requires_grad = False
fc3.bias: requires_grad = False


In [17]:
#apply(fn) Logging Layer Informations

#log each layer information in the model

#define a loggin function

def log_layers(m):
    print(f"Layers: {m.__class__.__name__}, Parameters: {sum(p.numel() for p in m.parameters())}")

class ConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,32,kernel_size = 3, stride=1, padding = 1)
        self.conv2 = nn.Conv2d(1,32,kernel_size = 3, stride=1, padding = 1)
        self.fc1 = nn.Linear(64 * 7 * 7 , 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self,x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x,2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x,2)
        x = x.view(x.size(0) - 1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

model = ConvNet()
model.apply(log_layers)


"""

"""

Layers: Conv2d, Parameters: 320
Layers: Conv2d, Parameters: 320
Layers: Linear, Parameters: 401536
Layers: Linear, Parameters: 1290
Layers: ConvNet, Parameters: 403466


ConvNet(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [20]:
#bloat16()

# to cast all floating-point parameters and buffer of a model to bfloat16 data type.mro

# this is particularly supports bfloat16 ,suc as moden CPUs or TPUs

import torch 

import torch.nn as nn

#define a simple neural network
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)
        self.fc3 = nn.Linear(30,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
#create the model
model = SimpleModel()

#print the initial data types of the parameters
print('before bfloat16 conversion')
for name, param, in model.named_parameters():
    print(f"{name}:{param.dtype}")


#convert the model to bfloat16
model = model.bfloat16()

#print the data types after the conversion
print("\nAfter bfloat16 conversion:")
for name, param in model.named_parameters():
    print(f"{name}:{param.dtype}")

#test the model with dummy input in bfloat16
dummy_input = torch.rand(1,10,dtype=torch.bfloat16) #input in bfloat16
output = model(dummy_input)
print("\n Output shape", output.shape)
print("Output dtype:", output.dtype)


before bfloat16 conversion
fc1.weight:torch.float32
fc1.bias:torch.float32
fc2.weight:torch.float32
fc2.bias:torch.float32
fc3.weight:torch.float32
fc3.bias:torch.float32

After bfloat16 conversion:
fc1.weight:torch.bfloat16
fc1.bias:torch.bfloat16
fc2.weight:torch.bfloat16
fc2.bias:torch.bfloat16
fc3.weight:torch.bfloat16
fc3.bias:torch.bfloat16

 Output shape torch.Size([1, 5])
Output dtype: torch.bfloat16


In [22]:
#buffer(recursive = True)

# used to iterate over all buffers of a module (and optionally its submodules) . Buffers are tensors that are part fo the module but are not trainable parameters.
# Example include running statistics in BatchNorm layers or fixed embeddings

import torch 
import torch.nn as nn

#define a model with buffers
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #trainable parameters
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)

        #registering buffers
        self.register_buffer("running mean", torch.zeros(20))
        self.register_buffer("fixed_embeddings", torch.randn(5,10))

        #submodule with its own buffer
        self.submodule = nn.BatchNorm1d(30)

    def forwar(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.submodule(x)
    

model = MyModel()

print("all buffers (recurse = True):")
for buf in model.buffers(recurse = True):
    print(type(buf), buf.size())

#interate over only direct buffers (recurse = False)
for buf in model.buffers(recurse = False):
    print(type(buf),buf.size())

all buffers (recurse = True):
<class 'torch.Tensor'> torch.Size([20])
<class 'torch.Tensor'> torch.Size([5, 10])
<class 'torch.Tensor'> torch.Size([30])
<class 'torch.Tensor'> torch.Size([30])
<class 'torch.Tensor'> torch.Size([])
<class 'torch.Tensor'> torch.Size([20])
<class 'torch.Tensor'> torch.Size([5, 10])


In [26]:
# compile() 

# powerful feature introduced to optimize the execution of models by leveraging advanced compiler techniques
# compiles forward method of a torch.nn.Module using torch.compile() which can significantly imporve performance by optmizing the model computation graph

import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)
        self.fc3 = nn.Linear(30,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

#create model and move it to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleModel().to(device)

#compile the model
compiled_model = torch.compile(model) #equivalent to calling model.compile

dummy_input = torch.randn(16,10,device = device)

output = compiled_model(dummy_input)

print("Output shape:", output.shape)

Output shape: torch.Size([16, 5])


In [27]:
# compile() Customization of compile

compiled_model = torch.compile(
    model,
    backend = "inductor",
    fullgraph=True,
    dynamic=False
)

output = compiled_model(dummy_input)
print("Output shape:", output.shape)

Output shape: torch.Size([16, 5])


In [28]:
#cuda(device = None)

#used to move al model parameters and buffer to a GPU for a sepcifc GPU if multiple are avaiable.
# important when you want to leverage GPU accelration for trainng or inference

import torch 
import torch.nn as nn 

#define a simple neural network
class SimpleMethod(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)
        self.fc3 = nn.Linear(30,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
model = SimpleModel()

#check the initial device of the model's parameters
print("Initial device of parameters")

for name , param in model.named_parameters():
    print(f"{name}:{param.device}")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#verify that the model's parameters are now on the GPU
print("\n Device of parameters after moving to GPU:")
#move the model to the GPU
for name, param in model.named_parameters():
    print(f"{name}:{param.device}")

#create dummy input data and move it to the same device
dummy_input = torch.rand(16,10).to(device)

output = model(dummy_input)
print("\n Output shape:", output.shape)
print("Output device", output.device)
    





Initial device of parameters
fc1.weight:cpu
fc1.bias:cpu
fc2.weight:cpu
fc2.bias:cpu
fc3.weight:cpu
fc3.bias:cpu

 Device of parameters after moving to GPU:
fc1.weight:cpu
fc1.bias:cpu
fc2.weight:cpu
fc2.bias:cpu
fc3.weight:cpu
fc3.bias:cpu

 Output shape: torch.Size([16, 5])
Output device cpu


In [30]:
#double cast all floating-point paramets and buffer of a model to the torch.float64 

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)
        self.fc3 = nn.Linear(30,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
model = SimpleModel()

print("Before double conversation")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}")

#convert the model to double precision
model = model.double()

print("\n After double conversation:")
for name, param in model.named_parameters():
    print(f"{name}:{param.dtype}")

#test the model with dummy input in double precision
dummy_input = torch.randn(1,10, dtype=torch.float64) #input in double precision
output = model(dummy_input)

print("\n OUtput shape:", output.shape)
print("Output dtype:", output.dtype)

Before double conversation
fc1.weight: torch.float32
fc1.bias: torch.float32
fc2.weight: torch.float32
fc2.bias: torch.float32
fc3.weight: torch.float32
fc3.bias: torch.float32

 After double conversation:
fc1.weight:torch.float64
fc1.bias:torch.float64
fc2.weight:torch.float64
fc2.bias:torch.float64
fc3.weight:torch.float64
fc3.bias:torch.float64

 OUtput shape: torch.Size([1, 5])
Output dtype: torch.float64


In [33]:
#eval()

# used to set model into evaluatio mode. important for certain layers or modules that behave diferently during raining and evaliatons
# for example dropout, randomly drops units during raining but does not drop any units during evaluation
# also batchnorm uses running statistics (mean and variance) during evaluation instead of batch statistics

import torch 
import torch.nn as nn

#define a model with dropout and batchnorm layers
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.dropout = nn.Dropout(p = 0.5)
        self.bn = nn.BatchNorm1d(20)
        self.fc2 = nn.Linear(20,5)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.bn(x)
        return self.fc2(x)
    
model = MyModel()

model.train()
print("Model in training mode:")
print(model)

#perform a forward pass with training mode
dummy_input = torch.randn(4,10)
output_train = model(dummy_input)
print("\n Outout in training mode:", output_train)

#set the model to evaluation mode
model.eval()
print("\n Model in evaluation mode:")
print(model)

#perform a forward pass with evaluatin mode
with torch.no_grad():
    output_eval = model(dummy_input)
print("\n Output in evaluation mode:", output_eval)


Model in training mode:
MyModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (bn): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)

 Outout in training mode: tensor([[-0.3455, -0.6189,  0.7044, -1.5280,  0.4210],
        [-0.9519, -0.2885,  0.3518,  1.1750,  0.5380],
        [-0.1468,  0.4640, -0.1338, -0.0340, -0.7176],
        [ 0.6771,  0.6091, -0.5829,  0.2539, -0.6382]],
       grad_fn=<AddmmBackward0>)

 Model in evaluation mode:
MyModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (bn): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)

 Output in evaluation mode: tensor([[-0.1544,  0.3611, -0.0028, -0.3084, -0.1717],
        [-0.1890,  0.0386,  0.1557,  0.0462, -0.0052],
      

In [34]:
#extra_repr
#override extra_repr methdinclude information about buffer 

import torch 
import torch.nn as nn 

#define a model with buffer
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #trainable parameters
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,30)

        #registering buffers
        self.register_buffer("running_mean", torch.zeros(20))
        self.register_buffer("fixed_embeddings", torch.randn(5,10))
        self.submodule = nn.BatchNorm1d(30)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.submodule(x)
    
    def extra_repr(self):
        return f"running mean: {self.running_mean.size()}, fixed_embeddings: {self.fixed_embeddings.size()}"

#instantiate the model
model = MyModel()
# print the model to see the extra represenation
print(model)
    
""""
this method is overriden to return a strin that provides additional information about the module,

it returns the sizes of the running_mean and fixed_embeddings buffers

"""

MyModel(
  running mean: torch.Size([20]), fixed_embeddings: torch.Size([5, 10])
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=30, bias=True)
  (submodule): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [36]:
#float 

import torch 
import torch.nn as nn

#define a simpel model with paraemters and buffers

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        #trainable parameter (default is float32)
        self.weight = nn.Parameter(torch.randn(3,3))

        #buffer (default is float32)
        self.register_buffer("running_mean", torch.zeros(3))

    def forward(self,x):
        return x @ self.weight + self.running_mean
    
model = MyModel()

#print the datatype of parameters and buffers before casting
print("Before float():")
print("Weight datatype:", model.weight.dtype)

print("Runnign mean datatype:", model.running_mean.dtype)

#cast all parameters and buffers to float (32-bit floating point)
model.float()

#print the datatype of parameters and buffers after casting
print("\n After float():")
print("Weight datatype:", model.weight.dtype)
print("Running mean datatype", model.running_mean.dtype)


model.double() #cast double precision
print("\n After float() again:")
print("Weight datatpye:", model.weight.dtype)
print("running mean datatype:", model.running_mean.dtype)

# casts all floating point parameter and buffer of the mode to the float datatpye torch.float32 

# dtyoe attrubte fo tensor indicates datatypes, by default pytorch uses toch.float32 

# float() double() is a in-place modification so no need to reassign the results.




Before float():
Weight datatype: torch.float32
Runnign mean datatype: torch.float32

 After float():
Weight datatype: torch.float32
Running mean datatype torch.float32

 After float() again:
Weight datatpye: torch.float64
running mean datatype: torch.float64


In [38]:
#forward 

#you should call model(x) directly instead of caling method.forward()
# this ensure that all any register hoooks (pre-forward , post-forward ) are executed. 

import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20) #fully connected layer 1
        self.fc2 = nn.Linear(20,10) # fully connected layer 2
        self.relu = nn.ReLU() #activation function

    def forward(self,x):
        #define the foward pass
        x = self.fc1(x) #pass input throgu the first layer
        x = self.relu(x) #apply ReLU activation 
        x = self.fc2(x) #pass through the second layer 
        return x 
    
model = MyModel()

#craete a random input tensor
input_tensor = torch.randn(5,10)
# batch of 5 samples , each with 10 features

#perform forward pass by calling model
output = model(input_tensor)

print("input tensor shape", input_tensor.shape)
print("Output tensor shape:", output.shape)
print("Output tensor:\n", output)


input tensor shape torch.Size([5, 10])
Output tensor shape: torch.Size([5, 10])
Output tensor:
 tensor([[-0.3420,  0.0117,  0.1513, -0.1669,  0.2344, -0.3389, -0.1194, -0.0338,
         -0.0648,  0.0806],
        [-0.0349,  0.1871,  0.2086, -0.0350,  0.3173, -0.3457, -0.0485, -0.0875,
         -0.2528,  0.0297],
        [-0.1342,  0.1871,  0.2244, -0.2599,  0.1047, -0.1203, -0.0630, -0.2873,
          0.1868,  0.4255],
        [-0.0844,  0.2295,  0.1941, -0.0726,  0.1734, -0.4147, -0.2767, -0.2628,
         -0.1395,  0.1654],
        [-0.2090,  0.2179,  0.0762,  0.2524,  0.3752,  0.1334,  0.0876, -0.1751,
         -0.0287,  0.1636]], grad_fn=<AddmmBackward0>)


In [None]:
#get_buffer(target)

# retrieve a specific buffer from a module using fully-qualified string name. 
# buffer are non-trainable tensors that are part of the module's state such as runnig statistics in BatchNorm

import torch 
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #register buffers
        self.register_buffer("running_mean", torch.zeros(10))

        self.register_buffer("fixed_embeddings", torch.randn(5,10))

        # add a submodule with its own buffer 
        self.submodule = nn.BatchNorm1d(10)
        self.submodule.register_buffer("running_var", torch.ones(10)) # buffer in submodule

    def forward(self,x):
        return self.submodule(x)
    
#instantiate the model
model = MyModel()

try:
    #get the  running_mean buffer from the main module
    running_mean = model.get_buffer("running_mean")
    print("Retrieved 'running_mean' buffer:")
    print(running_mean)

    #get the fixed_embeddigngs buffer fromt eh main module
    fixed_embeddings = model.get_buffer("fixed_embeddings")
    print("\nRetrieved 'fixed_embeddings' buffer:")
    print(fixed_embeddings)

    #get the running_var buffer from the submodule

    running_var = model.get_buffer("submodule.running_var")
    print("\nRetrieved 'submodule.running_var' buffer:")
    print(running_var)

    #attempt to get a non-existent buffer (will raiase error)
    non_existent_buffer = model.get_buffer("non_existent_buffer")

except AttributeError as e:
    print(f"\nError: {e}")


Retrieved 'running_mean' buffer:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Retrieved 'fixed_embeddings' buffer:
tensor([[-0.0379, -0.1254, -0.2757, -0.8203,  2.3026, -0.7584, -0.7222, -0.3929,
          1.5857, -0.5442],
        [ 0.8620,  0.5481, -0.0808, -1.3729, -0.0415,  0.2268,  1.9648,  0.1972,
          0.1739, -0.7899],
        [-1.8245,  0.7981,  0.7574,  0.3576, -0.0595,  0.8876,  0.1681,  1.2019,
         -0.3573, -1.1934],
        [ 0.3252,  1.9988,  1.9587, -0.1001, -0.7791, -0.1302, -0.7127, -1.3451,
          0.0548,  0.5923],
        [ 0.4716,  1.1301,  0.8396, -0.0492,  0.6691,  1.9465, -0.1013,  0.0844,
         -1.1672, -1.7064]])

Retrieved 'submodule.running_var' buffer:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

Error: MyModel has no attribute `non_existent_buffer`


In [40]:
#get_extra_sate() 

#return any additional state that should be included in module's state_dict

import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #trainable parameter
        self.weight = nn.Parameter(torch.randn(3,3))

        #extra state (non-tensor object)
        self.extra_state = {"description":"this is extra state","version": 1.0}

    def forward(self, x):
        return x @ self.weight 

    def get_extra_state(self):
        #return the extra state to be saved in the state_dict
        return self.extra_state
    
    def set_extra_state(self,state):
        self.extra_state = state 

model = MyModel()

# here we print the model's initial extra state
print("Initial extra state:", model.extra_state)

state_dict = model.state_dict()

#modify the extra state 
model.extra_state['version'] = 2.0 
print('\n Modified extra state:', model.extra_state)

#load the saved state_dict (restoring the original extra state)
model.load_state_dict(state_dict)

#print the restored extra state
print('\n Restored extra state:', model.extra_state)






Initial extra state: {'description': 'this is extra state', 'version': 1.0}

 Modified extra state: {'description': 'this is extra state', 'version': 2.0}

 Restored extra state: {'description': 'this is extra state', 'version': 2.0}


In [42]:
#get_parameter(target)

# retrieve specific parameter from a module using fully-qualified string name. 
# parameters are trainable tensors that are part of the module's state

import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #define trainable parameters 
        self.weight = nn.Parameter(torch.randn(3,3)) #parameter 1
        self.bias = nn.Parameter(torch.zeros(3)) # parameter 2

        #add a subodule with its own parameter
        self.submodule = nn.Linear(3,3)

    def forward(self,x):
        return self.submodule(x @ self.weight + self.bias)
    
#instantiate the model
model = MyModel()

try:
    #get the weight parameter from the main module
    weight = model.get_parameter("weight")
    print("Retrieved 'weight parameter:")
    print(weight)

    #get the bias parameter from the main module 
    bias = model.get_parameter('bias')
    print('\n Retrieved bias parameter:')
    print(bias)

    #get tje 'weight' parameter from the submodule 
    submodule_weight = model.get_parameter('submodule.weight')
    print('\nRetrieved submodule.weight parameter:')
    print(submodule_weight)

    non_existent_param = model.get_parameter('non_existet_param')
except AttributeError as e:
    print(f'\nError: {e}')




Retrieved 'weight parameter:
Parameter containing:
tensor([[ 0.7960,  1.2377, -0.7411],
        [-0.1748, -2.5137, -0.2854],
        [ 0.8596,  0.8421, -0.4042]], requires_grad=True)

 Retrieved bias parameter:
Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

Retrieved submodule.weight parameter:
Parameter containing:
tensor([[ 0.5092,  0.0822,  0.1512],
        [ 0.2763,  0.5701, -0.4478],
        [-0.2466, -0.5344, -0.0660]], requires_grad=True)

Error: MyModel has no attribute `non_existet_param`


In [None]:

#get_submodule 

# used to retrieve a specific submodule from a module usign a fully-qualified string name

# useful when working with deeply nested modules , as it aalows to directly access a submodule wihtout iterating thorugh the entire module hierarchy
import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.net_b = nn.Sequential(
            nn.Conv2d(16, 33, kernel_size=(3, 3)),  # Submodule 1
            nn.ReLU(),                             # Submodule 2
            nn.MaxPool2d(kernel_size=(2, 2))       # Submodule 3
        )

        self.linear = nn.Linear(100,200)

    def forward(self,x):
        x = self.net_b(x)
        x = x.view(x.size(0), -1) #flatten the tensor
        x = self.liner(x)
        return x 
    
model = MyModel()

try:
    net_b = model.get_submodule('net_b')
    print('Retrieved net_b submodule:')
    print(net_b)

    linear = model.get_submodule('linear')
    print('\nRetrieved linear submodule')
    print(linear)

    #get the first layer of 'net_b' (conv2d)
    conv_layer = model.get_submodule('net_b.0')
    print('\n Retreived net_b.0 submodule (Conv2d):')
    print(conv_layer)

    non_existent_submodule = model.get_submodule("non_existent_submodule")

except AttributeError as e:
    print(f'\nError: {e}')


Retrieved net_b submodule:
Sequential(
  (0): Conv2d(16, 33, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)

Retrieved linear submodule
Linear(in_features=100, out_features=200, bias=True)

 Retreived net_b.0 submodule (Conv2d):
Conv2d(16, 33, kernel_size=(3, 3), stride=(1, 1))

Error: MyModel has no attribute `non_existent_submodule`


In [47]:
#half()

#used to cast all floating-point parameters and buffers of a module to the half dataype
# particularly useful when working with models that need to use half-precision floating-point numbers such as when optimizing for memory usage or when

import torch 
import torch.nn as nn 

#define a simple model with parameters and buffers
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        #trainable parameter (default is float32)
        self.weight = nn.Parameter(torch.randn(3,3))

        self.register_buffer('running_mean',torch.zeros(3))
    def forward(self,x):
        return x @ self.weight + self.running_mean
    
model = MyModel()

print('Before half():')
print('Weight datatype:', model.weight.dtype)
print('Running mean datatype', model.running_mean.dtype) #should be torch.float32

#cast all parameters and buffers to half (16-bit floating point)
model.half()

#print the datatpye of parameters and buffer after casting 
print('\n After half():')
print('Weight datatype:', model.weight.dtype)
print('Running mean datatype:', model.running_mean.dtype)

#example with double precision 64-bit floating point
model.double()
print('\nAfter double():')
print('Wegiht datatype:', model.weight.dtype)
print('Running mean datatype:', model.running_mean.dtype)

#cast back to half 16-bit floating point
model.half()
print('\n after half() again:')
print('weight datatype:', model.weight.dtype)
print('running mean datatype', model.running_mean.dtype)

Before half():
Weight datatype: torch.float32
Running mean datatype torch.float32

 After half():
Weight datatype: torch.float16
Running mean datatype: torch.float16

After double():
Wegiht datatype: torch.float64
Running mean datatype: torch.float64

 after half() again:
weight datatype: torch.float16
running mean datatype torch.float16


In [49]:
#ipu(device = None)

# method specific to Graphcore's IPU (intelligence processing unit) hardware
# used to move al model parameters and buffers ot the IPU device
# moving models to CPUs or GPUs , but specific for IPU hardware

import torch
import torch.nn as nn
import poptorch  # Graphcore's PopTorch library for IPU support

# Define a simple model
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
model = MyModel()

# Print the device of model parameters before moving to IPU
print("Before moving to IPU:")
for name, param in model.named_parameters():
    print(f"{name} is on device: {param.device}")

# Move the model to IPU
model.ipu()  # Move all parameters and buffers to IPU

# Print the device of model parameters after moving to IPU
print("\nAfter moving to IPU:")
for name, param in model.named_parameters():
    print(f"{name} is on device: {param.device}")

# Optional: Specify a specific IPU device
model.ipu(device=1)  # Move to IPU device 1

# Print the device of model parameters after moving to a specific IPU device
print("\nAfter moving to IPU device 1:")
for name, param in model.named_parameters():
    print(f"{name} is on device: {param.device}")

ModuleNotFoundError: No module named 'poptorch'

In [50]:
#load_state_dict(state_dict, strict = True, assign =False)

#used to load the state of a model (parameters and buffers) from a dictionary state_dict

#commonly used to load pretrained model


import torch
import torch.nn as nn

# Define a simple model
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
model = MyModel()

# Save the model's state_dict (simulating a checkpoint)
checkpoint = model.state_dict()

# Modify the model (e.g., add a new layer)
model.fc3 = nn.Linear(10, 5)

# Attempt to load the saved state_dict into the modified model
try:
    # Load with strict=True (default)
    model.load_state_dict(checkpoint, strict=True)
except RuntimeError as e:
    print(f"Error when strict=True: {e}")

# Load with strict=False (ignore missing or unexpected keys)
result = model.load_state_dict(checkpoint, strict=False)
print("\nLoading with strict=False:")
print("Missing keys:", result.missing_keys)  # Keys in the model but not in the checkpoint
print("Unexpected keys:", result.unexpected_keys)  # Keys in the checkpoint but not in the model

# Example with assign=True
# Create a new model and load the checkpoint with assign=True
new_model = MyModel()
new_model.load_state_dict(checkpoint, assign=True)

# Verify that the parameters are loaded correctly
print("\nParameters after loading with assign=True:")
for name, param in new_model.named_parameters():
    print(f"{name}: {param}")

Error when strict=True: Error(s) in loading state_dict for MyModel:
	Missing key(s) in state_dict: "fc3.weight", "fc3.bias". 

Loading with strict=False:
Missing keys: ['fc3.weight', 'fc3.bias']
Unexpected keys: []

Parameters after loading with assign=True:
fc1.weight: Parameter containing:
tensor([[ 9.3234e-02, -1.0406e-01,  2.0072e-01,  1.0068e-01, -2.6003e-01,
         -6.9717e-02, -1.3650e-01,  3.0242e-01, -5.5836e-02,  1.6532e-01],
        [ 2.0841e-01, -2.1912e-01,  1.9119e-01, -4.9851e-02,  5.7902e-02,
         -1.9379e-02,  2.4167e-01, -1.0543e-01, -1.3502e-01,  7.4048e-02],
        [-1.5762e-01, -2.3877e-01, -2.5111e-01,  2.1742e-01, -2.3223e-01,
         -2.0470e-01,  1.5878e-02, -9.4169e-02, -6.3247e-02, -5.5704e-02],
        [ 1.6315e-04, -1.3842e-01, -1.3518e-01, -7.3822e-02,  2.1311e-01,
          1.4191e-01,  4.4531e-02, -1.4384e-01, -1.2044e-01,  1.2828e-01],
        [-2.4574e-01, -2.9267e-01, -1.1814e-01, -2.8761e-01,  2.9364e-01,
          9.9065e-02,  3.1010e-01,  2

In [55]:
#modules() return an interator over all module sin the network, include top-level module and its submodule

#useful for inspecting or modifying all parts of a model. 

import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        
        super().__init__()
        #define layers 
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,10)

        #define a sequantial submodule
        self.submodule = nn.Sequential(
            nn.Conv2d(1,16,kernel_size =3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
model = MyModel()

#iterative over all modules in the network 
print('All modules in the network:')
for idx, module in enumerate(model.modules()):
    print(f"{idx} -> {module}")

#example with shared layers 
shared_layer = nn.Linear(2,2)
net = nn.Sequential(shared_layer, shared_layer)

print('\nModules in a network with a shared layers:')
for ifx , module in enumerate(net.modules()):
    print(f'{idx}->{module}')


All modules in the network:
0 -> MyModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (submodule): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
)
1 -> Linear(in_features=10, out_features=20, bias=True)
2 -> Linear(in_features=20, out_features=10, bias=True)
3 -> Sequential(
  (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
4 -> Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
5 -> ReLU()
6 -> MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)

Modules in a network with a shared layers:
6->Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
6->Linear(in_features=2, out_features=2, bias

In [None]:
#tia(device = None)

# specific to Meta' MTIA (Meta Training and Inference Accelerator) hardware 

# used to move all model parameters and buffers to the MTIA device .index
# analogous to the `to(device)` method used for moving models to CPUs or GPUs but its tailored for MTIA hardware

# since MITA-speficic funtionality is not standard PyTorck library , and require Meta's epcilaized framework. so no code will be generated

In [56]:
# named_buffer(prefix ='', recurse = True, remove_duplicate = True)

# used to iterate oer all buffers in a module , yielding both the name of the buffer in a module , yileding both the name of the niffer and the bffer itself.

# buffers are non trainable tenosrs that are part of the module's state such as runnig statistics in BatchNorm layers or fixed embeddings 


import torch 
import torch.nn as nn

class Mymode(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("running_mean", torch.zeros(10)) #buffer 1
        self.register_buffer('fixed_embeddings',torch.randn(5,10))  #buffer 2

        #add a submodule with it own buffer 
        self.submodule = nn.BatchNorm1d(10)
        self.submodule.register_buffer('running_var', torch.ones(10)) #bufer in submodule 

    def forward(self,x):
        return self.submodule()
    
model = MyModel()

#interate over all buffers in the model
print('All buffers in the model:')
for name, buf in model.named_buffers():
    print(f"{name}:{buf.size()}")

#iterate over only direct buffers (recurse = False)
print('\nDirect buffers in the model (recurse = False):')

for name, buf in model.named_buffers(recurse = False):
    print(f"{name}: {buf.size()}")

print('\n Accessing the running__var buffer in the submodule:')
for name , buf in model.named_buffers(recurse = False):
    print(f'{name}:{buf.size()}')

#iterate over buffers with a custom prefix 
print('\nBuffers iwth prefix model.')
for name, buf in model.named_buffers(prefix = 'model'):
    print(f'{name}:{buf.size()}')

#access a specific buffer by name 
print('\n Accessing the running_var buffer in the submodule')
for name, buf in model.named_buffers():
    if name == "submodule.running_var":
        print(f"{name}: {buf}")




All buffers in the model:

Direct buffers in the model (recurse = False):

 Accessing the running__var buffer in the submodule:

Buffers iwth prefix model.

 Accessing the running_var buffer in the submodule


In [None]:
# named_children

https://chat.deepseek.com/a/chat/s/47c58dc4-c246-4404-b900-8cfadc9bbee8

https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module

In [2]:
#named_children 

#to iterate over the immediate modules of a model, yield both name of each child module and the module itself 

# useful to inspect or manipulate specifc submoduels ofa model by their naems

import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #immediate child modules
        self.fc1 = nn.Linear(10,20) #fully connected layer 1
        self.fc2 = nn.Linear(20,30)  #fully connected layer 2

        #submodule (nested module)

        self.submodule = nn.Sequential(
            nn.Conv2d(1,32, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)
        )

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        return x

model = MyModel()

#iterate over immediate child modules using named_children()

print("Immedate child module with names:")
for name , child in model.named_children():
    print(f"{name}: {child}")

# example , access specific child modules by name
print("\n Accessing specific child modules:")
for name, child in model.named_children():
    if name in ['fc1','submodule']:
        print(f"Found {name}: {child}")


Immedate child module with names:
fc1: Linear(in_features=10, out_features=20, bias=True)
fc2: Linear(in_features=20, out_features=30, bias=True)
submodule: Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

 Accessing specific child modules:
Found fc1: Linear(in_features=10, out_features=20, bias=True)
Found submodule: Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)


In [5]:
#named_modules 

#used to recursively iterate over all modules in a model
# yield both the name of each moduel and the module itself, inclide its immediate children all nested submodules

import torch 
import torch.nn as nn 

#define a model with multiple layers and submodules
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()

        #immediate child modules
        self.fc1 = nn.Linear(10,20) #fully connected layer
        self.fc2 = nn.Linear(20,30) #fully connected layer

        #submodule (nested module)
        self.submodule = nn.Sequential(
            nn.Conv2d(1,32,kernel_size=3,stride =1,padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)
        )

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x 

model = MyModel()

#iterate over al modules using named_modules()
print("All modules with names:")
for name, module in model.named_modules():
    print(f"{name}:{module}")

#model.named_modules iterates over all modules , including the model itself its immediate children and all nested submodules





All modules with names:
:MyModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=30, bias=True)
  (submodule): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
)
fc1:Linear(in_features=10, out_features=20, bias=True)
fc2:Linear(in_features=20, out_features=30, bias=True)
submodule:Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
submodule.0:Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
submodule.1:ReLU()
submodule.2:MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)


In [6]:
#named_parameters(prefix='',recurse=True,remove_duplicate=True)

#return an interator over module parameters, yielding both the name the parameters as well as the parameter itself

import torch 
import torch.nn as nn 

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        #immediate child modules
        self.fc1 = nn.Linear(10,20) #fully connected layer 1
        self.fc2 = nn.Linear(20,30) #fully connected layer 2

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x 
    
model = MyModel()

#iterate over all parameters using named_parameters()
print("All parameters with names:")
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

#example : access specific parameters by name 
print("\n Accessing specific parmeters:")
for name, param in model.named_parameters():
    if 'bias' in name:
        print(f"Found bias {name}, Size: {param.size()}")


All parameters with names:
fc1.weight: torch.Size([20, 10])
fc1.bias: torch.Size([20])
fc2.weight: torch.Size([30, 20])
fc2.bias: torch.Size([30])

 Accessing specific parmeters:
Found bias fc1.bias, Size: torch.Size([20])
Found bias fc2.bias, Size: torch.Size([30])


In [8]:
#parameters(recurse = True)

import torch 
import torch.nn as nn 


class MyModl(nn.Module):
    def __init__(self):
        super().__init__()
        #immediate child modules
        self.fc1 = nn.Linear(10,20) #fully connected layer 1
        self.fc2 = nn.Linear(20,20) #fully connected layer 2

        #submodule (nested module)
        self.submodule = nn.Sequential(
            nn.Conv2d(1,32,kernel_size=3, stride=1,padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size =2 )
        )

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x 
    
model = MyModel()

print("All parameters")
for param in model.parameters():
    print(f"Type : {type(param)}, Size: {param.size()}")

#example : passing parameters to an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print('\n Optimizer parameters:')
for param_group in optimizer.param_groups:
    for param in param_group['params']:
        print(f"type : {type(param)}, Size: {param.size()}")

All parameters
Type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([20, 10])
Type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([20])
Type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([30, 20])
Type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([30])

 Optimizer parameters:
type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([20, 10])
type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([20])
type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([30, 20])
type : <class 'torch.nn.parameter.Parameter'>, Size: torch.Size([30])


In [10]:
#register_full_backward deprecated use register_full_backward_hook()

# hook trigered during backward pass , allow to inspect or modify gradients

import torch 
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3,1)

    def forward(self,x):
        return self.fc(x)
    
#define a full backward hook function 
def full_backward_hook(module, grad_input,grad_output):
    print("Full Backward Hook:")
    print(f"Module : {module}")
    print(f"Gradient Inpit: {grad_input}")
    print(f"Gradient Output: {grad_output}")

# create the model
model = SimpleModel()

#register the full backward hook on the fully conntected layer
hook_handle = model.fc.register_full_backward_hook(full_backward_hook)

#foward pass 
x = torch.tensor([[1.0,2.0,3.0]], requires_grad = True)
output = model(x)
output.backward()

hook_handle.remove()

Full Backward Hook:
Module : Linear(in_features=3, out_features=1, bias=True)
Gradient Inpit: (tensor([[-0.5285,  0.1666,  0.3691]]),)
Gradient Output: (tensor([[1.]]),)


In [16]:
#register_buffer() 

# used to register a tensor as a buffer in a module. 
# unlike parameters buffer are not trainable bu they are part of the module;s state and will be saved in state_dict
# buffer use to run statistics in BatchNorm layers or fixed embeddings 

import torch 
import torch.nn as nn 

#define a model with custom buffers 
class MyModel(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        #register a buffer for running mean 
        self.register_buffer('running_mean', torch.zeros(num_features)) #parameter for running mean
        self.register_buffer('fixed_embedding', torch.randn(num_features,10)) #parameter for fixed embeddings
        self.weight = nn.Parameter(torch.randn(10, 5)) #parameter for comparison

    def forward(self,x):
        #use the running mean buffer in the forward pass
        x = x - self.running_mean.unsqueeze(0) #subtract running mean from input
        x = x @ self.fixed_embedding # matrix multiplication
        #use the trainable weight parameter
        return x @ self.weight 

#create the model
num_features = 20
model = MyModel(num_features)

#print the mode state_duct to see the registered buffer and parameters
print('Model state_dict')
for name, param in model.state_dict().items():
    print(f"{name}:{param.size()}")

#access the buffers as attributes
print('\nAccessing buffers:')
print('running_mean:', model.running_mean)
print('fixed_embeddings:', model.fixed_embedding)

#perform a foward pass
dummy_input = torch.randn(4, num_features)
output = model(dummy_input)
print('\nOutput shape:', output.shape)


Model state_dict
weight:torch.Size([10, 5])
running_mean:torch.Size([20])
fixed_embedding:torch.Size([20, 10])

Accessing buffers:
running_mean: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
fixed_embeddings: tensor([[-1.1692,  0.9233, -1.3743, -0.0412, -0.6657, -0.4694,  1.1795,  0.0048,
         -0.2036, -0.9306],
        [-0.2119,  1.9368, -0.1809,  1.2844,  0.4287, -0.4781, -0.6198, -1.3535,
         -0.1595, -1.6089],
        [-1.6512, -0.5887, -0.7414,  0.5946,  0.7417,  2.4140,  1.3768,  1.9308,
         -0.2244, -0.5625],
        [ 1.0054, -0.9115, -0.0934, -2.0115, -1.6865,  0.8628, -1.7416,  0.5485,
         -0.2257, -0.6000],
        [-0.9187, -0.5879, -0.0720,  0.6459,  0.2475,  0.4282,  0.4157,  1.3314,
         -0.4986,  0.0343],
        [-1.7999, -0.8960, -1.8708, -1.6024,  0.2459,  0.6762,  0.8101,  0.1302,
         -0.8468, -0.2464],
        [ 1.2704, -0.2843,  0.8097,  0.2247, -0.2556,  0.6125,  0.9254, -0.4050,
         -1.1

In [None]:
#register_forward_hook register a hook that is executed after the forward()
# useful for inspecting or modifying the inputs an dputptus of a module during the forward pas

import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)  # Fully connected layer 1
        self.fc2 = nn.Linear(20, 5)   # Fully connected layer 2

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Create the model
model = SimpleModel()

# Define a forward hook function
def forward_hook(module, args, output):
    print("\nForward Hook:")
    print(f"Module: {module}")
    print(f"Input (args): {args}")  # Inputs to the module
    print(f"Output: {output}")      # Output from the module

    # Optionally modify the output
    output = output * 2  # Double the output
    print(f"Modified Output: {output}")
    return output  # Return the modified output

# Register the forward hook on the second fully connected layer (fc2)
hook_handle = model.fc2.register_forward_hook(forward_hook)

# Perform a forward pass
dummy_input = torch.randn(4, 10)  # Batch size: 4, Input features: 10
output = model(dummy_input)

print("\nFinal Output:")
print(output)


Forward Hook:
Module: Linear(in_features=20, out_features=5, bias=True)
Input (args): (tensor([[0.9887, 0.9279, 0.0000, 1.6666, 0.9149, 0.1565, 0.0000, 0.0000, 0.4082,
         0.0000, 1.0419, 0.0000, 0.0000, 0.0000, 0.3589, 0.6129, 0.6456, 0.7229,
         0.0000, 0.5827],
        [0.0000, 0.0000, 0.0000, 0.5168, 0.0000, 0.0000, 0.0000, 0.8494, 0.0000,
         0.0000, 0.6775, 0.7629, 0.0000, 0.0000, 0.0000, 0.1395, 0.0000, 0.2999,
         0.1937, 0.0000],
        [0.7397, 0.8591, 0.0000, 0.8880, 0.7910, 0.2205, 0.0000, 1.5006, 0.0000,
         0.0000, 0.0414, 1.5801, 0.0000, 0.0000, 0.0000, 0.3989, 0.0000, 0.4936,
         0.0000, 0.1047],
        [0.0000, 0.0000, 1.7948, 0.0000, 0.4380, 0.0000, 0.4965, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9101, 0.0000, 0.3110, 0.0000,
         0.0396, 1.0976]], grad_fn=<ReluBackward0>),)
Output: tensor([[-0.0539, -0.1256,  0.1501, -0.5554, -0.4071],
        [-0.2102, -0.0720, -0.3877, -0.2682, -0.0719],
        [-0.37

In [21]:
#register_forward_pre_hook()
# al;ows you to register a hook that is executed before the forward() method of a module is called
# this is useful for inspecting or modifting the inputs to a module before they are processed 

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,5)
    
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)
    
model = SimpleModel()

def forward_pre_hook(module, args):
    print('\nForward Pre-Hook')
    print(f'ModuleL {module}')
    print(f'Input (args):{args}')

    modified_input = args[0] * 2 
    print(f'Modified Input:{modified_input}')
    return (modified_input,) #return modifed input as tuple

hook_handle = model.fc1.register_forward_pre_hook(forward_pre_hook)

dummy_input = torch.randn(4,10)
output = model(dummy_input)

print('\nFinal Output')
print(output)




Forward Pre-Hook
ModuleL Linear(in_features=10, out_features=20, bias=True)
Input (args):(tensor([[ 0.2961,  1.2313, -0.4940, -0.0205, -0.0507,  0.4216, -1.8702,  0.4123,
         -0.1422, -0.2483],
        [-0.0921, -1.5266,  2.1024, -0.4903,  0.9804, -0.0072,  0.4120, -0.7442,
         -0.1839,  1.0655],
        [-0.3003, -2.0432,  0.8264,  1.7306,  1.1460, -1.4550,  0.8153,  1.3339,
         -0.5971,  0.4146],
        [-0.1170, -1.0051,  0.6520, -1.0000,  1.0511,  0.5122, -1.0182,  0.7830,
          0.5140,  0.3190]]),)
Modified Input:tensor([[ 0.5922,  2.4626, -0.9881, -0.0409, -0.1013,  0.8433, -3.7404,  0.8246,
         -0.2845, -0.4965],
        [-0.1843, -3.0532,  4.2049, -0.9805,  1.9607, -0.0144,  0.8240, -1.4884,
         -0.3677,  2.1309],
        [-0.6005, -4.0863,  1.6528,  3.4612,  2.2920, -2.9099,  1.6307,  2.6678,
         -1.1943,  0.8291],
        [-0.2340, -2.0102,  1.3040, -1.9999,  2.1022,  1.0244, -2.0363,  1.5661,
          1.0279,  0.6381]])

Final Output
tens

In [22]:
#register_full_backward_hook()

# allows you to register a hook that is executed during the backward_pass when gradients are computed 
# usefil for inspecting or modifying gradients as the flow through the model.
# grad_input and grad_output are tuples containing gradeints with respect to the inputs and outputs of the module , respetifcly

import torch 
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10,5)

    def forward(self,x):
        return self.fc(x)

model = SimpleModel()

def full_backward_hook(module, grad_input, grad_output):
    print('\n Full Backward Hook;')
    print(f'Module {module}')
    print(f'Gradient Input: {grad_input}')
    print(f'Gradient Output: {grad_output}')

    #optionally modify the gradient input
    modified_grad_input = [g * 2 if g is not None else None for g in grad_input]
    print(f'Modified Gradient Input: {modified_grad_input}')
    return modified_grad_input #return the modified gradient

hook_handle = model.fc.register_full_backward_hook(full_backward_hook)

dummy_input = torch.randn(4,10,requires_grad=True)
output = model(dummy_input)

loss = output.sum()
loss.backward()

hook_handle.remove()



 Full Backward Hook;
Module Linear(in_features=10, out_features=5, bias=True)
Gradient Input: (tensor([[ 0.4963, -0.3214,  0.1429, -0.0377, -0.3262, -0.0023, -0.9644,  0.5581,
          0.0258, -0.7973],
        [ 0.4963, -0.3214,  0.1429, -0.0377, -0.3262, -0.0023, -0.9644,  0.5581,
          0.0258, -0.7973],
        [ 0.4963, -0.3214,  0.1429, -0.0377, -0.3262, -0.0023, -0.9644,  0.5581,
          0.0258, -0.7973],
        [ 0.4963, -0.3214,  0.1429, -0.0377, -0.3262, -0.0023, -0.9644,  0.5581,
          0.0258, -0.7973]]),)
Gradient Output: (tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]]),)
Modified Gradient Input: [tensor([[ 0.9925, -0.6427,  0.2857, -0.0755, -0.6524, -0.0046, -1.9287,  1.1163,
          0.0517, -1.5946],
        [ 0.9925, -0.6427,  0.2857, -0.0755, -0.6524, -0.0046, -1.9287,  1.1163,
          0.0517, -1.5946],
        [ 0.9925, -0.6427,  0.2857, -0.0755, -0.6524, -0.0046, -1.9287,  1.1163,

In [23]:
#register_full_backward_pre_hook()
# allow to register backward pre-hook on a module
# the hook is executed before the gradeitns for the modele are computed during the backward pass
# provides access to the gradients with respect to the module's outputs and allows you to optionally modify them

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10,5)
    
    def forward(self,x):
        return self.fc(x)
    
model = SimpleModel()

def backward_pre_hook(module, grad_output):
    print('\n Bakcward Pre-Hook:')
    print(f'Module : {module}')
    print(f'Gradient Output (before computation): {grad_output}')

    modified_grad_output = [g*2 if g is not None else None for g in grad_output]
    print(f'Modified Gradient Output:{modified_grad_output}')
    return modified_grad_output

hook_handle = model.fc.register_full_backward_pre_hook(backward_pre_hook)

dummy_input = torch.randn(4,10,requires_grad=True) #batch size 4 input features 10
output = model(dummy_input)

#perform a backward pass 
loss = output.sum()
loss.backward()

#remove the hook after use
hook_handle.remove()


 Bakcward Pre-Hook:
Module : Linear(in_features=10, out_features=5, bias=True)
Gradient Output (before computation): (tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]]),)
Modified Gradient Output:[tensor([[2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.]])]


In [28]:
#register_load_state_dict_post_hook()

#allows to register a post hook that is executed after the load_state_dict method od a module is called
# can be ised to inspect or modify the incompatible_keys 

import torch 
import torch.nn as nn

class SimpleMethod(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,5)

#creat two models one for saving and one for loading
model_to_save = SimpleMethod()
model_to_load = SimpleMethod()

#modift the state dict of the model_to_save to simulate incopatible keys
state_dict = model_to_save.state_dict()
del state_dict['fc2.weight'] #simulate a missing key
state_dict['unexpected_key'] = torch.randn(5) #simluate an unexpected key

#define a post hook function
def load_state_dict_post_hook(module, incompatible_keys):
    print('\n load state dict post-hook:')
    print(f'Module: {module}')
    print(f'Incomaptible keys before modifications: {incompatible_keys}')

    #modift incompatible_keys inpace
    incompatible_keys.missing_keys[:] = [key for key in incompatible_keys.missing_keys if 'fc2' not in key]
    incompatible_keys.unexpected_keys.clear()

hook_handle = model_to_load.register_load_state_dict_post_hook(load_state_dict_post_hook)
#load the modified state dict into the model_to_load
print('Loading state_dict.. ')
model_to_load.load_state_dict(state_dict, strict = False)

#remove the hook after use
hook_handle.remove()


Loading state_dict.. 

 load state dict post-hook:
Module: SimpleMethod(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
Incomaptible keys before modifications: _IncompatibleKeys(missing_keys=['fc2.weight'], unexpected_keys=['unexpected_key'])


In [11]:
#register_load_state_dict_pre_hook()

# allows you to register a pre-hook that is executed before the load_state_dict()
# hook can be used to inspect or modify the state_dict befor eit is loaded into the model

import torch 
import torch.nn as nn 
#modify the state_dict of the model_to_save to simulate incompatible keys


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,5)

model_to_save = SimpleModel()
model_to_load = SimpleModel()


state_dict = model_to_save.state_dict()
del state_dict['fc2.weight'] #stimulate a missing key
state_dict['unexpected_key'] = torch.randn(5)

#define a pre-hook function
def load_state_dict_pre_hook(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
    print("\nLoad State Dict Pre-Hook:")
    print(f"Module: {module}")
    print(f"State Dict Before Modification: {list(state_dict.keys())}")

    # Add the missing 'fc2.weight' key with default initialization
    if 'fc2.weight' not in state_dict:
        print("Adding missing 'fc2.weight' key to state_dict...")
        state_dict['fc2.weight'] = torch.zeros_like(module.fc2.weight)

    # Remove the unexpected key
    if 'unexpected_key' in state_dict:
        print("Removing unexpected 'unexpected_key' from state_dict...")
        del state_dict['unexpected_key']

    print(f'State DIct After Modification: {list(state_dict.keys())}')

hook_handle = model_to_load.register_load_state_dict_pre_hook(load_state_dict_pre_hook)

# Load the modified state_dict into the model_to_load
print("Loading state_dict...")
model_to_load.load_state_dict(state_dict, strict=True)

# Remove the hook after use
hook_handle.remove()


Loading state_dict...

Load State Dict Pre-Hook:
Module: SimpleModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
State Dict Before Modification: ['fc1.weight', 'fc1.bias', 'fc2.bias', 'unexpected_key']
Adding missing 'fc2.weight' key to state_dict...
Removing unexpected 'unexpected_key' from state_dict...
State DIct After Modification: ['fc1.weight', 'fc1.bias', 'fc2.bias', 'fc2.weight']


In [None]:
#register_module alias for add_module

In [19]:
#register_parameter 

#allows you to explicitly add a parameter to a module
# useful when you want to define custom parameters that are not directly created by laters like nn.Linear or nn.Conv2d
# paramters registered using register_parameter() are treated as trainable and will be included in the module's state_dict 

import torch 
import torch.nn as nn 

class CustomModel(nn.Module):
    def __init__(self, num_features):
        super().__init__()

        #register a custom trainable parameter
        self.register_parameter('custom_weight', nn.Parameter(torch.randn(num_features)))

        #a regular fully connected layer for comparison
        self.fc = nn.Linear(num_features, 1)

    def forward(self,x):
        #use the custom parameter in the forward pass
        x = x * self.custom_weight
        return self.fc(x)
    

model = CustomModel(num_features=5)
print('Model state_dict')
for name, param in model.state_dict().items():
    print(f"{name}:{param.size()}")

#access the custom parameter as an attribute
print('\n Accessing custom parameter:')
print('custom_weight:',model.custom_weight)

dummy_input = torch.randn(4, num_features)
output = model(dummy_input)
print('\nOutput shape:', output.shape)


Model state_dict
custom_weight:torch.Size([5])
fc.weight:torch.Size([1, 5])
fc.bias:torch.Size([1])

 Accessing custom parameter:
custom_weight: Parameter containing:
tensor([ 0.1465, -0.9626, -0.5179, -1.7237,  1.6209], requires_grad=True)

Output shape: torch.Size([4, 1])


In [25]:
#register_state_dict_post_hook()

# use to register a post-hook that is executed after the state_dict() method of a module is called.
# this hook can be used to inspect or modify the state_dict before it is returned

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20 , 5)

model = SimpleModel()

def state_dict_post_hook(module,state_dict, prefix , local_metadata):
    print('\n State Dict Post-Hook:')
    print(f'Module: {module}')
    print(f'State Dict Before Modification : {list(state_dict.keys())}')

    if 'fc2.weight' in state_dict:
        print('Removing fc2.weight from state_dict')
        del state_dict['fc2.weight']
    
    if 'custom_key' not in state_dict:
        print('Adding custom_key to state_dict')
        state_dict['custom_key'] = torch.tensor([42.0])

    print(f'State Dict After Modification: {list(state_dict.keys())}')

hook_handle = model.register_state_dict_post_hook(state_dict_post_hook)

print('Calling state_dict()')
state_dict = model.state_dict()
for key, value in state_dict.items():
    print(f'{key}:{value.size() if isinstance(value, torch.Tensor) else value}')

hook_handle.remove()


Calling state_dict()

 State Dict Post-Hook:
Module: SimpleModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
State Dict Before Modification : ['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias']
Removing fc2.weight from state_dict
Adding custom_key to state_dict
State Dict After Modification: ['fc1.weight', 'fc1.bias', 'fc2.bias', 'custom_key']
fc1.weight:torch.Size([20, 10])
fc1.bias:torch.Size([20])
fc2.bias:torch.Size([5])
custom_key:torch.Size([1])


In [26]:
#register_state_dict_pre_hook()

# register a pre-hook that is executed before the state_dict() method of a module is called
# this hook can be usde to inspect or modift the state of the model before the state_dict is generated

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,5)

        #add a buffer that will be included in the state_dict
        self.register_buffer('custom_buffer', torch.ones(5))

#create the model
model = SimpleModel()

#define a state_dict pre-hook function
def state_dict_pre_hook(module, prefix, keep_vars):
    print('\nState Dict Pre-Hook')
    print(f'Module : {module}')
    print(f'Prefix: {prefix}')
    print(f'Keep Vars: {keep_vars}')

    if hasattr(module, 'custom_buffer'):
        print('Modifying custom_buffer before state_dict()')
        module.custom_buffer *= 2
hook_handle = model.register_state_dict_pre_hook(state_dict_pre_hook)
print('Calling state_dict()...')
state_dict = model.state_dict()
print('\n Final State Dict:')
for key, value in state_dict.items():
    print(f'{key}: {value}')
hook_handle.remove()


Calling state_dict()...

State Dict Pre-Hook
Module : SimpleModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
Prefix: 
Keep Vars: False
Modifying custom_buffer before state_dict()

 Final State Dict:
custom_buffer: tensor([2., 2., 2., 2., 2.])
fc1.weight: tensor([[ 1.1114e-02,  2.6485e-01,  7.1813e-02,  6.4798e-02,  2.4110e-01,
          2.2708e-01,  2.1536e-01,  4.4444e-02,  2.1228e-01, -5.4929e-02],
        [ 1.1388e-01,  2.9384e-01, -4.6838e-02,  2.3594e-01,  7.6249e-02,
          5.2240e-02,  1.6616e-01,  1.8772e-01, -2.1465e-01,  1.7900e-01],
        [-2.3969e-01, -1.5891e-02,  3.0292e-01, -2.0144e-01,  4.7090e-02,
         -1.7050e-01,  1.3062e-01,  1.1197e-01, -1.1315e-02,  2.5014e-01],
        [-1.8129e-01, -1.2033e-01, -1.7847e-01, -1.2628e-01,  2.4070e-01,
         -7.5194e-02,  2.7970e-01,  1.3442e-01, -1.1633e-01,  5.6283e-02],
        [-2.7280e-01,  8.8173e-02, -2.4256e-01, -2.0110e-01, -1.3094e-

In [29]:
#register_state_dict_pre_hook()

#method allows to regiter hook that is executed before the state_dict method of a module is called
# this hook can be used to inspect or modift the state of the model before the state_dict is generated

import torch 
import torch.nn as nn 

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10,20)
        self.fc2 = nn.Linear(20,5)

        self.register_buffer('custom_buffer', torch.ones(5))

model = SimpleModel()

def state_dict_pre_hook(module, prefix,keep_vars):
    print('\nState Dict Pre-Hook:')
    print(f'Module: {module}')
    print(f'Prefix: {prefix}')
    print(f'Keep Vars: {keep_vars}')

    #modify the model's state before generating the state_dict
    if hasattr(module, 'custom_buffer'):
        print('Modifying custom_buffer before state_dict()')
        module.custom_buffer *=2 

hook_handle = model.register_state_dict_pre_hook(state_dict_pre_hook)

#call state_dict() and observe the modifications
print('calling state_dict()')
state_dict = model.state_dict()
print('\nFinal State Dict:')
for key, value in state_dict.items():
    print(f'{key}:{value}')
hook_handle.remove()

calling state_dict()

State Dict Pre-Hook:
Module: SimpleModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
Prefix: 
Keep Vars: False
Modifying custom_buffer before state_dict()

Final State Dict:
custom_buffer:tensor([2., 2., 2., 2., 2.])
fc1.weight:tensor([[-0.1155,  0.2292,  0.0573, -0.3049, -0.0555,  0.1106,  0.0952, -0.2587,
         -0.1162,  0.0899],
        [-0.2827, -0.1889,  0.2165,  0.1744,  0.0199,  0.2662,  0.1119,  0.2396,
         -0.0439,  0.3129],
        [-0.2989,  0.1222, -0.0627, -0.0303, -0.0262, -0.0352,  0.1268, -0.2019,
         -0.1527, -0.0622],
        [ 0.2302, -0.0513,  0.1327, -0.0952,  0.1496, -0.1316,  0.1000,  0.0267,
         -0.2992, -0.1106],
        [-0.2221, -0.1115,  0.1748, -0.2518,  0.1159,  0.2741,  0.0249,  0.0375,
          0.2054,  0.2271],
        [-0.2687, -0.0807, -0.1639,  0.0248, -0.0882, -0.0107, -0.2943,  0.0908,
          0.0523,  0.0212],
        [ 0.0494, 

In [31]:
#set_extra_state() used to hadnel extra state stored in state_dict
# useful when want to save an dload custom attributes or metadat that are not part of the standard parameters or buffers of a module

import torch 
import torch.nn as nn 

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10,5)

        #custom attribute to store extra state
        self.custom_info = 'Initial Value'

    def forward(self,x):
        return self.fc(x)
    
def get_extra_state(self):
    """
    Retrieve the extra state to be saved in the sate_dict
    """

    print('Retrieving extra state...')
    return {'custom_info':self.custom_info}

def set_extra_state(self, state):
    '''
    Set the extra state loaded from the state_dict
    '''
    print('Setting extra state...')
    self.custom_info = state.get('custom_info', 'Default Value')

model = CustomModel()
#prnit the model's state_dict
print('Saving state_dict')
torch.save(model.state_dict(), 'model_state.pth')

#modify the custom_info attirbute
model.custom_info ='Modified Value'
print(f'Custom info before loading: {model.custom_info}')

print('\nLoading state_dict...')
state_dict = torch.load('model_state.pth')
print(f'Custom info after loading:{model.custom_info}')


Saving state_dict
Custom info before loading: Modified Value

Loading state_dict...
Custom info after loading:Modified Value


In [42]:
#set_submodule()

#replace submodule within a model using fully-qualified name 
# useful when you want to dynamically modify parts of your model 9repacling a layer or swapping entire submodules


import torch
import torch.nn as nn

# Define a nested model
class NestedModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Create submodules
        self.net_b = nn.Module()  # Parent submodule
        self.net_b.net_c = nn.Module()  # Nested submodule
        self.net_b.net_c.conv = nn.Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))  # Conv2d layer
        self.net_b.linear = nn.Linear(100, 200)  # Linear layer

    def forward(self, x):
        # Use the conv layer in net_b.net_c
        return self.net_b.net_c.conv(x)

# Create the model
model = NestedModel()

# Print the initial structure of the model
print("Initial Model Structure:")
print(model)

# Replace the Conv2d layer with a Linear layer
new_linear_layer = nn.Linear(33, 16)
model.set_submodule("net_b.net_c.conv", new_linear_layer)

# Modify the forward pass to handle the Linear layer
def modified_forward(self, x):
    # Flatten the input for the Linear layer
    x = x.view(x.size(0), -1)  # Reshape to [batch_size, features]
    return self.net_b.net_c.conv(x)

# Update the model's forward method
model.forward = modified_forward.__get__(model)

# Print the modified structure of the model
print("\nModified Model Structure:")
print(model)

# # Perform a forward pass with dummy input
# dummy_input = torch.randn(1, 16, 10, 10)  # Input tensor for the Conv2d layer
# output = model(dummy_input)
# print("\nOutput shape:", output.shape)

Initial Model Structure:
NestedModel(
  (net_b): Module(
    (net_c): Module(
      (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))
    )
    (linear): Linear(in_features=100, out_features=200, bias=True)
  )
)

Modified Model Structure:
NestedModel(
  (net_b): Module(
    (net_c): Module(
      (conv): Linear(in_features=33, out_features=16, bias=True)
    )
    (linear): Linear(in_features=100, out_features=200, bias=True)
  )
)


In [43]:
#share_memory() 

#used to move a tensor or module to shared memory
# enable it o share acorss multiple processes
# particularly useful in multiprocessing scenarios such as distrubuted training or parallel data loading where tensros or model parameters need to be accessed by multiple workers

import torch 
import torch.nn as nn
from multiprocessing import Process

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10,5)

    def forward(self,x):
        return self.fc(x)
    
model = SimpleModel()

model.share_memory()

print('Model Parameters After share_memory()')

for name, param in model.named_parameters():
    print(f'{name}:{param.is_shared()}')

#define a funtion to simulate a worker process
def worker_process(model, input_tensor):
    output = model(input_tensor) # perform a foward pass in the worker process
    print(f'Output in worker process: {output}')

shared_input = torch.randn(4,10)
shared_input.share_memory_()

process1 = Process(target=worker_process, args=(model, shared_input))
process2 = Process(target=worker_process, args=(model, shared_input))

process1.start()
process2.start()

process1.join()
process2.join()

Model Parameters After share_memory()
fc.weight:True
fc.bias:True
Output in worker process: tensor([[-0.4966,  0.0919, -0.3421,  0.5134, -0.1519],
        [ 0.7050, -0.7392,  0.6057,  0.0027, -0.8711],
        [-0.6415, -0.4897,  0.2836,  0.4208, -0.1635],
        [-0.5252, -0.0490,  0.0269,  0.8973, -0.1599]],
       grad_fn=<AddmmBackward0>)
Output in worker process: tensor([[-0.4966,  0.0919, -0.3421,  0.5134, -0.1519],
        [ 0.7050, -0.7392,  0.6057,  0.0027, -0.8711],
        [-0.6415, -0.4897,  0.2836,  0.4208, -0.1635],
        [-0.5252, -0.0490,  0.0269,  0.8973, -0.1599]],
       grad_fn=<AddmmBackward0>)


In [44]:
#state_dict()

# retunrs a dictionary containing the entire sate of a module including its parametres and parsistent buffers.
#useful for saving and loading models as well as inspecting their internal state


import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 5)

        # Register a buffer
        self.register_buffer('running_mean', torch.zeros(5))

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# Create the model
model = SimpleModel()

# Print the keys of the state_dict
print("Keys in state_dict:")
for key in model.state_dict().keys():
    print(key)

# Access the full state_dict
print("\nFull state_dict:")
for key, value in model.state_dict().items():
    print(f"{key}: {value}")

# Save the state_dict to a file
torch.save(model.state_dict(), "model_state.pth")
print("\nState dict saved to 'model_state.pth'.")

# Load the state_dict from the file into a new model
new_model = SimpleModel()
new_model.load_state_dict(torch.load("model_state.pth"))
print("\nState dict loaded into new model.")

# Verify that the new model has the same state
print("\nNew model's state_dict:")
for key, value in new_model.state_dict().items():
    print(f"{key}: {value}")

Keys in state_dict:
running_mean
fc1.weight
fc1.bias
fc2.weight
fc2.bias

Full state_dict:
running_mean: tensor([0., 0., 0., 0., 0.])
fc1.weight: tensor([[-0.2667, -0.1428, -0.3120,  0.0955,  0.0934, -0.2612, -0.0013, -0.0882,
          0.0570, -0.2672],
        [-0.1139,  0.0882,  0.2662, -0.1231, -0.2092, -0.1870, -0.1551,  0.2779,
          0.2236, -0.2545],
        [ 0.2204,  0.2844,  0.0528,  0.0328, -0.0867, -0.0033, -0.2744,  0.2961,
          0.2466,  0.1153],
        [-0.2506, -0.0140, -0.1792, -0.1127, -0.2620, -0.2979, -0.1651,  0.1818,
          0.2578,  0.2291],
        [ 0.1642,  0.1232,  0.3156, -0.3137, -0.0374, -0.1578, -0.0875,  0.2946,
          0.0517, -0.0965],
        [-0.0508,  0.0968,  0.2908,  0.1018,  0.0919, -0.0079,  0.1661, -0.1900,
          0.1851, -0.0072],
        [-0.2535,  0.0816, -0.1212,  0.0435, -0.1359, -0.2821,  0.2410,  0.2737,
          0.2468,  0.1387],
        [-0.2934,  0.0301,  0.1314, -0.0677, -0.2594, -0.1470, -0.0372, -0.2091,
         -

In [None]:
# to()

# move and/or cast the parameters and buffers of amoudle to a specified device or memory format
# useful when switchin between CPU an dGPU , chaing precision like from flaot32 to float1 , ensuring comapbility with specifc tensor formats


import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(2, 2)  # Fully connected layer

    def forward(self, x):
        return self.fc(x)

# Create the model
model = SimpleModel()

# Print initial state of the model's parameters
print("Initial Model Parameters:")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}, {param.device}")

# Move the model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("\nAfter Moving to Device:", device)
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}, {param.device}")

# Cast the model's parameters to half-precision (float16)
model.to(dtype=torch.float16)

print("\nAfter Casting to Half-Precision (float16):")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}, {param.device}")

# Move the model back to CPU and cast to double-precision (float64)
model.to(device=torch.device("cpu"), dtype=torch.float64)

print("\nAfter Moving to CPU and Casting to Double-Precision (float64):")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}, {param.device}")

# Perform a forward pass with a tensor on the same device and dtype
input_tensor = torch.ones(3, 2, dtype=torch.float64, device=torch.device("cpu"))
output = model(input_tensor)
print("\nForward Pass Output:")
print(output)

Initial Model Parameters:
fc.weight: torch.float32, cpu
fc.bias: torch.float32, cpu

After Moving to Device: cpu
fc.weight: torch.float32, cpu
fc.bias: torch.float32, cpu

After Casting to Half-Precision (float16):
fc.weight: torch.float16, cpu
fc.bias: torch.float16, cpu

After Moving to CPU and Casting to Double-Precision (float64):
fc.weight: torch.float64, cpu
fc.bias: torch.float64, cpu

Forward Pass Output:
tensor([[ 0.2896, -0.5687],
        [ 0.2896, -0.5687],
        [ 0.2896, -0.5687]], dtype=torch.float64, grad_fn=<AddmmBackward0>)


In [46]:
#to_empty()

#move parameters and buffers of module to specified device wih=thout copying their store
# use when you watn to allocate memory for tensors on a specific device wihtout transfering their current values. 
# tensor initialized as empty on the target device

import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)  # Fully connected layer 1
        self.fc2 = nn.Linear(20, 5)   # Fully connected layer 2

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# Create the model
model = SimpleModel()

# Print initial state of the model's parameters (on CPU)
print("Initial Model Parameters:")
for name, param in model.named_parameters():
    print(f"{name}: {param.device}, {param.dtype}")

# Move the model to a GPU (if available) using to_empty()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to_empty(device=device)

print("\nAfter Moving to Device (Empty Storage):", device)
for name, param in model.named_parameters():
    print(f"{name}: {param.device}, {param.dtype}")

# Check if the tensors are empty (uninitialized)
print("\nChecking if tensors are uninitialized:")
for name, param in model.named_parameters():
    print(f"{name}: {param.data_ptr()}")  # Data pointer indicates uninitialized storage

# Perform a forward pass with dummy input on the same device
input_tensor = torch.randn(4, 10, device=device)  # Batch size: 4, Input features: 10
output = model(input_tensor)
print("\nForward Pass Output Shape:", output.shape)

Initial Model Parameters:
fc1.weight: cpu, torch.float32
fc1.bias: cpu, torch.float32
fc2.weight: cpu, torch.float32
fc2.bias: cpu, torch.float32

After Moving to Device (Empty Storage): cpu
fc1.weight: cpu, torch.float32
fc1.bias: cpu, torch.float32
fc2.weight: cpu, torch.float32
fc2.bias: cpu, torch.float32

Checking if tensors are uninitialized:
fc1.weight: 439769728
fc1.bias: 439655424
fc2.weight: 439123840
fc2.bias: 439104128

Forward Pass Output Shape: torch.Size([4, 5])


In [47]:
#train(mode=True))

#used to set the module into training mode or evaluation mode
# important for moduels like Dropout , BatchNorm and otehrs that behave differently during trainig versus evaluation

import torch
import torch.nn as nn

# Define a model with Dropout and BatchNorm layers
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.dropout = nn.Dropout(p=0.5)  # Dropout layer
        self.bn = nn.BatchNorm1d(20)      # BatchNorm layer
        self.fc2 = nn.Linear(20, 5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.bn(x)
        return self.fc2(x)

# Create the model
model = SimpleModel()

# Print initial mode of the model
print("Initial Mode:")
print(f"Model in training mode: {model.training}")

# Set the model to training mode
model.train()
print("\nAfter Setting to Training Mode:")
print(f"Model in training mode: {model.training}")
print(f"Dropout active: {model.dropout.training}")
print(f"BatchNorm in training mode: {model.bn.training}")

# Perform a forward pass in training mode
input_tensor = torch.randn(4, 10)  # Batch size: 4, Input features: 10
output_train = model(input_tensor)
print("\nOutput in Training Mode (with Dropout and BatchNorm):")
print(output_train)

# Set the model to evaluation mode
model.eval()
print("\nAfter Setting to Evaluation Mode:")
print(f"Model in training mode: {model.training}")
print(f"Dropout active: {model.dropout.training}")
print(f"BatchNorm in training mode: {model.bn.training}")

# Perform a forward pass in evaluation mode
output_eval = model(input_tensor)
print("\nOutput in Evaluation Mode (without Dropout and BatchNorm updates):")
print(output_eval)

Initial Mode:
Model in training mode: True

After Setting to Training Mode:
Model in training mode: True
Dropout active: True
BatchNorm in training mode: True

Output in Training Mode (with Dropout and BatchNorm):
tensor([[ 0.0759, -0.3735, -0.0089,  0.0603, -0.7545],
        [-0.2788,  0.5420,  0.4794, -1.0626, -0.4399],
        [ 0.1067,  0.7035, -0.4835,  0.2310,  0.3309],
        [-0.4473, -0.5017, -0.1215,  0.6419,  0.3413]],
       grad_fn=<AddmmBackward0>)

After Setting to Evaluation Mode:
Model in training mode: False
Dropout active: False
BatchNorm in training mode: False

Output in Evaluation Mode (without Dropout and BatchNorm updates):
tensor([[ 0.0606,  0.1784, -0.0317, -0.4120, -0.2390],
        [-0.0865,  0.1308,  0.3176, -0.8944, -0.5810],
        [ 0.1183,  0.1333,  0.1105, -0.6027, -0.2551],
        [ 0.1547,  0.1821, -0.1626, -0.4336, -0.0600]],
       grad_fn=<AddmmBackward0>)


In [None]:

#type(dst_type)

#cast all parameters and buffers of modue to a specified type (dst_type)
#useful when you want to change the data type of the entier model for mixed precision trainig or other purposes
import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)  # Fully connected layer 1
        self.fc2 = nn.Linear(20, 5)   # Fully connected layer 2

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# Create the model
model = SimpleModel()

# Print initial data types of the model's parameters
print("Initial Data Types:")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}")

# Cast the model's parameters and buffers to float16
model.type(torch.float16)

print("\nAfter Casting to float16:")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}")

# Perform a forward pass with a dummy input tensor of float16
input_tensor = torch.randn(4, 10, dtype=torch.float16)  # Batch size: 4, Input features: 10
output = model(input_tensor)
print("\nForward Pass Output Shape:", output.shape)
print("Output Data Type:", output.dtype)

# Cast the model back to float32
model.type(torch.float32)

print("\nAfter Casting Back to float32:")
for name, param in model.named_parameters():
    print(f"{name}: {param.dtype}")

Initial Data Types:
fc1.weight: torch.float32
fc1.bias: torch.float32
fc2.weight: torch.float32
fc2.bias: torch.float32

After Casting to float16:
fc1.weight: torch.float16
fc1.bias: torch.float16
fc2.weight: torch.float16
fc2.bias: torch.float16

Forward Pass Output Shape: torch.Size([4, 5])
Output Data Type: torch.float16

After Casting Back to float32:
fc1.weight: torch.float32
fc1.bias: torch.float32
fc2.weight: torch.float32
fc2.bias: torch.float32


In [49]:
#xpu(device=None) is used to move all model parameters and buffers to the XPU device
# the XPU is intel extenssion for pytorch
# allow to leverage intel GPU for deep learning workloads
# method similar to cuda() or cpu()

import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)  # Fully connected layer 1
        self.fc2 = nn.Linear(20, 5)   # Fully connected layer 2

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# Create the model
model = SimpleModel()

# Print initial device of the model's parameters
print("Initial Device:")
for name, param in model.named_parameters():
    print(f"{name}: {param.device}")

# Move the model to the XPU device
if torch.xpu.is_available():  # Check if XPU is available
    model.xpu()  # Move the model to the default XPU device
    print("\nAfter Moving to XPU:")
    for name, param in model.named_parameters():
        print(f"{name}: {param.device}")
else:
    print("\nXPU is not available. Skipping XPU operations.")

# Perform a forward pass with a dummy input tensor on the XPU
if torch.xpu.is_available():
    input_tensor = torch.randn(4, 10).xpu()  # Batch size: 4, Input features: 10
    output = model(input_tensor)
    print("\nForward Pass Output Shape:", output.shape)
    print("Output Device:", output.device)

Initial Device:
fc1.weight: cpu
fc1.bias: cpu
fc2.weight: cpu
fc2.bias: cpu

XPU is not available. Skipping XPU operations.


In [1]:
#zero_grad(set_to_none=True)

#used to reset the gradients of all model parameters. , done at begining of each traning iteration to ensure that gradients from previous iterations do not accumulate
# by default , setting set_to_none = True improves performance by freeing the memory oocupied bu the gradients instead of zeroing them out

import torch
import torch.nn as nn

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 5)  # Fully connected layer

    def forward(self, x):
        return self.fc(x)

# Create the model and a dummy input
model = SimpleModel()
input_tensor = torch.randn(4, 10)  # Batch size: 4, Input features: 10
target = torch.randn(4, 5) 

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

print('First Training Step:')
output = model(input_tensor)
loss = criterion(output,target)
loss.backward() #compute gradietns

print("Gradients before zero_grad():")
for name, param in model.named_parameters():
    print(f"{name}: {param.grad}")

# Reset gradients using zero_grad()
model.zero_grad(set_to_none=True)
print("\nGradients after zero_grad():")
for name, param in model.named_parameters():
    print(f"{name}: {param.grad}")

# Perform another training step
print("\nSecond Training Step:")
output = model(input_tensor)
loss = criterion(output, target)
loss.backward()  # Compute gradients again
print("Gradients after second backward():")
for name, param in model.named_parameters():
    print(f"{name}: {param.grad}")

First Training Step:
Gradients before zero_grad():
fc.weight: tensor([[-0.0099,  0.1420,  0.0129,  0.0040, -0.5605,  0.2090, -0.3024,  0.5251,
         -0.2248, -0.5372],
        [ 0.1390,  0.2457,  0.1510, -0.1780,  0.4095, -0.1225, -0.2330,  0.1415,
         -0.0484,  0.2039],
        [-0.1058, -0.0588, -0.0575,  0.0257, -0.2989,  0.1459, -0.0364,  0.0697,
         -0.0378, -0.2662],
        [-0.1883,  0.0963,  0.0410, -0.2689,  0.0857,  0.1972, -0.3440,  0.0561,
         -0.0434, -0.2374],
        [ 0.4255,  0.3597,  0.3066, -0.2412,  0.9135, -0.3818, -0.1576, -0.0387,
         -0.2885,  0.3558]])
fc.bias: tensor([ 0.3990, -0.1158,  0.1630,  0.0883, -0.1840])

Gradients after zero_grad():
fc.weight: None
fc.bias: None

Second Training Step:
Gradients after second backward():
fc.weight: tensor([[-0.0099,  0.1420,  0.0129,  0.0040, -0.5605,  0.2090, -0.3024,  0.5251,
         -0.2248, -0.5372],
        [ 0.1390,  0.2457,  0.1510, -0.1780,  0.4095, -0.1225, -0.2330,  0.1415,
         -