In [74]:
%load_ext autoreload 
%autoreload 2
from copy import deepcopy
from collections import OrderedDict
from datetime import datetime
import torch
import torch.nn.functional as F
from torch import nn, optim

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Module

In [2]:

class Model(nn.Module):
    def __init__(self, m1, m2):
        super().__init__()
        self.m1 = m1
        self.m2 = m2
        self.linear = nn.Linear(3, 1)
        self.register_buffer("slope", torch.randn(1))
        self.slope = torch.tensor(1.0)
        self.apply(self.reset_parameters)
    
    def reset_parameters(self, m):
        print("model: ", type(m), id(m))
    
        
    def forward(self, x):
        x1 = x.clone()
        x = self.m1(x)
        x = self.m2(x)
        x = self.linear(x)
        x = F.sigmoid(x1.sum() + x) 
        return x


class M1(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(10, 5)
    
    def forward(self, x):
        return  self.net(x)
        
class M2(nn.Module):
    def __init__(self):
        super().__init__()
        self.p = nn.Parameter(torch.Tensor(5, 3))
        self.apply(self.reset_parameters)
        
    def reset_parameters(self, m):
        print("m2: ", type(m))
        m.p.data.fill_(1.0)
        print()
    

    def forward(self, x):
        return x @ self.p
    
    def __repr__(self):
        return f"M2(p_size={self.p.size()})"


In [3]:
m1 = M1()
m2 = M2()
model = Model(m1, m2)

m2:  <class '__main__.M2'>

model:  <class 'torch.nn.modules.linear.Linear'> 140130153656912
model:  <class '__main__.M1'> 140134722709872
model:  <class '__main__.M2'> 140134722771120
model:  <class 'torch.nn.modules.linear.Linear'> 140130153656864
model:  <class '__main__.Model'> 140134721473936


In [4]:
id(m1.net), id(m1), id(m2), id(model.linear), id(model)

(140130153656912,
 140134722709872,
 140134722771120,
 140130153656864,
 140134721473936)

In [5]:
m2.p

Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)

In [6]:
for ch in model.children():
    print(ch)

M1(
  (net): Linear(in_features=10, out_features=5, bias=True)
)
M2(p_size=torch.Size([5, 3]))
Linear(in_features=3, out_features=1, bias=True)


In [7]:
for ch in model.named_children():
    print(ch)

('m1', M1(
  (net): Linear(in_features=10, out_features=5, bias=True)
))
('m2', M2(p_size=torch.Size([5, 3])))
('linear', Linear(in_features=3, out_features=1, bias=True))


In [8]:
for ch in model.modules():
    print(type(ch))
    print(ch)
    print()

<class '__main__.Model'>
Model(
  (m1): M1(
    (net): Linear(in_features=10, out_features=5, bias=True)
  )
  (m2): M2(p_size=torch.Size([5, 3]))
  (linear): Linear(in_features=3, out_features=1, bias=True)
)

<class '__main__.M1'>
M1(
  (net): Linear(in_features=10, out_features=5, bias=True)
)

<class 'torch.nn.modules.linear.Linear'>
Linear(in_features=10, out_features=5, bias=True)

<class '__main__.M2'>
M2(p_size=torch.Size([5, 3]))

<class 'torch.nn.modules.linear.Linear'>
Linear(in_features=3, out_features=1, bias=True)



In [9]:
model.extra_repr()

''

In [10]:
model.slope

tensor(1.)

In [11]:
model.get_buffer("slope")

tensor(1.)

In [12]:
model.m1.net.weight

Parameter containing:
tensor([[-0.2175, -0.2462,  0.1401,  0.0677,  0.1236,  0.0503, -0.0525,  0.2588,
          0.0390, -0.0355],
        [-0.0897,  0.0304, -0.0405, -0.1622, -0.0644,  0.0040,  0.2439,  0.1909,
          0.2963, -0.2859],
        [-0.2546,  0.2203,  0.1783, -0.0237,  0.2078, -0.1258,  0.1093,  0.2051,
          0.1070,  0.1059],
        [-0.1937, -0.2630, -0.1800,  0.1465, -0.0797,  0.0929,  0.1548, -0.2313,
         -0.2101,  0.2310],
        [-0.2429, -0.1604, -0.0051, -0.0184,  0.0735,  0.2035,  0.0530, -0.0518,
          0.1792, -0.1293]], requires_grad=True)

In [13]:
model.get_parameter("m1.net.weight")

Parameter containing:
tensor([[-0.2175, -0.2462,  0.1401,  0.0677,  0.1236,  0.0503, -0.0525,  0.2588,
          0.0390, -0.0355],
        [-0.0897,  0.0304, -0.0405, -0.1622, -0.0644,  0.0040,  0.2439,  0.1909,
          0.2963, -0.2859],
        [-0.2546,  0.2203,  0.1783, -0.0237,  0.2078, -0.1258,  0.1093,  0.2051,
          0.1070,  0.1059],
        [-0.1937, -0.2630, -0.1800,  0.1465, -0.0797,  0.0929,  0.1548, -0.2313,
         -0.2101,  0.2310],
        [-0.2429, -0.1604, -0.0051, -0.0184,  0.0735,  0.2035,  0.0530, -0.0518,
          0.1792, -0.1293]], requires_grad=True)

In [14]:
model.get_submodule("m2")

M2(p_size=torch.Size([5, 3]))

In [15]:
model.get_submodule("m1.net")

Linear(in_features=10, out_features=5, bias=True)

In [16]:
for buff in model.buffers():
    print(buff)

tensor(1.)


In [17]:
for buff in model.named_buffers():
    print(buff)

('slope', tensor(1.))


In [18]:
for p in model.named_parameters():
    print(p)

('m1.net.weight', Parameter containing:
tensor([[-0.2175, -0.2462,  0.1401,  0.0677,  0.1236,  0.0503, -0.0525,  0.2588,
          0.0390, -0.0355],
        [-0.0897,  0.0304, -0.0405, -0.1622, -0.0644,  0.0040,  0.2439,  0.1909,
          0.2963, -0.2859],
        [-0.2546,  0.2203,  0.1783, -0.0237,  0.2078, -0.1258,  0.1093,  0.2051,
          0.1070,  0.1059],
        [-0.1937, -0.2630, -0.1800,  0.1465, -0.0797,  0.0929,  0.1548, -0.2313,
         -0.2101,  0.2310],
        [-0.2429, -0.1604, -0.0051, -0.0184,  0.0735,  0.2035,  0.0530, -0.0518,
          0.1792, -0.1293]], requires_grad=True))
('m1.net.bias', Parameter containing:
tensor([-0.2522, -0.0312, -0.2755, -0.2277, -0.1391], requires_grad=True))
('m2.p', Parameter containing:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], requires_grad=True))
('linear.weight', Parameter containing:
tensor([[ 0.3180,  0.1390, -0.5150]], requires_grad=True))
('linear.bias', Pa

In [19]:
model._get_name()

'Model'

In [20]:
model.m1._get_name()

'M1'

In [21]:
x = torch.randn(10, 10)
model(x)

tensor([[0.7893],
        [0.7694],
        [0.7625],
        [0.7560],
        [0.7539],
        [0.7758],
        [0.7542],
        [0.7660],
        [0.7843],
        [0.7587]], grad_fn=<SigmoidBackward0>)

In [22]:
model

Model(
  (m1): M1(
    (net): Linear(in_features=10, out_features=5, bias=True)
  )
  (m2): M2(p_size=torch.Size([5, 3]))
  (linear): Linear(in_features=3, out_features=1, bias=True)
)

In [23]:
model.add_module("linear2", nn.Linear(model.linear.out_features, model.linear.out_features))

In [24]:
for ch in model.children():
    print(ch)

M1(
  (net): Linear(in_features=10, out_features=5, bias=True)
)
M2(p_size=torch.Size([5, 3]))
Linear(in_features=3, out_features=1, bias=True)
Linear(in_features=1, out_features=1, bias=True)


In [25]:
for m in model.modules():
    print(m)

Model(
  (m1): M1(
    (net): Linear(in_features=10, out_features=5, bias=True)
  )
  (m2): M2(p_size=torch.Size([5, 3]))
  (linear): Linear(in_features=3, out_features=1, bias=True)
  (linear2): Linear(in_features=1, out_features=1, bias=True)
)
M1(
  (net): Linear(in_features=10, out_features=5, bias=True)
)
Linear(in_features=10, out_features=5, bias=True)
M2(p_size=torch.Size([5, 3]))
Linear(in_features=3, out_features=1, bias=True)
Linear(in_features=1, out_features=1, bias=True)


In [26]:
model_copy = deepcopy(model)

In [27]:
model_copy.register_module("linear2", nn.Linear(model.linear.out_features, 4))

In [28]:
model_copy

Model(
  (m1): M1(
    (net): Linear(in_features=10, out_features=5, bias=True)
  )
  (m2): M2(p_size=torch.Size([5, 3]))
  (linear): Linear(in_features=3, out_features=1, bias=True)
  (linear2): Linear(in_features=1, out_features=4, bias=True)
)

In [29]:
model_copy.register_parameter("p", nn.Parameter(torch.zeros(4, 5)))

In [30]:
model_copy.p

Parameter containing:
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], requires_grad=True)

In [31]:
model_copy.register_parameter("p", nn.Parameter(torch.zeros(6, 5)))

In [32]:
model_copy.p

Parameter containing:
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], requires_grad=True)

In [33]:
list(model_copy.linear.parameters())

[Parameter containing:
 tensor([[ 0.3180,  0.1390, -0.5150]], requires_grad=True),
 Parameter containing:
 tensor([0.3672], requires_grad=True)]

In [34]:
# Freeze
model_copy.linear.requires_grad_(False)

Linear(in_features=3, out_features=1, bias=True)

In [35]:
list(model_copy.linear.parameters())

[Parameter containing:
 tensor([[ 0.3180,  0.1390, -0.5150]]),
 Parameter containing:
 tensor([0.3672])]

In [38]:
model_copy

Model(
  (m1): M1(
    (net): Linear(in_features=10, out_features=5, bias=True)
  )
  (m2): M2(p_size=torch.Size([5, 3]))
  (linear): Linear(in_features=3, out_features=1, bias=True)
  (linear2): Linear(in_features=1, out_features=4, bias=True)
)

In [42]:
torch.manual_seed(0)
N = 4
din = 10
dout = 4
x = torch.randn(N, din)
w_hat = torch.rand(din, dout)
y =  x @ w_hat + torch.tensor([1.])

In [43]:
x

tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
          0.3223, -1.2633],
        [ 0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473, -1.3527, -1.6959,
          0.5667,  0.7935],
        [ 0.5988, -1.5551, -0.3414,  1.8530, -0.2159, -0.7425,  0.5627,  0.2596,
         -0.1740, -0.6787],
        [ 0.9383,  0.4889,  1.2032,  0.0845, -1.2001, -0.0048, -0.5181, -0.3067,
         -1.5810,  1.7066]])

In [44]:
y

tensor([[-2.1714, -1.6280,  0.6360,  0.0610],
        [ 3.1769,  2.2039,  0.7222,  2.8604],
        [ 0.3377,  1.5358,  0.6844, -0.0122],
        [ 2.2432,  1.1925, -0.9026,  0.2081]])

In [48]:
parameters = {name: deepcopy(p) for name, p in model_copy.named_parameters()}

In [52]:
list(parameters.keys())

['p',
 'm1.net.weight',
 'm1.net.bias',
 'm2.p',
 'linear.weight',
 'linear.bias',
 'linear2.weight',
 'linear2.bias']

In [49]:
torch.manual_seed(0)
train_model = deepcopy(model_copy)
adam = optim.Adam(train_model.parameters(), lr=1)
criterion = nn.MSELoss()
train_model.train()
train_model.zero_grad()
yp = train_model(x)
loss = criterion(yp, y)
loss.backward()
adam.step()

  return F.mse_loss(input, target, reduction=self.reduction)


In [51]:
for name, old_p in parameters.items():
    print(name, old_p, train_model.get_parameter(name))

p Parameter containing:
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], requires_grad=True) Parameter containing:
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], requires_grad=True)
m1.net.weight Parameter containing:
tensor([[-0.2175, -0.2462,  0.1401,  0.0677,  0.1236,  0.0503, -0.0525,  0.2588,
          0.0390, -0.0355],
        [-0.0897,  0.0304, -0.0405, -0.1622, -0.0644,  0.0040,  0.2439,  0.1909,
          0.2963, -0.2859],
        [-0.2546,  0.2203,  0.1783, -0.0237,  0.2078, -0.1258,  0.1093,  0.2051,
          0.1070,  0.1059],
        [-0.1937, -0.2630, -0.1800,  0.1465, -0.0797,  0.0929,  0.1548, -0.2313,
         -0.2101,  0.2310],
        [-0.2429, -0.1604, -0.0051, -0.0184,  0.0735,  0.2035,  0.0530, -0.0518,

In [55]:
list(train_model.named_buffers())

[('slope', tensor(1.))]

In [56]:
train_model.k = 0

In [60]:
datetime.now().timestamp()

1701673864.723633

In [61]:
str(datetime.now())

'2023-12-04 16:11:43.263043'

In [65]:
def get_extra_state(module):
    return {"time_saved":datetime.now().timestamp() }

def set_extra_state(module, state):
    print("exta state ",)
    return 
train_model.set_extra_state = set_extra_state
train_model.get_extra_state = get_extra_state
train_model.state_dict()

OrderedDict([('p',
              tensor([[0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0.]])),
             ('slope', tensor(1.)),
             ('m1.net.weight',
              tensor([[-1.2175, -1.2462, -0.8599, -0.9323, -0.8764,  1.0503,  0.9475,  1.2588,
                        1.0389, -1.0355],
                      [-1.0897, -0.9696, -1.0405, -1.1622, -1.0644,  1.0040,  1.2439,  1.1909,
                        1.2962, -1.2859],
                      [-1.2546, -0.7797, -0.8217, -1.0237, -0.7922,  0.8742,  1.1093,  1.2051,
                        1.1070, -0.8941],
                      [-1.1937, -1.2630, -1.1800, -0.8535, -1.0797,  1.0929,  1.1548,  0.7687,
                        0.7899, -0.7690],
                      [-1.2429, -1.1604, -1.0051, -1.0184, -0.9265,  1.2035,  1.0530,  0.9482,
    

In [66]:
torch.save(train_model.state_dict(), "/mnt/dl/models/test_state/t_module.pth")

In [81]:
model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 3), nn.ReLU(), nn.Dropout(), nn.Linear(3, 1))

In [82]:
model

Sequential(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=3, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=3, out_features=1, bias=True)
)

In [83]:
model[0]

Linear(in_features=10, out_features=5, bias=True)

In [84]:
model[2]

ReLU()

In [85]:
model.insert(3, nn.Dropout())

Sequential(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=3, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Dropout(p=0.5, inplace=False)
  (5): Linear(in_features=3, out_features=1, bias=True)
)

In [86]:
model

Sequential(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=3, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Dropout(p=0.5, inplace=False)
  (5): Linear(in_features=3, out_features=1, bias=True)
)

In [87]:
model = nn.Sequential(OrderedDict({
    "net1": nn.Linear(10, 5), 
    "net2": nn.Linear(5, 3), 
    "net3":nn.ReLU(), 
    "net4": nn.Dropout(), 
    "net4": nn.Linear(3, 1)
}))

In [88]:
model

Sequential(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net3): ReLU()
  (net4): Linear(in_features=3, out_features=1, bias=True)
)

In [89]:
model[0]

Linear(in_features=10, out_features=5, bias=True)

In [90]:
model.append(nn.Softmax(1))

Sequential(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net3): ReLU()
  (net4): Linear(in_features=3, out_features=1, bias=True)
  (4): Softmax(dim=1)
)

In [92]:
model

Sequential(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net3): ReLU()
  (net4): Linear(in_features=3, out_features=1, bias=True)
  (4): Softmax(dim=1)
)

In [93]:
model_list = nn.ModuleList([])
for name, m in {
    "net1": nn.Linear(10, 5), 
    "net2": nn.Linear(5, 3), 
    "net3":nn.ReLU(), 
    "net4": nn.Dropout(), 
    "net4": nn.Linear(3, 1)
}.items():
    model_list.append(m)

In [94]:
model_list

ModuleList(
  (0): Linear(in_features=10, out_features=5, bias=True)
  (1): Linear(in_features=5, out_features=3, bias=True)
  (2): ReLU()
  (3): Linear(in_features=3, out_features=1, bias=True)
)

In [95]:
model_list[0]

Linear(in_features=10, out_features=5, bias=True)

In [96]:
model = nn.ModuleDict({
    "net1": nn.Linear(10, 5), 
    "net2": nn.Linear(5, 3), 
    "net3":nn.ReLU(), 
    "net4": nn.Dropout(), 
    "net4": nn.Linear(3, 1)
})

In [97]:
model

ModuleDict(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net3): ReLU()
  (net4): Linear(in_features=3, out_features=1, bias=True)
)

In [98]:
model['net1']

Linear(in_features=10, out_features=5, bias=True)

In [99]:
model["net5"] = nn.Sigmoid()

In [100]:
model

ModuleDict(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net3): ReLU()
  (net4): Linear(in_features=3, out_features=1, bias=True)
  (net5): Sigmoid()
)

In [101]:
model.pop("net3")

ReLU()

In [102]:
model

ModuleDict(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net4): Linear(in_features=3, out_features=1, bias=True)
  (net5): Sigmoid()
)

In [103]:
model.update({"net6": nn.Linear(1, 5)})

In [104]:
model

ModuleDict(
  (net1): Linear(in_features=10, out_features=5, bias=True)
  (net2): Linear(in_features=5, out_features=3, bias=True)
  (net4): Linear(in_features=3, out_features=1, bias=True)
  (net5): Sigmoid()
  (net6): Linear(in_features=1, out_features=5, bias=True)
)

In [105]:
model["net6"]

Linear(in_features=1, out_features=5, bias=True)

In [106]:
params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])

In [107]:
params

ParameterList(
    (0): Parameter containing: [torch.float32 of size 10x10]
    (1): Parameter containing: [torch.float32 of size 10x10]
    (2): Parameter containing: [torch.float32 of size 10x10]
    (3): Parameter containing: [torch.float32 of size 10x10]
    (4): Parameter containing: [torch.float32 of size 10x10]
    (5): Parameter containing: [torch.float32 of size 10x10]
    (6): Parameter containing: [torch.float32 of size 10x10]
    (7): Parameter containing: [torch.float32 of size 10x10]
    (8): Parameter containing: [torch.float32 of size 10x10]
    (9): Parameter containing: [torch.float32 of size 10x10]
)

In [108]:
params =  nn.ParameterDict({
                'left': nn.Parameter(torch.randn(5, 10)),
                'right': nn.Parameter(torch.randn(8, 10))
        })

In [109]:
params["left"]

Parameter containing:
tensor([[-0.6051,  1.6760,  1.1294,  0.7079, -0.1035, -1.1614, -1.3402, -1.8728,
         -1.4816, -1.1734],
        [ 0.7895, -0.5011,  0.8083,  1.2982, -0.0252,  1.2517,  1.2164,  0.3123,
         -0.6479, -0.9943],
        [-0.1160,  1.1690,  0.0049,  1.2685, -0.5895, -2.3923,  0.1723, -1.0662,
          0.4395, -0.6011],
        [-2.2042,  1.4215,  0.1101, -0.9445,  1.7215, -0.3624,  0.0371,  0.2175,
         -0.4151, -1.1725],
        [-0.1545, -0.8629, -0.2447, -0.6704,  1.3111,  0.6664,  0.8190,  1.7625,
         -0.7507,  0.7245]], requires_grad=True)

## Conv Layers

In [150]:
# conv1d
torch.manual_seed(0)
x = torch.randn(2, 4, 128) #bz, feat, timestep
conv_1d = nn.Conv1d(4, 32, 3)
y = conv_1d(x)

In [151]:
y.size()

torch.Size([2, 32, 126])

In [157]:
y[0, 0, 0]

tensor(0.2549, grad_fn=<SelectBackward0>)

In [152]:
conv_1d.weight.shape, conv_1d.bias.size()

(torch.Size([32, 4, 3]), torch.Size([32]))

In [154]:
x[0, :, :3].size()

torch.Size([4, 3])

In [160]:
(x[0, :, :3] * conv_1d.weight[0]).sum() + conv_1d.bias[0]

tensor(0.2549, grad_fn=<AddBackward0>)

In [161]:
yf = F.conv1d(x, conv_1d.weight, conv_1d.bias)

In [165]:
yf.eq(y).all()

tensor(True)

In [164]:
yf.size(), y.size()

(torch.Size([2, 32, 126]), torch.Size([2, 32, 126]))

In [167]:
x.size()

torch.Size([2, 4, 128])

In [168]:
x

tensor([[[-1.1258, -1.1524, -0.2506,  ...,  1.1648,  0.9234,  1.3873],
         [-0.8834, -0.4189, -0.8048,  ...,  0.1447,  1.9029,  0.3904],
         [-0.0394, -0.8015, -0.4955,  ...,  0.5541, -0.1817, -0.2345],
         [ 0.2942,  0.7973,  1.2642,  ..., -1.6989,  1.3094, -1.6613]],

        [[-0.5461, -0.6302, -0.6347,  ...,  0.2290,  1.2833, -1.3792],
         [ 0.5408, -0.9478,  0.2021,  ...,  1.6553,  0.5204, -0.2326],
         [ 0.4974,  0.2685,  1.4769,  ..., -1.3728,  1.6909, -0.4622],
         [ 0.2036, -1.0328,  1.1305,  ...,  0.5374,  1.0826, -1.7105]]])

In [173]:
F.pad(x, (3, 1))

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.9234,  1.3873,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  1.9029,  0.3904,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.1817, -0.2345,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  1.3094, -1.6613,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  1.2833, -1.3792,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.5204, -0.2326,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  1.6909, -0.4622,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  1.0826, -1.7105,  0.0000]]])

In [176]:
F.pad(x, (3, 1)).size()

torch.Size([2, 4, 132])

In [183]:
F.pad(x, (3, 1), value=1.)

tensor([[[ 1.0000,  1.0000,  1.0000,  ...,  0.9234,  1.3873,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.9029,  0.3904,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ..., -0.1817, -0.2345,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.3094, -1.6613,  1.0000]],

        [[ 1.0000,  1.0000,  1.0000,  ...,  1.2833, -1.3792,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  0.5204, -0.2326,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.6909, -0.4622,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0826, -1.7105,  1.0000]]])

In [189]:
F.pad(x, (3, 1, 2, 1), value=1.0), x.size(), F.pad(x, (3, 1, 2, 1), value=1.0).size()

(tensor([[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.9234,  1.3873,  1.0000],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ..., -0.1817, -0.2345,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.3094, -1.6613,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000]],
 
         [[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.2833, -1.3792,  1.0000],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  1.6909, -0.4622,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0826, -1.7105,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000]]]),
 torch.Size([2, 4, 128]),
 torch.Size([2, 7, 132]))

In [190]:
F.pad(x, (3, 1, 2, 1, 5, 2), value=1.0).size()

torch.Size([9, 7, 132])

In [198]:
z = torch.arange(4*3*5).view((4, 3, 5)).float() + 1
z

tensor([[[ 1.,  2.,  3.,  4.,  5.],
         [ 6.,  7.,  8.,  9., 10.],
         [11., 12., 13., 14., 15.]],

        [[16., 17., 18., 19., 20.],
         [21., 22., 23., 24., 25.],
         [26., 27., 28., 29., 30.]],

        [[31., 32., 33., 34., 35.],
         [36., 37., 38., 39., 40.],
         [41., 42., 43., 44., 45.]],

        [[46., 47., 48., 49., 50.],
         [51., 52., 53., 54., 55.],
         [56., 57., 58., 59., 60.]]])

In [200]:
F.pad(z, (2, 3), mode="constant")

tensor([[[ 0.,  0.,  1.,  2.,  3.,  4.,  5.,  0.,  0.,  0.],
         [ 0.,  0.,  6.,  7.,  8.,  9., 10.,  0.,  0.,  0.],
         [ 0.,  0., 11., 12., 13., 14., 15.,  0.,  0.,  0.]],

        [[ 0.,  0., 16., 17., 18., 19., 20.,  0.,  0.,  0.],
         [ 0.,  0., 21., 22., 23., 24., 25.,  0.,  0.,  0.],
         [ 0.,  0., 26., 27., 28., 29., 30.,  0.,  0.,  0.]],

        [[ 0.,  0., 31., 32., 33., 34., 35.,  0.,  0.,  0.],
         [ 0.,  0., 36., 37., 38., 39., 40.,  0.,  0.,  0.],
         [ 0.,  0., 41., 42., 43., 44., 45.,  0.,  0.,  0.]],

        [[ 0.,  0., 46., 47., 48., 49., 50.,  0.,  0.,  0.],
         [ 0.,  0., 51., 52., 53., 54., 55.,  0.,  0.,  0.],
         [ 0.,  0., 56., 57., 58., 59., 60.,  0.,  0.,  0.]]])

In [201]:
F.pad(z, (2, 3), mode="reflect")

tensor([[[ 3.,  2.,  1.,  2.,  3.,  4.,  5.,  4.,  3.,  2.],
         [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.,  7.],
         [13., 12., 11., 12., 13., 14., 15., 14., 13., 12.]],

        [[18., 17., 16., 17., 18., 19., 20., 19., 18., 17.],
         [23., 22., 21., 22., 23., 24., 25., 24., 23., 22.],
         [28., 27., 26., 27., 28., 29., 30., 29., 28., 27.]],

        [[33., 32., 31., 32., 33., 34., 35., 34., 33., 32.],
         [38., 37., 36., 37., 38., 39., 40., 39., 38., 37.],
         [43., 42., 41., 42., 43., 44., 45., 44., 43., 42.]],

        [[48., 47., 46., 47., 48., 49., 50., 49., 48., 47.],
         [53., 52., 51., 52., 53., 54., 55., 54., 53., 52.],
         [58., 57., 56., 57., 58., 59., 60., 59., 58., 57.]]])

In [206]:
F.pad(z, (2, 3, 2, 2), mode="reflect")

tensor([[[13., 12., 11., 12., 13., 14., 15., 14., 13., 12.],
         [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.,  7.],
         [ 3.,  2.,  1.,  2.,  3.,  4.,  5.,  4.,  3.,  2.],
         [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.,  7.],
         [13., 12., 11., 12., 13., 14., 15., 14., 13., 12.],
         [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.,  7.],
         [ 3.,  2.,  1.,  2.,  3.,  4.,  5.,  4.,  3.,  2.]],

        [[28., 27., 26., 27., 28., 29., 30., 29., 28., 27.],
         [23., 22., 21., 22., 23., 24., 25., 24., 23., 22.],
         [18., 17., 16., 17., 18., 19., 20., 19., 18., 17.],
         [23., 22., 21., 22., 23., 24., 25., 24., 23., 22.],
         [28., 27., 26., 27., 28., 29., 30., 29., 28., 27.],
         [23., 22., 21., 22., 23., 24., 25., 24., 23., 22.],
         [18., 17., 16., 17., 18., 19., 20., 19., 18., 17.]],

        [[43., 42., 41., 42., 43., 44., 45., 44., 43., 42.],
         [38., 37., 36., 37., 38., 39., 40., 39., 38., 37.],
         [33., 32., 

In [207]:
F.pad(z, (2, 3), mode="replicate")

tensor([[[ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.,  5.],
         [ 6.,  6.,  6.,  7.,  8.,  9., 10., 10., 10., 10.],
         [11., 11., 11., 12., 13., 14., 15., 15., 15., 15.]],

        [[16., 16., 16., 17., 18., 19., 20., 20., 20., 20.],
         [21., 21., 21., 22., 23., 24., 25., 25., 25., 25.],
         [26., 26., 26., 27., 28., 29., 30., 30., 30., 30.]],

        [[31., 31., 31., 32., 33., 34., 35., 35., 35., 35.],
         [36., 36., 36., 37., 38., 39., 40., 40., 40., 40.],
         [41., 41., 41., 42., 43., 44., 45., 45., 45., 45.]],

        [[46., 46., 46., 47., 48., 49., 50., 50., 50., 50.],
         [51., 51., 51., 52., 53., 54., 55., 55., 55., 55.],
         [56., 56., 56., 57., 58., 59., 60., 60., 60., 60.]]])

In [208]:
F.pad(z, (2, 3, 1, 2), mode="replicate")

tensor([[[ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.,  5.],
         [ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.,  5.],
         [ 6.,  6.,  6.,  7.,  8.,  9., 10., 10., 10., 10.],
         [11., 11., 11., 12., 13., 14., 15., 15., 15., 15.],
         [11., 11., 11., 12., 13., 14., 15., 15., 15., 15.],
         [11., 11., 11., 12., 13., 14., 15., 15., 15., 15.]],

        [[16., 16., 16., 17., 18., 19., 20., 20., 20., 20.],
         [16., 16., 16., 17., 18., 19., 20., 20., 20., 20.],
         [21., 21., 21., 22., 23., 24., 25., 25., 25., 25.],
         [26., 26., 26., 27., 28., 29., 30., 30., 30., 30.],
         [26., 26., 26., 27., 28., 29., 30., 30., 30., 30.],
         [26., 26., 26., 27., 28., 29., 30., 30., 30., 30.]],

        [[31., 31., 31., 32., 33., 34., 35., 35., 35., 35.],
         [31., 31., 31., 32., 33., 34., 35., 35., 35., 35.],
         [36., 36., 36., 37., 38., 39., 40., 40., 40., 40.],
         [41., 41., 41., 42., 43., 44., 45., 45., 45., 45.],
         [41., 41., 

In [212]:
F.pad(z, (2, 5), mode="circular")

tensor([[[ 4.,  5.,  1.,  2.,  3.,  4.,  5.,  1.,  2.,  3.,  4.,  5.],
         [ 9., 10.,  6.,  7.,  8.,  9., 10.,  6.,  7.,  8.,  9., 10.],
         [14., 15., 11., 12., 13., 14., 15., 11., 12., 13., 14., 15.]],

        [[19., 20., 16., 17., 18., 19., 20., 16., 17., 18., 19., 20.],
         [24., 25., 21., 22., 23., 24., 25., 21., 22., 23., 24., 25.],
         [29., 30., 26., 27., 28., 29., 30., 26., 27., 28., 29., 30.]],

        [[34., 35., 31., 32., 33., 34., 35., 31., 32., 33., 34., 35.],
         [39., 40., 36., 37., 38., 39., 40., 36., 37., 38., 39., 40.],
         [44., 45., 41., 42., 43., 44., 45., 41., 42., 43., 44., 45.]],

        [[49., 50., 46., 47., 48., 49., 50., 46., 47., 48., 49., 50.],
         [54., 55., 51., 52., 53., 54., 55., 51., 52., 53., 54., 55.],
         [59., 60., 56., 57., 58., 59., 60., 56., 57., 58., 59., 60.]]])

In [215]:
# Error
# F.pad(z, (2, 3, 1, 3), mode="circular")

In [217]:
# conv1d with dilation
torch.manual_seed(0)
x = torch.randn(2, 4, 128) #bz, feat, timestep
conv_1d = nn.Conv1d(4, 32, 3, dilation=2)
y = conv_1d(x)

In [218]:
y.size()

torch.Size([2, 32, 124])

In [219]:
conv_1d.weight.size()

torch.Size([32, 4, 3])

In [220]:
x[0, :, 0:6:2].size()

torch.Size([4, 3])

In [225]:
(x[0, :, 0:6:2] * conv_1d.weight[0]).sum() + conv_1d.bias[0]

tensor(0.0894, grad_fn=<AddBackward0>)

In [223]:
y[0, 0, 0]

tensor(0.0894, grad_fn=<SelectBackward0>)

In [230]:
# conv1d with group
torch.manual_seed(0)
x = torch.randn(2, 4, 128) #bz, feat, timestep
conv_1d = nn.Conv1d(4, 32, 3, groups=2)
y = conv_1d(x)

In [231]:
y.size()

torch.Size([2, 32, 126])

In [232]:
conv_1d.weight.size()

torch.Size([32, 2, 3])

In [264]:
y[0, 0, 0]

tensor(0.3963, grad_fn=<SelectBackward0>)

In [265]:
(x[0, 0:2, 0:3] * conv_1d.weight[0, ]).sum() + conv_1d.bias[0].sum()

tensor(0.3963, grad_fn=<AddBackward0>)

In [281]:
# conv2d
x = torch.randn(4, 3, 64, 64)
conv_2d = nn.Conv2d(3, 32, 3)
y = conv_2d(x)

In [282]:
y.size()

torch.Size([4, 32, 62, 62])

In [283]:
# conv2d padding mode
x = torch.randn(4, 3, 64, 64)
conv_2d = nn.Conv2d(3, 32, 3, padding="same")
y = conv_2d(x)

In [284]:
y.size()

torch.Size([4, 32, 64, 64])

In [285]:
# conv2d padding mode
x = torch.randn(4, 3, 64, 64)
conv_2d = nn.Conv2d(3, 32, 3, padding="valid")
y = conv_2d(x)

In [286]:
y.size()

torch.Size([4, 32, 62, 62])

In [292]:
# conv3d padding mode
x = torch.randn(4, 16, 3, 64, 64)
conv_3d = nn.Conv3d(16, 32, 3)
y = conv_3d(x)

In [293]:
y.size()

torch.Size([4, 32, 1, 62, 62])

In [295]:
# conv1d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64)
conv_1d_t = nn.ConvTranspose1d(32, 16, kernel_size=3)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 66])

In [296]:
conv_1d_t.weight.size()

torch.Size([32, 16, 3])

In [297]:
# conv1d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64)
conv_1d_t = nn.ConvTranspose1d(32, 16, kernel_size=3, padding=4)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 58])

In [301]:
# conv1d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64)
conv_1d_t = nn.ConvTranspose1d(32, 16, kernel_size=3, padding=1)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 64])

In [309]:
# conv1d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64)
conv_1d_t = nn.ConvTranspose1d(32, 16, kernel_size=5, padding=0)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 68])

In [310]:
# conv1d 
torch.manual_seed(0)
x = torch.randn(4, 32, 68)
conv_1d_t = nn.Conv1d(32, 16, kernel_size=5, padding=0)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 64])

In [311]:
# conv1d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64)
conv_1d_t = nn.ConvTranspose1d(32, 16, kernel_size=5, padding=5)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 58])

In [312]:
# conv1d 
torch.manual_seed(0)
x = torch.randn(4, 32, 58)
conv_1d_t = nn.Conv1d(32, 16, kernel_size=5, padding=5)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 64])

In [314]:
# conv2d transpose
torch.manual_seed(0)
x = torch.randn(4, 32, 64, 64)
conv_1d_t = nn.ConvTranspose2d(32, 16, kernel_size=3)
y = conv_1d_t(x)
y.size()

torch.Size([4, 16, 66, 66])

In [337]:
# conv2d transpose
torch.manual_seed(0)
x = torch.randn(4, 3, 32, 32)
weight = torch.randn(10, 3, 3, 3)
unfold = nn.Unfold(3, ) # Flatten the image patch to convolve and build blocks
y = unfold(x)

In [338]:
unfold

Unfold(kernel_size=3, dilation=1, padding=0, stride=1)

In [339]:
y.size()

torch.Size([4, 27, 900])

In [353]:
y[0]

tensor([[-1.1258, -1.1524, -0.2506,  ..., -0.2407, -0.6251,  0.8161],
        [-1.1524, -0.2506, -0.4339,  ..., -0.6251,  0.8161, -0.5711],
        [-0.2506, -0.4339,  0.8487,  ...,  0.8161, -0.5711, -0.1195],
        ...,
        [-1.1870, -0.8221,  0.6051,  ...,  1.3223, -2.0006, -0.6380],
        [-0.8221,  0.6051, -0.9905,  ..., -2.0006, -0.6380, -1.1714],
        [ 0.6051, -0.9905, -0.2534,  ..., -0.6380, -1.1714, -0.8415]])

In [342]:
weight.view(10, -1).size()

torch.Size([10, 27])

In [344]:
out = torch.matmul(weight.view(10, -1), y)

In [345]:
out.size()

torch.Size([4, 10, 900])

In [346]:
conv2d = nn.Conv2d(3, 10, 3, bias=False)
conv2d.weight.data = weight

In [348]:
conv2d(x).size()

torch.Size([4, 10, 30, 30])

In [351]:
(conv2d(x).view(4, 10, -1) - out).max()

tensor(5.7220e-06, grad_fn=<MaxBackward1>)

In [358]:
(nn.Fold((32, 32), 3)(y) ).size()

torch.Size([4, 3, 32, 32])

In [359]:
# pooling
x = torch.randn(4, 8, 27)
pool = nn.MaxPool1d(3)
y = pool(x)
y.size()

torch.Size([4, 8, 9])

In [360]:
# pooling
x = torch.randn(4, 8, 27)
pool = nn.MaxPool1d(3, stride=1)
y = pool(x)
y.size()

torch.Size([4, 8, 25])

In [362]:
# pooling
x = torch.randn(4, 8, 27)
pool = nn.AvgPool1d(3,)
y = pool(x)
y.size()

torch.Size([4, 8, 9])

In [373]:
# pooling
x = torch.arange(100).view(1, 1, 10, 10).float()
pool = nn.AdaptiveMaxPool2d(output_size=6)
y = pool(x)
y.size()

torch.Size([1, 1, 6, 6])

In [374]:
y

tensor([[[[11., 13., 14., 16., 18., 19.],
          [31., 33., 34., 36., 38., 39.],
          [41., 43., 44., 46., 48., 49.],
          [61., 63., 64., 66., 68., 69.],
          [81., 83., 84., 86., 88., 89.],
          [91., 93., 94., 96., 98., 99.]]]])

## Padding

In [378]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ReflectionPad2d(2)
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[13., 12., 11., 12., 13., 14., 15., 14., 13.],
           [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.],
           [ 3.,  2.,  1.,  2.,  3.,  4.,  5.,  4.,  3.],
           [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.],
           [13., 12., 11., 12., 13., 14., 15., 14., 13.],
           [18., 17., 16., 17., 18., 19., 20., 19., 18.],
           [13., 12., 11., 12., 13., 14., 15., 14., 13.],
           [ 8.,  7.,  6.,  7.,  8.,  9., 10.,  9.,  8.]],
 
          [[33., 32., 31., 32., 33., 34., 35., 34., 33.],
         

In [380]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ReflectionPad2d((0, 2, 0, 0))
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 1.,  2.,  3.,  4.,  5.,  4.,  3.],
           [ 6.,  7.,  8.,  9., 10.,  9.,  8.],
           [11., 12., 13., 14., 15., 14., 13.],
           [16., 17., 18., 19., 20., 19., 18.]],
 
          [[21., 22., 23., 24., 25., 24., 23.],
           [26., 27., 28., 29., 30., 29., 28.],
           [31., 32., 33., 34., 35., 34., 33.],
           [36., 37., 38., 39., 40., 39., 38.]],
 
          [[41., 42., 43., 44., 45., 44., 43.],
           [46., 47., 48., 49., 50., 49., 48.],
           [51., 52., 53., 54., 55., 54., 53.],


In [383]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ReflectionPad2d((1, 2, 0, 0))
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 2.,  1.,  2.,  3.,  4.,  5.,  4.,  3.],
           [ 7.,  6.,  7.,  8.,  9., 10.,  9.,  8.],
           [12., 11., 12., 13., 14., 15., 14., 13.],
           [17., 16., 17., 18., 19., 20., 19., 18.]],
 
          [[22., 21., 22., 23., 24., 25., 24., 23.],
           [27., 26., 27., 28., 29., 30., 29., 28.],
           [32., 31., 32., 33., 34., 35., 34., 33.],
           [37., 36., 37., 38., 39., 40., 39., 38.]],
 
          [[42., 41., 42., 43., 44., 45., 44., 43.],
           [47., 46., 47., 48., 49., 50., 49., 48.]

In [385]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ReplicationPad2d(2)
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.],
           [ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.],
           [ 1.,  1.,  1.,  2.,  3.,  4.,  5.,  5.,  5.],
           [ 6.,  6.,  6.,  7.,  8.,  9., 10., 10., 10.],
           [11., 11., 11., 12., 13., 14., 15., 15., 15.],
           [16., 16., 16., 17., 18., 19., 20., 20., 20.],
           [16., 16., 16., 17., 18., 19., 20., 20., 20.],
           [16., 16., 16., 17., 18., 19., 20., 20., 20.]],
 
          [[21., 21., 21., 22., 23., 24., 25., 25., 25.],
         

In [387]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ReplicationPad2d((2, 0, 0, 0))
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 1.,  1.,  1.,  2.,  3.,  4.,  5.],
           [ 6.,  6.,  6.,  7.,  8.,  9., 10.],
           [11., 11., 11., 12., 13., 14., 15.],
           [16., 16., 16., 17., 18., 19., 20.]],
 
          [[21., 21., 21., 22., 23., 24., 25.],
           [26., 26., 26., 27., 28., 29., 30.],
           [31., 31., 31., 32., 33., 34., 35.],
           [36., 36., 36., 37., 38., 39., 40.]],
 
          [[41., 41., 41., 42., 43., 44., 45.],
           [46., 46., 46., 47., 48., 49., 50.],
           [51., 51., 51., 52., 53., 54., 55.],


In [389]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ZeroPad2d(2)
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  0.,  1.,  2.,  3.,  4.,  5.,  0.,  0.],
           [ 0.,  0.,  6.,  7.,  8.,  9., 10.,  0.,  0.],
           [ 0.,  0., 11., 12., 13., 14., 15.,  0.,  0.],
           [ 0.,  0., 16., 17., 18., 19., 20.,  0.,  0.],
           [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
           [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]],
 
          [[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         

In [388]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ZeroPad2d((2, 0, 0, 0))
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[ 0.,  0.,  1.,  2.,  3.,  4.,  5.],
           [ 0.,  0.,  6.,  7.,  8.,  9., 10.],
           [ 0.,  0., 11., 12., 13., 14., 15.],
           [ 0.,  0., 16., 17., 18., 19., 20.]],
 
          [[ 0.,  0., 21., 22., 23., 24., 25.],
           [ 0.,  0., 26., 27., 28., 29., 30.],
           [ 0.,  0., 31., 32., 33., 34., 35.],
           [ 0.,  0., 36., 37., 38., 39., 40.]],
 
          [[ 0.,  0., 41., 42., 43., 44., 45.],
           [ 0.,  0., 46., 47., 48., 49., 50.],
           [ 0.,  0., 51., 52., 53., 54., 55.],


In [391]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ConstantPad2d(2, value=-100)
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[-100., -100., -100., -100., -100., -100., -100., -100., -100.],
           [-100., -100., -100., -100., -100., -100., -100., -100., -100.],
           [-100., -100.,    1.,    2.,    3.,    4.,    5., -100., -100.],
           [-100., -100.,    6.,    7.,    8.,    9.,   10., -100., -100.],
           [-100., -100.,   11.,   12.,   13.,   14.,   15., -100., -100.],
           [-100., -100.,   16.,   17.,   18.,   19.,   20., -100., -100.],
           [-100., -100., -100., -100., -100., -100., -100., -100., -100.],
  

In [393]:
# reflection 1d
x = torch.arange(3*4*5, dtype=torch.float).reshape(1, 3, 4, 5) + 1
pad2d = nn.ConstantPad2d((1, 0, 1, 0), value=-100)#left, right, top, bottom
y = pad2d(x)
x, y, x.size(), y.size()

(tensor([[[[ 1.,  2.,  3.,  4.,  5.],
           [ 6.,  7.,  8.,  9., 10.],
           [11., 12., 13., 14., 15.],
           [16., 17., 18., 19., 20.]],
 
          [[21., 22., 23., 24., 25.],
           [26., 27., 28., 29., 30.],
           [31., 32., 33., 34., 35.],
           [36., 37., 38., 39., 40.]],
 
          [[41., 42., 43., 44., 45.],
           [46., 47., 48., 49., 50.],
           [51., 52., 53., 54., 55.],
           [56., 57., 58., 59., 60.]]]]),
 tensor([[[[-100., -100., -100., -100., -100., -100.],
           [-100.,    1.,    2.,    3.,    4.,    5.],
           [-100.,    6.,    7.,    8.,    9.,   10.],
           [-100.,   11.,   12.,   13.,   14.,   15.],
           [-100.,   16.,   17.,   18.,   19.,   20.]],
 
          [[-100., -100., -100., -100., -100., -100.],
           [-100.,   21.,   22.,   23.,   24.,   25.],
           [-100.,   26.,   27.,   28.,   29.,   30.],
           [-100.,   31.,   32.,   33.,   34.,   35.],
           [-100.,   36.,   37.,   3

## Function Activation

In [396]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-5, 5)
y = nn.ReLU()(x) # max(x, 0)
z = F.relu(x)
x, y, z

(tensor([[-0.0374,  2.6822, -4.1152, -3.6797, -1.9258],
         [ 1.3408, -0.0991,  3.9644, -0.4437,  1.3231],
         [-1.5111, -0.9828, -4.7767, -3.3114, -2.0611],
         [ 0.1852,  1.9767,  3.0001, -3.3897, -2.1773]]),
 tensor([[0.0000, 2.6822, 0.0000, 0.0000, 0.0000],
         [1.3408, 0.0000, 3.9644, 0.0000, 1.3231],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1852, 1.9767, 3.0001, 0.0000, 0.0000]]),
 tensor([[0.0000, 2.6822, 0.0000, 0.0000, 0.0000],
         [1.3408, 0.0000, 3.9644, 0.0000, 1.3231],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1852, 1.9767, 3.0001, 0.0000, 0.0000]]))

In [402]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 10)
y = nn.LeakyReLU(negative_slope=0.1)(x)  # max(0,x) + negative_slope ∗ min(0,x)
z = F.leaky_relu(x, negative_slope=0.1)
x, y, z

(tensor([[-0.0749,  5.3644, -8.2305, -7.3594, -3.8515],
         [ 2.6816, -0.1981,  7.9289, -0.8874,  2.6461],
         [-3.0221, -1.9657, -9.5535, -6.6228, -4.1222],
         [ 0.3704,  3.9534,  6.0002, -6.7794, -4.3546]]),
 tensor([[-7.4868e-03,  5.3644e+00, -8.2305e-01, -7.3594e-01, -3.8515e-01],
         [ 2.6816e+00, -1.9813e-02,  7.9289e+00, -8.8744e-02,  2.6461e+00],
         [-3.0221e-01, -1.9657e-01, -9.5535e-01, -6.6228e-01, -4.1222e-01],
         [ 3.7044e-01,  3.9534e+00,  6.0002e+00, -6.7794e-01, -4.3546e-01]]),
 tensor([[-7.4868e-03,  5.3644e+00, -8.2305e-01, -7.3594e-01, -3.8515e-01],
         [ 2.6816e+00, -1.9813e-02,  7.9289e+00, -8.8744e-02,  2.6461e+00],
         [-3.0221e-01, -1.9657e-01, -9.5535e-01, -6.6228e-01, -4.1222e-01],
         [ 3.7044e-01,  3.9534e+00,  6.0002e+00, -6.7794e-01, -4.3546e-01]]))

In [397]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-5, 5)
y = nn.Sigmoid()(x)
z = F.sigmoid(x)
x, y, z

(tensor([[-0.0374,  2.6822, -4.1152, -3.6797, -1.9258],
         [ 1.3408, -0.0991,  3.9644, -0.4437,  1.3231],
         [-1.5111, -0.9828, -4.7767, -3.3114, -2.0611],
         [ 0.1852,  1.9767,  3.0001, -3.3897, -2.1773]]),
 tensor([[0.4906, 0.9360, 0.0161, 0.0246, 0.1272],
         [0.7926, 0.4753, 0.9814, 0.3909, 0.7897],
         [0.1808, 0.2723, 0.0084, 0.0352, 0.1129],
         [0.5462, 0.8783, 0.9526, 0.0326, 0.1018]]),
 tensor([[0.4906, 0.9360, 0.0161, 0.0246, 0.1272],
         [0.7926, 0.4753, 0.9814, 0.3909, 0.7897],
         [0.1808, 0.2723, 0.0084, 0.0352, 0.1129],
         [0.5462, 0.8783, 0.9526, 0.0326, 0.1018]]))

In [399]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-5, 5)
y = nn.Tanh()(x)
z = F.tanh(x)
x, y, z

(tensor([[-0.0374,  2.6822, -4.1152, -3.6797, -1.9258],
         [ 1.3408, -0.0991,  3.9644, -0.4437,  1.3231],
         [-1.5111, -0.9828, -4.7767, -3.3114, -2.0611],
         [ 0.1852,  1.9767,  3.0001, -3.3897, -2.1773]]),
 tensor([[-0.0374,  0.9907, -0.9995, -0.9987, -0.9584],
         [ 0.8719, -0.0987,  0.9993, -0.4167,  0.8675],
         [-0.9071, -0.7543, -0.9999, -0.9973, -0.9681],
         [ 0.1831,  0.9623,  0.9951, -0.9977, -0.9746]]),
 tensor([[-0.0374,  0.9907, -0.9995, -0.9987, -0.9584],
         [ 0.8719, -0.0987,  0.9993, -0.4167,  0.8675],
         [-0.9071, -0.7543, -0.9999, -0.9973, -0.9681],
         [ 0.1831,  0.9623,  0.9951, -0.9977, -0.9746]]))

In [400]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-5, 5)
y = nn.LogSigmoid()(x)
z = F.logsigmoid(x)
x, y, z

(tensor([[-0.0374,  2.6822, -4.1152, -3.6797, -1.9258],
         [ 1.3408, -0.0991,  3.9644, -0.4437,  1.3231],
         [-1.5111, -0.9828, -4.7767, -3.3114, -2.0611],
         [ 0.1852,  1.9767,  3.0001, -3.3897, -2.1773]]),
 tensor([[-0.7120, -0.0662, -4.1314, -3.7046, -2.0618],
         [-0.2324, -0.7439, -0.0188, -0.9394, -0.2361],
         [-1.7105, -1.3007, -4.7851, -3.3472, -2.1810],
         [-0.6048, -0.1297, -0.0486, -3.4229, -2.2847]]),
 tensor([[-0.7120, -0.0662, -4.1314, -3.7046, -2.0618],
         [-0.2324, -0.7439, -0.0188, -0.9394, -0.2361],
         [-1.7105, -1.3007, -4.7851, -3.3472, -2.1810],
         [-0.6048, -0.1297, -0.0486, -3.4229, -2.2847]]))

In [398]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-5, 5)
y = nn.ELU()(x) # x or a * (exp(x) - 1)
z = F.elu(x)
x, y, z

(tensor([[-0.0374,  2.6822, -4.1152, -3.6797, -1.9258],
         [ 1.3408, -0.0991,  3.9644, -0.4437,  1.3231],
         [-1.5111, -0.9828, -4.7767, -3.3114, -2.0611],
         [ 0.1852,  1.9767,  3.0001, -3.3897, -2.1773]]),
 tensor([[-0.0367,  2.6822, -0.9837, -0.9748, -0.8542],
         [ 1.3408, -0.0943,  3.9644, -0.3584,  1.3231],
         [-0.7793, -0.6257, -0.9916, -0.9635, -0.8727],
         [ 0.1852,  1.9767,  3.0001, -0.9663, -0.8867]]),
 tensor([[-0.0367,  2.6822, -0.9837, -0.9748, -0.8542],
         [ 1.3408, -0.0943,  3.9644, -0.3584,  1.3231],
         [-0.7793, -0.6257, -0.9916, -0.9635, -0.8727],
         [ 0.1852,  1.9767,  3.0001, -0.9663, -0.8867]]))

In [404]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.ReLU6()(x) #  min(max(0,x), 6)
z = F.relu6(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[4.8877, 6.0000, 0.0000, 0.0000, 0.0000],
         [6.0000, 4.7028, 6.0000, 3.6688, 6.0000],
         [0.4668, 2.0515, 0.0000, 0.0000, 0.0000],
         [5.5557, 6.0000, 6.0000, 0.0000, 0.0000]]),
 tensor([[4.8877, 6.0000, 0.0000, 0.0000, 0.0000],
         [6.0000, 4.7028, 6.0000, 3.6688, 6.0000],
         [0.4668, 2.0515, 0.0000, 0.0000, 0.0000],
         [5.5557, 6.0000, 6.0000, 0.0000, 0.0000]]))

In [410]:
torch.manual_seed(0)
x = torch.empty(4).uniform_(-10, 20)
prelu = nn.PReLU(1) # PReLU(x) = max(0, x) + a ∗ min(0, x)
y = prelu(x) 
z = F.prelu(x, prelu.weight)
x, y, z, prelu.weight

(tensor([ 4.8877, 13.0467, -7.3457, -6.0391]),
 tensor([ 4.8877, 13.0467, -1.8364, -1.5098], grad_fn=<PreluKernelBackward0>),
 tensor([ 4.8877, 13.0467, -1.8364, -1.5098], grad_fn=<PreluKernelBackward0>),
 Parameter containing:
 tensor([0.2500], requires_grad=True))

In [408]:
-7.3457 * 0.25

-1.836425

In [423]:
torch.manual_seed(0)
x = torch.empty(4, 3, 2).uniform_(-10, 20)
prelu = nn.PReLU(3)  # 
prelu.weight.data *= torch.randn(3).abs() 
y = prelu(x) 
z = F.prelu(x, prelu.weight)
x, y, z, prelu.weight

(tensor([[[ 4.8877, 13.0467],
          [-7.3457, -6.0391],
          [-0.7773,  9.0224]],
 
         [[ 4.7028, 16.8933],
          [ 3.6688,  8.9692],
          [ 0.4668,  2.0515]],
 
         [[-9.3302, -4.9342],
          [-1.1833,  5.5557],
          [10.9300, 14.0003]],
 
         [[-5.1691, -1.5319],
          [10.4483, 17.4558],
          [ 1.9130, 16.2247]]]),
 tensor([[[ 4.8877, 13.0467],
          [-2.0212, -1.6617],
          [-0.2082,  9.0224]],
 
         [[ 4.7028, 16.8933],
          [ 3.6688,  8.9692],
          [ 0.4668,  2.0515]],
 
         [[-1.9982, -1.0568],
          [-0.3256,  5.5557],
          [10.9300, 14.0003]],
 
         [[-1.1071, -0.3281],
          [10.4483, 17.4558],
          [ 1.9130, 16.2247]]], grad_fn=<PreluKernelBackward0>),
 tensor([[[ 4.8877, 13.0467],
          [-2.0212, -1.6617],
          [-0.2082,  9.0224]],
 
         [[ 4.7028, 16.8933],
          [ 3.6688,  8.9692],
          [ 0.4668,  2.0515]],
 
         [[-1.9982, -1.0568],
        

In [424]:
-4.9342 * 0.2678

-1.3213787599999998

In [426]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.Hardswish()(x)
z = F.hardswish(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[ 4.8877, 13.0467, -0.0000, -0.0000, -0.2880],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.2697,  1.7272, -0.0000, -0.0000, -0.3583],
         [ 5.5557, 10.9300, 14.0003, -0.0000, -0.3748]]),
 tensor([[ 4.8877, 13.0467, -0.0000, -0.0000, -0.2880],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.2697,  1.7272, -0.0000, -0.0000, -0.3583],
         [ 5.5557, 10.9300, 14.0003, -0.0000, -0.3748]]))

In [427]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.SELU()(x) # =scale∗(max(0,x)+min(0,α∗(exp(x)−1))) => scale * ELU(x)
z = F.selu(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[ 5.1355, 13.7081, -1.7570, -1.7539, -0.9500],
         [ 9.4798,  4.9412, 17.7499,  3.8549,  9.4239],
         [ 0.4905,  2.1555, -1.7579, -1.7454, -1.2197],
         [ 5.8373, 11.4842, 14.7102, -1.7481, -1.3781]]),
 tensor([[ 5.1355, 13.7081, -1.7570, -1.7539, -0.9500],
         [ 9.4798,  4.9412, 17.7499,  3.8549,  9.4239],
         [ 0.4905,  2.1555, -1.7579, -1.7454, -1.2197],
         [ 5.8373, 11.4842, 14.7102, -1.7481, -1.3781]]))

In [428]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.CELU()(x) # =max(0,x)+min(0,α∗(exp(x/α)−1))
z = F.celu(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[ 4.8877, 13.0467, -0.9994, -0.9976, -0.5404],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -0.9999, -0.9928, -0.6937],
         [ 5.5557, 10.9300, 14.0003, -0.9943, -0.7839]]),
 tensor([[ 4.8877, 13.0467, -0.9994, -0.9976, -0.5404],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -0.9999, -0.9928, -0.6937],
         [ 5.5557, 10.9300, 14.0003, -0.9943, -0.7839]]))

In [429]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.GELU()(x) # GELU(x)=x∗Φ(x)
z = F.gelu(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[ 4.8877e+00,  1.3047e+01,  0.0000e+00,  0.0000e+00, -1.6983e-01],
         [ 9.0224e+00,  4.7028e+00,  1.6893e+01,  3.6684e+00,  8.9692e+00],
         [ 3.1728e-01,  2.0103e+00,  0.0000e+00, -2.0587e-06, -1.4003e-01],
         [ 5.5557e+00,  1.0930e+01,  1.4000e+01, -6.1621e-07, -9.6157e-02]]),
 tensor([[ 4.8877e+00,  1.3047e+01,  0.0000e+00,  0.0000e+00, -1.6983e-01],
         [ 9.0224e+00,  4.7028e+00,  1.6893e+01,  3.6684e+00,  8.9692e+00],
         [ 3.1728e-01,  2.0103e+00,  0.0000e+00, -2.0587e-06, -1.4003e-01],
         [ 5.5557e+00,  1.0930e+01,  1.4000e+01, -6.1621e-07, -9.6157e-02]]))

In [430]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.SiLU()(x) # SILU(x)=x∗σ(x)
z = F.silu(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[ 4.8511e+00,  1.3047e+01, -4.7377e-03, -1.4361e-02, -2.4478e-01],
         [ 9.0213e+00,  4.6605e+00,  1.6893e+01,  3.5776e+00,  8.9680e+00],
         [ 2.8691e-01,  1.8179e+00, -8.2754e-04, -3.5253e-02, -2.7744e-01],
         [ 5.5343e+00,  1.0930e+01,  1.4000e+01, -2.9244e-02, -2.7224e-01]]),
 tensor([[ 4.8511e+00,  1.3047e+01, -4.7377e-03, -1.4361e-02, -2.4478e-01],
         [ 9.0213e+00,  4.6605e+00,  1.6893e+01,  3.5776e+00,  8.9680e+00],
         [ 2.8691e-01,  1.8179e+00, -8.2754e-04, -3.5253e-02, -2.7744e-01],
         [ 5.5343e+00,  1.0930e+01,  1.4000e+01, -2.9244e-02, -2.7224e-01]]))

In [431]:
torch.manual_seed(0)
x = torch.empty(4, 5).uniform_(-10, 20)
y = nn.Softplus()(x) # Softplus(x) = (1/β) ∗log(1+exp(β∗x))
z = F.softplus(x)
x, y, z

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
         [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
         [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]]),
 tensor([[4.8952e+00, 1.3047e+01, 6.4517e-04, 2.3809e-03, 3.7819e-01],
         [9.0225e+00, 4.7118e+00, 1.6893e+01, 3.6940e+00, 8.9693e+00],
         [9.5354e-01, 2.1724e+00, 8.8698e-05, 7.1702e-03, 2.6716e-01],
         [5.5595e+00, 1.0930e+01, 1.4000e+01, 5.6735e-03, 1.9566e-01]]),
 tensor([[4.8952e+00, 1.3047e+01, 6.4517e-04, 2.3809e-03, 3.7819e-01],
         [9.0225e+00, 4.7118e+00, 1.6893e+01, 3.6940e+00, 8.9693e+00],
         [9.5354e-01, 2.1724e+00, 8.8698e-05, 7.1702e-03, 2.6716e-01],
         [5.5595e+00, 1.0930e+01, 1.4000e+01, 5.6735e-03, 1.9566e-01]]))

In [437]:
torch.manual_seed(0)
x = torch.empty(2, 4, 5).uniform_(-10, 20)
y = nn.Softmax(-1)(x)
z = F.softmax(x, dim=-1)
x, y, z, y.sum(-1)

(tensor([[[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
          [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
          [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
          [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]],
 
         [[10.4483, 17.4558,  1.9130, 16.2247,  2.5822],
          [ 6.5872, 18.5821, -8.9151, -4.4431,  1.2025],
          [-0.8470, 17.9600, -4.7227, -1.9050, -5.4796],
          [-9.0484, -3.7561, 17.8940, 11.6933, 12.2701]]]),
 tensor([[[2.8608e-04, 9.9971e-01, 1.3919e-09, 5.1410e-09, 9.9129e-07],
          [3.8137e-04, 5.0745e-06, 9.9925e-01, 1.8045e-06, 3.6162e-04],
          [1.6462e-01, 8.0301e-01, 9.1557e-06, 7.4276e-04, 3.1611e-02],
          [2.0546e-04, 4.4339e-02, 9.5545e-01, 4.5188e-09, 1.7164e-07]],
 
         [[7.0000e-04, 7.7348e-01, 1.3749e-07, 2.2582e-01, 2.6850e-07],
          [6.1754e-06, 9.9999e-01, 1.1432e-12, 1.0006e-10, 2.8322e-08],
          [6.7954e-09, 1.0000e+00, 1.4094e-10, 2.3591e-09, 6.6115e-11],
          [1.9798e-12, 3.9359e-

In [438]:
torch.manual_seed(0)
x = torch.empty(2, 4, 5).uniform_(-10, 20)
y = nn.Softmax(-2)(x)
z = F.softmax(x, dim=-2)
x, y, z, y.sum(-2)

(tensor([[[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
          [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692],
          [ 0.4668,  2.0515, -9.3302, -4.9342, -1.1833],
          [ 5.5557, 10.9300, 14.0003, -5.1691, -1.5319]],
 
         [[10.4483, 17.4558,  1.9130, 16.2247,  2.5822],
          [ 6.5872, 18.5821, -8.9151, -4.4431,  1.2025],
          [-0.8470, 17.9600, -4.7227, -1.9050, -5.4796],
          [-9.0484, -3.7561, 17.8940, 11.6933, 12.2701]]]),
 tensor([[[1.5283e-02, 8.9231e-01, 2.8165e-11, 6.0776e-05, 5.8492e-05],
          [9.5473e-01, 2.1224e-04, 9.4750e-01, 9.9961e-01, 9.9987e-01],
          [1.8376e-04, 1.4976e-05, 3.8710e-12, 1.8347e-04, 3.8972e-05],
          [2.9806e-02, 1.0747e-01, 5.2501e-02, 1.4506e-04, 2.7502e-05]],
 
         [[9.7938e-01, 1.7422e-01, 1.1470e-07, 9.8935e-01, 6.2029e-05],
          [2.0612e-02, 5.3734e-01, 2.2750e-12, 1.0458e-09, 1.5609e-05],
          [1.2175e-05, 2.8844e-01, 1.5056e-10, 1.3235e-08, 1.9560e-08],
          [3.3393e-09, 1.0687e-

In [442]:
torch.manual_seed(0)
x = torch.empty(2, 5).uniform_(-10, 20)
y = nn.LogSoftmax(-1)(x)
z = F.log_softmax(x, dim=-1) # log(Softmax(x)) 
x, y, z, (F.softmax(x, -1).log())

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692]]),
 tensor([[-8.1592e+00, -2.8701e-04, -2.0393e+01, -1.9086e+01, -1.3824e+01],
         [-7.8717e+00, -1.2191e+01, -7.5014e-04, -1.3225e+01, -7.9249e+00]]),
 tensor([[-8.1592e+00, -2.8701e-04, -2.0393e+01, -1.9086e+01, -1.3824e+01],
         [-7.8717e+00, -1.2191e+01, -7.5014e-04, -1.3225e+01, -7.9249e+00]]),
 tensor([[-8.1592e+00, -2.8704e-04, -2.0393e+01, -1.9086e+01, -1.3824e+01],
         [-7.8717e+00, -1.2191e+01, -7.5017e-04, -1.3225e+01, -7.9249e+00]]))

In [445]:
torch.manual_seed(0)
x = torch.empty(2,  5).uniform_(-10, 20)
y = nn.Softmin(-2)(x)
z = F.softmin(x, dim=-2)
x, y, z, y.sum(-2)

(tensor([[ 4.8877, 13.0467, -7.3457, -6.0391, -0.7773],
         [ 9.0224,  4.7028, 16.8933,  3.6688,  8.9692]]),
 tensor([[9.8424e-01, 2.3780e-04, 1.0000e+00, 9.9994e-01, 9.9994e-01],
         [1.5756e-02, 9.9976e-01, 2.9725e-11, 6.0796e-05, 5.8495e-05]]),
 tensor([[9.8424e-01, 2.3780e-04, 1.0000e+00, 9.9994e-01, 9.9994e-01],
         [1.5756e-02, 9.9976e-01, 2.9725e-11, 6.0796e-05, 5.8495e-05]]),
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000]))

In [446]:
torch.manual_seed(0)
x = torch.empty(2,  5).uniform_(-10, 20)
F.softmax(x, dim=-2), F.softmin(x, dim=-2)

(tensor([[1.5756e-02, 9.9976e-01, 2.9725e-11, 6.0796e-05, 5.8495e-05],
         [9.8424e-01, 2.3780e-04, 1.0000e+00, 9.9994e-01, 9.9994e-01]]),
 tensor([[9.8424e-01, 2.3780e-04, 1.0000e+00, 9.9994e-01, 9.9994e-01],
         [1.5756e-02, 9.9976e-01, 2.9725e-11, 6.0796e-05, 5.8495e-05]]))

## Normalization Layers

In [457]:
torch.manual_seed(0)
net = nn.Sequential(nn.Linear(3, 5), nn.BatchNorm1d(5), nn.Linear(5, 1)) # Normalize per feature
x = torch.randn(4, 3)
net(x)

tensor([[ 0.8508],
        [-1.2494],
        [ 0.1558],
        [ 0.4321]], grad_fn=<AddmmBackward0>)

In [454]:
bn1d = net[1]
bn1d

BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [455]:
bn1d.running_mean.shape

torch.Size([5])

In [459]:
bn1d.weight, bn1d.bias

(Parameter containing:
 tensor([1., 1., 1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0.], requires_grad=True))

In [464]:
torch.manual_seed(0)
net = nn.Sequential(nn.Conv1d(3, 16, 3), nn.BatchNorm1d(16), nn.Flatten(1, 2), nn.Linear(16*8, 1)) # Normalize per feature
x = torch.randn(4, 3, 10)
net(x)

tensor([[ 0.5796],
        [-0.2985],
        [-0.4648],
        [-0.3327]], grad_fn=<AddmmBackward0>)

In [465]:
bn1d = net[1]
bn1d

BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [466]:
bn1d.weight, bn1d.bias

(Parameter containing:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        requires_grad=True))

In [469]:
bn1d.running_mean

tensor([-3.6580e-02, -2.3724e-02,  3.8010e-03,  7.1910e-03,  7.4824e-03,
        -2.5950e-02, -2.6184e-02, -1.8934e-02,  2.5612e-02,  2.5042e-02,
         3.6823e-02, -2.1588e-05,  1.9561e-02, -1.0119e-02, -1.7656e-02,
        -2.9119e-02])

In [476]:
# Batch normalized via per (C, H, W) over N batches
torch.manual_seed(0)
net = nn.Sequential(nn.Conv2d(3, 16, 3), nn.BatchNorm2d(16), nn.Flatten(1, -1), nn.LazyLinear(1)) # Normalize per feature
x = torch.randn(4, 3, 64, 64)
net(x).size()

torch.Size([4, 1])

In [477]:
bn2d = net[1]
bn2d

BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [478]:
bn2d.weight, bn2d.bias

(Parameter containing:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        requires_grad=True))

In [479]:
bn2d.running_mean

tensor([ 0.0170,  0.0032, -0.0127, -0.0071,  0.0042, -0.0151,  0.0101, -0.0169,
        -0.0184, -0.0173, -0.0031,  0.0136, -0.0192,  0.0156, -0.0091,  0.0052])

In [484]:
torch.manual_seed(0)
net = nn.Sequential(nn.Identity(), nn.LayerNorm(64), nn.Flatten(1, -1), nn.LazyLinear(1)) # Normalize per feature
x = torch.randn(4, 128, 64) # Seq: BxTxd
net(x).size()

torch.Size([4, 1])

In [485]:
ln = net[1]

In [486]:
ln.weight, ln.bias

(Parameter containing:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        requires_grad=True))

In [487]:
ln.weight.size(), ln.bias.size()

(torch.Size([64]), torch.Size([64]))

In [488]:
# Batch normalized via per (C, H, W) over N batches
torch.manual_seed(0)
net = nn.Sequential(nn.Conv2d(3, 32, 3, padding="same"), 
                    nn.LayerNorm([32, 64, 64]),  # Normalize per feature
                    nn.Flatten(1, -1), 
                    nn.LazyLinear(1))
x = torch.randn(4, 3, 64, 64) 
net(x).size()



torch.Size([4, 1])

In [489]:
ln = net[1]

In [491]:
ln.weight.shape, ln.bias.shape

(torch.Size([32, 64, 64]), torch.Size([32, 64, 64]))

In [511]:
torch.manual_seed(0)
net = nn.Sequential(nn.Conv1d(3, 16, 3), nn.InstanceNorm1d(16, affine=True), nn.Flatten(1, 2), nn.Linear(16*8, 1)) # Normalize per feature
x = torch.randn(4, 3, 10)
net(x)
net(x)
net(x)

tensor([[ 0.4504],
        [-0.1148],
        [-0.3610],
        [-0.3393]], grad_fn=<AddmmBackward0>)

In [513]:
inbn = net[1]

In [514]:
inbn

InstanceNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)

In [515]:
inbn.weight

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       requires_grad=True)

In [516]:
inbn.weight.shape

torch.Size([16])

In [520]:
# Batch normalized via per (C, H, W) over N batches
torch.manual_seed(0)
net = nn.Sequential(nn.Conv2d(3, 32, 3, padding="same"), 
                    nn.InstanceNorm2d(32, affine=True),  # Normalize per feature
                    nn.Flatten(1, -1), 
                    nn.LazyLinear(1))
x = torch.randn(4, 3, 64, 64) 
net(x).size()



torch.Size([4, 1])

In [521]:
inbn = net[1]

In [522]:
inbn

InstanceNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)

In [523]:
inbn.weight

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       requires_grad=True)

In [530]:
# Batch normalized via per (C, H, W) over N batches
torch.manual_seed(0)
net = nn.Sequential(nn.Conv2d(3, 32, 3, padding="same"), 
                    nn.GroupNorm(4, 32, affine=True),  # Normalize per feature
                    nn.Flatten(1, -1), 
                    nn.LazyLinear(1))
x = torch.randn(4, 3, 64, 64) 
net(x).size()



torch.Size([4, 1])

In [531]:
gpbn = net[1]

In [532]:
gpbn

GroupNorm(4, 32, eps=1e-05, affine=True)

In [533]:
gpbn.weight.size()

torch.Size([32])

In [535]:
gpbn.bias.size()

torch.Size([32])

## Recurrent Layers

In [721]:
torch.manual_seed(0)
rnn = nn.RNN(input_size=5, hidden_size=15, num_layers=1,
             nonlinearity="tanh", bias=True, bidirectional=False, 
             batch_first=True)
x = torch.randn(4, 10, 5)
out, last_state = rnn(x) #out is h[1:t]

In [722]:
out.size(), last_state.size()

(torch.Size([4, 10, 15]), torch.Size([1, 4, 15]))

In [723]:
out[..., -1, :].size()

torch.Size([4, 15])

In [724]:
out.select(dim=1, index=torch.tensor(9))

tensor([[ 0.1031, -0.5137, -0.0354,  0.0951, -0.2973, -0.1689,  0.0404, -0.5558,
          0.0197, -0.2016, -0.2240, -0.1106,  0.5820,  0.4665, -0.0272],
        [ 0.1308,  0.1390, -0.0039, -0.0518,  0.2764,  0.1299, -0.0134,  0.0677,
         -0.0610,  0.3481,  0.5619,  0.2291,  0.2706,  0.0643,  0.0280],
        [-0.2806,  0.1678, -0.5192,  0.3800, -0.2179, -0.7108, -0.2451,  0.2495,
          0.4239,  0.2261, -0.0591, -0.3125,  0.1801,  0.1598,  0.3343],
        [ 0.2281,  0.1690, -0.3100,  0.5432, -0.3815, -0.3226, -0.1621, -0.0237,
         -0.1821,  0.0645,  0.0245, -0.1639, -0.1053,  0.2882, -0.2862]],
       grad_fn=<SelectBackward0>)

In [725]:
last_state

tensor([[[ 0.1031, -0.5137, -0.0354,  0.0951, -0.2973, -0.1689,  0.0404,
          -0.5558,  0.0197, -0.2016, -0.2240, -0.1106,  0.5820,  0.4665,
          -0.0272],
         [ 0.1308,  0.1390, -0.0039, -0.0518,  0.2764,  0.1299, -0.0134,
           0.0677, -0.0610,  0.3481,  0.5619,  0.2291,  0.2706,  0.0643,
           0.0280],
         [-0.2806,  0.1678, -0.5192,  0.3800, -0.2179, -0.7108, -0.2451,
           0.2495,  0.4239,  0.2261, -0.0591, -0.3125,  0.1801,  0.1598,
           0.3343],
         [ 0.2281,  0.1690, -0.3100,  0.5432, -0.3815, -0.3226, -0.1621,
          -0.0237, -0.1821,  0.0645,  0.0245, -0.1639, -0.1053,  0.2882,
          -0.2862]]], grad_fn=<StackBackward0>)

In [726]:
# Add initial state
torch.manual_seed(0)
rnn = nn.RNN(input_size=5, hidden_size=15, num_layers=1,
             nonlinearity="tanh", bias=True, bidirectional=False, 
             batch_first=True)
x = torch.randn(4, 10, 5)
h0 = torch.randn(1, 15).expand(1, 4, 15)
out, last_state = rnn(x, h0)

In [727]:
out.size(), last_state.size()

(torch.Size([4, 10, 15]), torch.Size([1, 4, 15]))

In [728]:
out[..., -1, :]

tensor([[ 0.1039, -0.5126, -0.0375,  0.0973, -0.2966, -0.1689,  0.0414, -0.5548,
          0.0192, -0.2014, -0.2252, -0.1116,  0.5806,  0.4662, -0.0261],
        [ 0.1304,  0.1412, -0.0079, -0.0492,  0.2770,  0.1274, -0.0102,  0.0709,
         -0.0611,  0.3497,  0.5600,  0.2267,  0.2676,  0.0621,  0.0294],
        [-0.2853,  0.1707, -0.5246,  0.3821, -0.2171, -0.7135, -0.2385,  0.2545,
          0.4260,  0.2320, -0.0649, -0.3162,  0.1750,  0.1533,  0.3349],
        [ 0.2257,  0.1705, -0.3156,  0.5443, -0.3814, -0.3289, -0.1560, -0.0181,
         -0.1811,  0.0684,  0.0195, -0.1682, -0.1094,  0.2835, -0.2848]],
       grad_fn=<SliceBackward0>)

In [729]:
last_state

tensor([[[ 0.1039, -0.5126, -0.0375,  0.0973, -0.2966, -0.1689,  0.0414,
          -0.5548,  0.0192, -0.2014, -0.2252, -0.1116,  0.5806,  0.4662,
          -0.0261],
         [ 0.1304,  0.1412, -0.0079, -0.0492,  0.2770,  0.1274, -0.0102,
           0.0709, -0.0611,  0.3497,  0.5600,  0.2267,  0.2676,  0.0621,
           0.0294],
         [-0.2853,  0.1707, -0.5246,  0.3821, -0.2171, -0.7135, -0.2385,
           0.2545,  0.4260,  0.2320, -0.0649, -0.3162,  0.1750,  0.1533,
           0.3349],
         [ 0.2257,  0.1705, -0.3156,  0.5443, -0.3814, -0.3289, -0.1560,
          -0.0181, -0.1811,  0.0684,  0.0195, -0.1682, -0.1094,  0.2835,
          -0.2848]]], grad_fn=<StackBackward0>)

In [730]:
# Add dropout with 4 layers
torch.manual_seed(0)
net = nn.RNN(input_size=5, hidden_size=15, 
             num_layers=4,
             nonlinearity="tanh", bias=True, bidirectional=False, 
             batch_first=True, dropout=0.1)
x = torch.randn(4, 10, 5)
h0 = torch.randn(1, 15).expand(4, 4, 15) # (num_directions x num_layers) x N x hout
out, last_state = net(x)

In [731]:
out.size()

torch.Size([4, 10, 15])

In [732]:
last_state.size()

torch.Size([4, 4, 15])

In [735]:
last_state[-1]

tensor([[-0.3555, -0.1993, -0.2419,  0.2056,  0.1020, -0.1344, -0.1263, -0.0382,
         -0.2619,  0.1673, -0.3040,  0.3187,  0.3298, -0.3268, -0.3188],
        [-0.3577, -0.4380, -0.1571,  0.2563,  0.2929,  0.0064, -0.0551,  0.3024,
         -0.2987, -0.0031, -0.2763,  0.1839,  0.4808, -0.3969, -0.4387],
        [-0.3215, -0.0402, -0.2281,  0.1372,  0.2479, -0.1711,  0.0812, -0.0707,
         -0.0923, -0.0108, -0.3916,  0.2852,  0.2017, -0.2332, -0.3347],
        [-0.2470, -0.4806, -0.0573,  0.2556,  0.2663, -0.1015, -0.3573,  0.2632,
         -0.2112,  0.0563, -0.1438,  0.1009,  0.4365, -0.4586, -0.4787]],
       grad_fn=<SelectBackward0>)

In [739]:
out[:, -1, :] # HT

tensor([[-0.3555, -0.1993, -0.2419,  0.2056,  0.1020, -0.1344, -0.1263, -0.0382,
         -0.2619,  0.1673, -0.3040,  0.3187,  0.3298, -0.3268, -0.3188],
        [-0.3577, -0.4380, -0.1571,  0.2563,  0.2929,  0.0064, -0.0551,  0.3024,
         -0.2987, -0.0031, -0.2763,  0.1839,  0.4808, -0.3969, -0.4387],
        [-0.3215, -0.0402, -0.2281,  0.1372,  0.2479, -0.1711,  0.0812, -0.0707,
         -0.0923, -0.0108, -0.3916,  0.2852,  0.2017, -0.2332, -0.3347],
        [-0.2470, -0.4806, -0.0573,  0.2556,  0.2663, -0.1015, -0.3573,  0.2632,
         -0.2112,  0.0563, -0.1438,  0.1009,  0.4365, -0.4586, -0.4787]],
       grad_fn=<SliceBackward0>)

In [825]:
# Add Bidirectional
torch.manual_seed(0)
rnn = nn.RNN(input_size=5, 
             hidden_size=3, 
             num_layers=3,
             nonlinearity="tanh", 
             bias=True, 
             bidirectional=True, 
             batch_first=True, 
            #  dropout=0.
             )
# 
x = torch.randn(4, 7, 5).abs()
h0 = torch.randn(1, 3).expand(6, 4, 3) # (num_directions x num_layers) x N x hout
out, last_state = rnn(x, h0) # concatenation

In [826]:
out.size(), last_state.size()

(torch.Size([4, 7, 6]), torch.Size([6, 4, 3]))

In [827]:
out[:, -1, ...].size()

torch.Size([4, 6])

In [828]:
out[:, -1, ...]

tensor([[ 0.4133, -0.1093, -0.6090,  0.6467, -0.2729, -0.2605],
        [ 0.2957, -0.0686, -0.5352,  0.6112, -0.2638, -0.2600],
        [ 0.1930, -0.0527, -0.3886,  0.5614, -0.3590, -0.3354],
        [ 0.3704, -0.0460, -0.5626,  0.6116, -0.0888, -0.1524]],
       grad_fn=<SelectBackward0>)

In [829]:
out.select(dim=1, index=-1).size()

torch.Size([4, 6])

In [830]:
out.select(dim=1, index=-1).index_select(dim=1, index=torch.tensor([0, 1, 2])) # forward last hidden state

tensor([[ 0.4133, -0.1093, -0.6090],
        [ 0.2957, -0.0686, -0.5352],
        [ 0.1930, -0.0527, -0.3886],
        [ 0.3704, -0.0460, -0.5626]], grad_fn=<IndexSelectBackward0>)

In [831]:
out.select(dim=1, index=-1).index_select(dim=1, index=torch.tensor([3, 4, 5]))

tensor([[ 0.6467, -0.2729, -0.2605],
        [ 0.6112, -0.2638, -0.2600],
        [ 0.5614, -0.3590, -0.3354],
        [ 0.6116, -0.0888, -0.1524]], grad_fn=<IndexSelectBackward0>)

In [832]:
last_state.size()

torch.Size([6, 4, 3])

In [833]:
# Last layer Forward last state from out
hf_from_out = out.select(dim=1, index=-1)[:, 0:3]
hf_from_out

tensor([[ 0.4133, -0.1093, -0.6090],
        [ 0.2957, -0.0686, -0.5352],
        [ 0.1930, -0.0527, -0.3886],
        [ 0.3704, -0.0460, -0.5626]], grad_fn=<SliceBackward0>)

In [834]:
last_state.view(3, 2, 4, 3)[-1][0]

tensor([[ 0.4133, -0.1093, -0.6090],
        [ 0.2957, -0.0686, -0.5352],
        [ 0.1930, -0.0527, -0.3886],
        [ 0.3704, -0.0460, -0.5626]], grad_fn=<SelectBackward0>)

In [835]:
# Last layer Forward last state from out
# Last state dim = (num_layer * 2, B, hz)
hf_from_last_state = last_state.view(3, 2, 4, 3)[-1][0]
hf_from_last_state

tensor([[ 0.4133, -0.1093, -0.6090],
        [ 0.2957, -0.0686, -0.5352],
        [ 0.1930, -0.0527, -0.3886],
        [ 0.3704, -0.0460, -0.5626]], grad_fn=<SelectBackward0>)

In [836]:
# Last layer Backward 
hb_from_out = out.select(dim=1, index=0)[:, 3:]
hb_from_out

tensor([[-0.0892, -0.1868, -0.3310],
        [-0.1399, -0.2370, -0.4548],
        [ 0.0632, -0.2535, -0.5286],
        [-0.1388, -0.0617, -0.4278]], grad_fn=<SliceBackward0>)

In [837]:
# Last layer Backward 
hb_from_last_state =  last_state.view(3, 2, 4, 3)[-1][1]
hb_from_last_state

tensor([[-0.0892, -0.1868, -0.3310],
        [-0.1399, -0.2370, -0.4548],
        [ 0.0632, -0.2535, -0.5286],
        [-0.1388, -0.0617, -0.4278]], grad_fn=<SelectBackward0>)

In [839]:
rnn.weight_ih_l0

Parameter containing:
tensor([[-0.0043,  0.3097, -0.4752, -0.4249, -0.2224],
        [ 0.1548, -0.0114,  0.4578, -0.0512,  0.1528],
        [-0.1745, -0.1135, -0.5516, -0.3824, -0.2380]], requires_grad=True)

In [840]:
rnn.weight_ih_l0_reverse

Parameter containing:
tensor([[-0.2251,  0.4988, -0.3742, -0.2658, -0.4034],
        [-0.5407, -0.3370,  0.4963,  0.2576,  0.2798],
        [ 0.0304, -0.2960,  0.0977, -0.5391, -0.4172]], requires_grad=True)

In [844]:
rnn.weight_ih_l0.size(), rnn.weight_ih_l0_reverse.size(), rnn.bias_ih_l0.size(),

(torch.Size([3, 5]), torch.Size([3, 5]), torch.Size([3]))

In [845]:
rnn.weight_ih_l1.size(), rnn.weight_ih_l1_reverse.size(), rnn.bias_ih_l1.size(),

(torch.Size([3, 6]), torch.Size([3, 6]), torch.Size([3]))

In [846]:
rnn.weight_ih_l2.size(), rnn.weight_ih_l2_reverse.size(), rnn.bias_ih_l2.size(),

(torch.Size([3, 6]), torch.Size([3, 6]), torch.Size([3]))

In [851]:
# LSTM
sz, bz, dz, hz = 10, 4, 5, 7
lz = 3 # num_layers
x = torch.randn(bz, sz, dz)
h0 = torch.randn(hz).expand(lz * 2, bz, hz)
c0 = h0.clone()
lstm = nn.LSTM(input_size=dz,
               hidden_size=hz,
               num_layers=lz,
               dropout=0.1,
               bias=True,
               bidirectional=True,
               batch_first=True
               )
out, (hn, cn) = lstm(x, (h0, c0))

In [852]:
out.size()

torch.Size([4, 10, 14])

In [856]:
out_forward, out_backward = out.chunk(2, -1)
out_forward.size(), out_backward.size()

(torch.Size([4, 10, 7]), torch.Size([4, 10, 7]))

In [858]:
hn.size()

torch.Size([6, 4, 7])

In [860]:
lstm.weight_ih_l0.size()

torch.Size([28, 5])

In [861]:
W_ii, W_if, W_ig, W_io = lstm.weight_ih_l0.tensor_split(4)

In [862]:
lstm.weight_ih_l0

Parameter containing:
tensor([[ 0.2079,  0.2255,  0.2432, -0.2417, -0.2997],
        [-0.0129,  0.1313,  0.1720, -0.2905,  0.3107],
        [ 0.3659, -0.0297,  0.0599,  0.0659, -0.2803],
        [-0.3596,  0.2339,  0.3048, -0.0097, -0.3287],
        [ 0.0155, -0.2475,  0.1294,  0.3455,  0.3775],
        [ 0.3115,  0.0868,  0.3712, -0.0731, -0.0125],
        [-0.2596,  0.1572,  0.1477,  0.1531,  0.2721],
        [ 0.3269, -0.3571, -0.1071, -0.3021,  0.2872],
        [-0.0118, -0.3361,  0.1604, -0.2317, -0.0070],
        [ 0.2016,  0.2102,  0.2349, -0.0068, -0.0368],
        [-0.0175, -0.3725, -0.1343,  0.2876,  0.1934],
        [-0.3458,  0.0943,  0.3473, -0.3005, -0.0379],
        [-0.2031, -0.0054, -0.1649,  0.3311,  0.1962],
        [-0.0732, -0.3273, -0.2011, -0.0399,  0.1017],
        [-0.1528, -0.2315, -0.0464, -0.3007,  0.0841],
        [-0.3395, -0.0643, -0.3166, -0.1481,  0.0041],
        [-0.1831, -0.2477,  0.0426, -0.1919,  0.0207],
        [ 0.0805, -0.0400, -0.3303, -0.1077

In [863]:
W_ii

tensor([[ 0.2079,  0.2255,  0.2432, -0.2417, -0.2997],
        [-0.0129,  0.1313,  0.1720, -0.2905,  0.3107],
        [ 0.3659, -0.0297,  0.0599,  0.0659, -0.2803],
        [-0.3596,  0.2339,  0.3048, -0.0097, -0.3287],
        [ 0.0155, -0.2475,  0.1294,  0.3455,  0.3775],
        [ 0.3115,  0.0868,  0.3712, -0.0731, -0.0125],
        [-0.2596,  0.1572,  0.1477,  0.1531,  0.2721]],
       grad_fn=<SliceBackward0>)

In [864]:
W_ii, W_if, W_ig, W_io = lstm.weight_ih_l1.tensor_split(4)

In [865]:
lstm.weight_ih_l1.size(), W_ii.size()

(torch.Size([28, 14]), torch.Size([7, 14]))

In [866]:
lstm.weight_ih_l1_reverse.size()

torch.Size([28, 14])

In [867]:
W_hi, W_hf, W_hg, W_ho = lstm.weight_hh_l1.tensor_split(4)

In [868]:
W_hi.size(), lstm.weight_hh_l1.size()

(torch.Size([7, 7]), torch.Size([28, 7]))

In [894]:
# LSTM
torch.manual_seed(0)
sz, bz, dz, hz = 10, 4, 5, 7
lz = 1 # num_layers
x = torch.randn(bz, sz, dz)
h0 = torch.randn(hz).expand(1, bz, hz).clone()
rnn = nn.RNN(input_size=dz,
               hidden_size=hz,
               bias=True,
               nonlinearity="tanh",
               batch_first=True
               )
out, hn = rnn(x, h0)

In [895]:
out.size(), x.size()

(torch.Size([4, 10, 7]), torch.Size([4, 10, 5]))

In [896]:
torch.manual_seed(0)

rnn_cell = nn.RNNCell(dz, hz, bias=True, nonlinearity="tanh")
with torch.no_grad():
    rnn_cell.weight_hh.data.copy_(rnn.weight_hh_l0,)
    rnn_cell.weight_ih.data.copy_(rnn.weight_ih_l0)
    rnn_cell.bias_ih.data.copy_(rnn.bias_ih_l0)
    rnn_cell.bias_hh.data.copy_(rnn.bias_hh_l0)


In [897]:
ht = h0.clone().squeeze(0)
print(ht.size())
out_cells = []
h = []
for t in range(sz):
    cell_h = rnn_cell(x[:, t, :], ht)
    out_cells.append(cell_h)
    ht = cell_h

torch.Size([4, 7])


In [898]:
torch.stack(out_cells, 1).size()

torch.Size([4, 10, 7])

In [900]:
(torch.stack(out_cells, 1) - out).sum()

tensor(1.4156e-07, grad_fn=<SumBackward0>)

In [903]:
torch.manual_seed(0)

lstm_cell = nn.LSTMCell(dz, hz, bias=True)
h0 = torch.randn(hz).expand(bz, hz).clone()
c0 = torch.randn(hz).expand(bz, hz).clone()

ht = h0
ct = c0

hn_cells = []
cn_cells = []

for t in range(sz):
    h_next, c_next = lstm_cell(x[:, t, :], (ht, ct))
    hn_cells.append(h_next)
    cn_cells.append(c_next)
    ht = h_next
    ct = c_next
    

In [904]:
torch.stack(hn_cells, dim=1).size()

torch.Size([4, 10, 7])

In [906]:
lstm_cell.weight_ih

Parameter containing:
tensor([[-0.0028,  0.2028, -0.3111, -0.2782, -0.1456],
        [ 0.1014, -0.0075,  0.2997, -0.0335,  0.1000],
        [-0.1142, -0.0743, -0.3611, -0.2503, -0.1558],
        [ 0.0140,  0.1494,  0.2268, -0.2562, -0.1646],
        [ 0.1373,  0.3139, -0.0778,  0.2828, -0.0609],
        [ 0.0400,  0.3422, -0.3506, -0.2379, -0.0957],
        [-0.1473,  0.3266, -0.2450, -0.1740, -0.2641],
        [-0.3540, -0.2206,  0.3249,  0.1687,  0.1832],
        [ 0.0199, -0.1938,  0.0639, -0.3529, -0.2731],
        [-0.1949,  0.2385,  0.2216, -0.1676, -0.0136],
        [ 0.2417,  0.3757,  0.1500,  0.0511,  0.2534],
        [-0.2225,  0.0704, -0.2930, -0.2620, -0.1953],
        [ 0.1710,  0.1520, -0.2239,  0.1142,  0.2075],
        [-0.0477,  0.0144,  0.0876,  0.2345,  0.3629],
        [-0.2913, -0.1385,  0.1485,  0.3132,  0.3289],
        [ 0.3335,  0.0752, -0.3287,  0.0348, -0.2365],
        [-0.3522,  0.3358,  0.2874, -0.3770,  0.0707],
        [-0.0637, -0.0622, -0.1730,  0.1453

In [913]:
# LSTM
torch.manual_seed(0)
sz, bz, dz, hz = 10, 4, 5, 7
lz = 1 # num_layers
x = torch.randn(bz, sz, dz)
lstm = nn.LSTM(input_size=dz,
               hidden_size=hz,
               bias=True,
               batch_first=True
               )
out, (hn, cn) = lstm(x)
# out and hn are the same !!! Careful

In [914]:
out.size()

torch.Size([4, 10, 7])

In [915]:
out[:, -1, :]

tensor([[-0.0366,  0.0946,  0.0368,  0.0538,  0.0950,  0.3502, -0.5945],
        [ 0.0688,  0.0639, -0.3101,  0.0012,  0.1544,  0.3090, -0.3091],
        [ 0.0487,  0.0416,  0.0409,  0.2507,  0.2483,  0.1582, -0.4065],
        [ 0.0980,  0.1033,  0.1533,  0.3093,  0.2307,  0.0691, -0.4459]],
       grad_fn=<SliceBackward0>)

In [916]:
hn

tensor([[[-0.0366,  0.0946,  0.0368,  0.0538,  0.0950,  0.3502, -0.5945],
         [ 0.0688,  0.0639, -0.3101,  0.0012,  0.1544,  0.3090, -0.3091],
         [ 0.0487,  0.0416,  0.0409,  0.2507,  0.2483,  0.1582, -0.4065],
         [ 0.0980,  0.1033,  0.1533,  0.3093,  0.2307,  0.0691, -0.4459]]],
       grad_fn=<StackBackward0>)

## Transformers

In [996]:
torch.manual_seed(0)
x = torch.randn(4, 10, 64)
mask = torch.arange(0, 10).expand(4, 10, 10).clone()
index_mask = torch.randint(5, 10, size=(4, 10, 10))
mask = mask >= index_mask


In [997]:
src_key_padding_mask = torch.arange(0, 10).expand(4, 10).clone()
src_idx_mask = torch.randint(3, 10, size=(4,)).unsqueeze(1)
src_key_padding_mask = src_key_padding_mask >= src_idx_mask
src_key_padding_mask
x[src_key_padding_mask] *= 0.
# Additive attention for src
src_mask = torch.randn(10, 10)

In [998]:
x[src_key_padding_mask].sum()

tensor(0.)

In [999]:
tfm_enc_layer = nn.TransformerEncoderLayer(d_model=64, nhead=8,
                                               dim_feedforward=128, dropout=0.1,
                                               activation=F.gelu, batch_first=True, 
                                               norm_first=False
                                               )
y = tfm_enc_layer(x, src_key_padding_mask=src_key_padding_mask, src_mask=src_mask)
y.size()

torch.Size([4, 10, 64])

In [1000]:
y[0, 0]

tensor([-0.8958, -1.0957,  0.0394, -0.2223,  0.5634,  0.1066, -0.6516, -1.6670,
         0.1621, -0.5294, -0.0759, -0.1421,  0.0188,  0.9326,  0.7924,  0.1720,
        -1.3745, -1.7386, -0.0750,  1.1500,  0.5301, -2.0552, -0.5857,  1.3363,
         0.4073, -0.6344, -0.0954,  0.3517,  0.9651,  1.4621,  0.8214, -0.7455,
        -0.5382, -0.1444, -0.2969, -0.2163,  0.1266,  0.5564,  0.8943,  0.6638,
        -0.4990, -0.3543, -0.6840, -0.0483,  0.4670,  2.4288, -1.5141, -1.0092,
        -0.3794,  0.4454,  1.0467, -0.1036, -0.9263,  0.3585,  0.4240, -0.1514,
        -0.2859,  0.4786,  0.9243,  3.6044, -2.1874, -1.5147,  1.5502, -0.3425],
       grad_fn=<SelectBackward0>)

In [1001]:
x[0, 0]

tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
         0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473,
        -1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530,
         0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437,
        -0.6136,  0.0316, -0.4927,  0.2484,  0.4397,  0.1124,  0.6408,  0.4412,
        -0.1023,  0.7924, -0.2897,  0.0525,  0.5229,  2.3022, -1.4689, -1.5867,
        -0.6731,  0.8728,  1.0554,  0.1778, -0.2303, -0.3918,  0.5433, -0.3952,
        -0.4462,  0.7440,  1.5210,  3.4105, -1.5312, -1.2341,  1.8197, -0.5515])

In [1017]:
tfm_enc = nn.TransformerEncoder(tfm_enc_layer, 2, nn.LayerNorm(64))

In [1018]:
y = tfm_enc(x, src_key_padding_mask=src_key_padding_mask.bool(), mask=src_mask.float())

In [1019]:
y.size()

torch.Size([4, 10, 64])

In [1076]:
tfm_dec_layer = nn.TransformerDecoderLayer(d_model=64, 
                                           nhead=8,
                                           dim_feedforward=128, 
                                            dropout=0.1,
                                            activation=F.gelu, 
                                            batch_first=True, 
                                            norm_first=False
                                               )

In [1077]:
tgt_mask = torch.empty(5, 5).fill_(False).bool()

In [1078]:
tgt_mask

tensor([[False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False]])

In [1079]:
tgt_mask[torch.triu_indices(5, 5, offset=1).chunk(2)] = True

In [1080]:
tgt_mask

tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])

In [1081]:
tgt_mask = tgt_mask.float()
tgt_mask = tgt_mask.where(tgt_mask != 1.0, float("-inf"))

In [1082]:
tgt_mask

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [1083]:
z0 = tfm_dec_layer(torch.randn(4, 5, 64),memory=y, 
                   memory_key_padding_mask=src_key_padding_mask,
                   tgt_mask=tgt_mask
                   )

In [1084]:
z0.shape

torch.Size([4, 5, 64])

In [1085]:
tfm_dec = nn.TransformerDecoder(decoder_layer=tfm_dec_layer, num_layers=2, 
                                      norm=nn.LayerNorm(64))


In [1086]:
zd = tfm_dec(torch.randn(4, 5, 64), memory=y, 
                   memory_key_padding_mask=src_key_padding_mask,
                   tgt_mask=tgt_mask
                   )

In [1087]:
zd.shape

torch.Size([4, 5, 64])

In [1088]:
tfm = nn.Transformer(d_model=64, nhead=8, num_decoder_layers=2, num_encoder_layers=2,
                     norm_first=False, batch_first=True, dropout=0.1, dim_feedforward=128, 
                     activation=F.gelu)

In [1090]:
t = tfm(x, torch.randn(4, 5, 64))

In [1091]:
t.size()

torch.Size([4, 5, 64])

## Dropout

In [1101]:
torch.manual_seed(0)
x = torch.randn(4, 10)
drop_net = nn.Dropout(p=0.2)
y = drop_net(x)

In [1102]:
x

tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
          0.3223, -1.2633],
        [ 0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473, -1.3527, -1.6959,
          0.5667,  0.7935],
        [ 0.5988, -1.5551, -0.3414,  1.8530, -0.2159, -0.7425,  0.5627,  0.2596,
         -0.1740, -0.6787],
        [ 0.9383,  0.4889,  1.2032,  0.0845, -1.2001, -0.0048, -0.5181, -0.3067,
         -1.5810,  1.7066]])

In [1103]:
y

tensor([[-1.4073, -1.4405, -0.3132, -0.5423,  1.0609,  0.8650, -0.3950, -2.6440,
          0.4028, -0.0000],
        [ 0.0000,  0.3852,  0.1498,  1.5471,  1.3960, -0.3091, -1.6908, -2.1199,
          0.0000,  0.9919],
        [ 0.0000, -1.9439, -0.4267,  2.3163, -0.2698, -0.9282,  0.7034,  0.3245,
         -0.2175, -0.8484],
        [ 0.0000,  0.6111,  1.5040,  0.0000, -1.5002, -0.0060, -0.0000, -0.3834,
         -1.9762,  2.1333]])

In [1104]:
(y==0).sum().float() / y.numel()

tensor(0.1750)

In [1116]:
torch.manual_seed(0)
x = torch.randn(4, 2, 10)
drop_net = nn.Dropout1d(p=0.2) # zero_out_channels
y = drop_net(x)
y

tensor([[[-1.4073, -1.4405, -0.3132, -0.5423,  1.0609,  0.8650, -0.3950,
          -2.6440,  0.4028, -1.5792],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000, -0.0000,
          -0.0000,  0.0000,  0.0000]],

        [[ 0.7485, -1.9439, -0.4267,  2.3163,  0.9377, -0.7319, -0.2167,
           0.2293,  1.7367,  1.9829],
         [ 1.1829, -1.0546, -0.7670,  0.0395, -0.6158,  0.3105,  0.5496,
           0.1405,  0.8010,  0.5514]],

        [[-0.1279,  0.9906, -0.3621,  0.0656,  0.6536,  2.8778, -1.8361,
          -1.9834, -0.8414,  1.0910],
         [ 1.3192,  0.2223, -0.2879, -0.4897,  0.6791, -0.4939, -0.5578,
           0.9300,  1.9012,  4.2631]],

        [[-1.9140, -1.5427,  2.2747, -0.6894, -0.7116,  1.1500,  1.3885,
           1.6123, -1.8477,  3.2090],
         [-0.5914,  0.4194, -2.0367, -0.6872, -0.5998, -0.6246, -1.3337,
           1.3937, -0.1758,  1.0072]]])

In [1117]:
torch.manual_seed(0)
x = torch.randn(1, 3, 4, 4)
drop_net = nn.Dropout2d(p=0.2) # zero_out entire channels
y = drop_net(x)
y

tensor([[[[-1.4073, -1.4405, -0.3132, -0.5423],
          [ 1.0609,  0.8650, -0.3950, -2.6440],
          [ 0.4028, -1.5792,  0.4375,  0.3852],
          [ 0.1498,  1.5471,  1.3960, -0.3091]],

         [[-1.6908, -2.1199,  0.7083,  0.9919],
          [ 0.7485, -1.9439, -0.4267,  2.3163],
          [ 0.9377, -0.7319, -0.2167,  0.2293],
          [ 1.7367,  1.9829,  1.1829, -1.0546]],

         [[-0.0000,  0.0000, -0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [-0.0000,  0.0000, -0.0000,  0.0000],
          [ 0.0000,  0.0000, -0.0000, -0.0000]]]])

In [1118]:
torch.manual_seed(0)
x = torch.randn(1, 3, 4, 4)
drop_net = nn.AlphaDropout(p=0.2) # zero_out entire channels
y = drop_net(x)
y

tensor([[[[-0.6805, -0.7038, -1.2362, -0.0723],
          [ 1.0550,  0.9172,  0.0313, -1.5500],
          [ 0.5923, -0.8013,  0.6166,  0.5799],
          [ 0.4144, -1.2362,  1.2906,  0.0917]],

         [[-0.8798, -1.1815, -1.2362,  1.0065],
          [ 0.8354, -1.0577,  0.0090,  1.9377],
          [ 0.9684, -0.2056,  0.1566, -1.2362],
          [ 1.5302,  1.7033,  1.1407, -0.4325]],

         [[-0.2302, -1.2362, -0.1240, -1.2362],
          [-1.2362,  0.4078,  0.8722,  0.6968],
          [ 0.2191,  1.0055,  0.0544,  0.3552],
          [ 0.7686,  2.3325, -0.9820, -1.0855]]]])

In [1119]:
torch.manual_seed(0)
x = torch.randn(1, 3, 4, 4)
drop_net = nn.FeatureAlphaDropout(p=0.2) 
y = drop_net(x)
y

tensor([[[[-0.6805, -0.7038,  0.0888, -0.0723],
          [ 1.0550,  0.9172,  0.0313, -1.5500],
          [ 0.5923, -0.8013,  0.6166,  0.5799],
          [ 0.4144,  1.3968,  1.2906,  0.0917]],

         [[-0.8798, -1.1815,  0.8071,  1.0065],
          [ 0.8354, -1.0577,  0.0090,  1.9377],
          [ 0.9684, -0.2056,  0.1566,  0.4703],
          [ 1.5302,  1.7033,  1.1407, -0.4325]],

         [[-1.2362, -1.2362, -1.2362, -1.2362],
          [-1.2362, -1.2362, -1.2362, -1.2362],
          [-1.2362, -1.2362, -1.2362, -1.2362],
          [-1.2362, -1.2362, -1.2362, -1.2362]]]])

In [1187]:
torch.manual_seed(0)
vocab_size = 16
hz = 32
bz, sz = 4, 10
x = torch.randint(0, vocab_size, (bz, sz)).long()
embed_net = nn.Embedding(num_embeddings=vocab_size, 
                         embedding_dim=hz, 
                         padding_idx=0, 
                         max_norm=torch.tensor(5.),                       
                         )
embed_net.weight.size(), embed_net.weight.requires_grad

(torch.Size([16, 32]), True)

In [1188]:
embed_net.weight[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward0>)

In [1189]:
with torch.no_grad():
    nn.init.xavier_normal_(embed_net.weight.data)
    nn.init.constant_(embed_net.weight[0], 0.0)
y = embed_net(x)
y

tensor([[[-0.1271, -0.0727,  0.4007,  ..., -0.0574, -0.2118,  0.2339],
         [ 0.0945,  0.1797, -0.1597,  ..., -0.1739, -0.0697, -0.0282],
         [ 0.0618,  0.0407, -0.5334,  ..., -0.0488, -0.0706,  0.3613],
         ...,
         [ 0.1210, -0.0345, -0.4667,  ..., -0.0368, -0.0038,  0.2037],
         [-0.2074, -0.3122, -0.2486,  ..., -0.4168, -0.1128,  0.1280],
         [-0.0828,  0.1933, -0.0249,  ...,  0.0547, -0.0310,  0.1621]],

        [[ 0.0618,  0.0407, -0.5334,  ..., -0.0488, -0.0706,  0.3613],
         [-0.2134,  0.1431,  0.4307,  ..., -0.0072,  0.1013, -0.1716],
         [-0.0068, -0.0728, -0.0674,  ..., -0.2763,  0.0261,  0.2807],
         ...,
         [-0.1271, -0.0727,  0.4007,  ..., -0.0574, -0.2118,  0.2339],
         [-0.0008,  0.2418,  0.1543,  ..., -0.0217, -0.0468, -0.2838],
         [ 0.2170,  0.1506, -0.0501,  ...,  0.1598, -0.3104, -0.0642]],

        [[ 0.2371,  0.4139, -0.3689,  ..., -0.1160, -0.1894,  0.4319],
         [ 0.1210, -0.0345, -0.4667,  ..., -0

In [1190]:
y.size()

torch.Size([4, 10, 32])

In [1191]:
embed_net.weight.size()

torch.Size([16, 32])

In [1192]:
embed_net.weight[0].requires_grad

True

In [1193]:
pretrained_emb = nn.Embedding.from_pretrained(embed_net.weight, freeze=True, padding_idx=0)

In [1194]:
embed_net.weight[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward0>)

In [1195]:
embed_net.weight[1] == pretrained_emb.weight[1]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

In [1196]:
pretrained_emb.weight.requires_grad

False

## Distance

In [1202]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 1)
x2 = torch.randn(2, 4, 3)
cos_net = nn.CosineSimilarity(dim=1)
y = cos_net(x1, x2)
y, y.size()

(tensor([[-0.4112, -0.8971,  0.4201],
         [-0.4604, -0.5416,  0.0212]]),
 torch.Size([2, 3]))

In [1205]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 1)
pair_net = nn.PairwiseDistance(p=2, keepdim=True)
y = pair_net(x1, x2)
y, y.size()

(tensor([[[2.2233],
          [2.1278],
          [3.3366],
          [1.2553]],
 
         [[0.7609],
          [1.4149],
          [4.0384],
          [1.5720]]]),
 torch.Size([2, 4, 1]))

In [1220]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 3)
l1_loss = nn.L1Loss(reduction="none")
y = l1_loss(x1, x2)
y.size()

torch.Size([2, 4, 3])

In [1219]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 1)
l1_loss = nn.L1Loss(reduction="sum")
y = l1_loss(x1, x2)
y, y.size(), F.l1_loss(x1, x2, reduction="sum")

  return F.l1_loss(input, target, reduction=self.reduction)
  y, y.size(), F.l1_loss(x1, x2, reduction="sum")


(tensor(25.0521), torch.Size([]), tensor(25.0521))

In [1221]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 3)
mse_loss = nn.MSELoss(reduction="sum")
y = mse_loss(x1, x2)
y, y.size(), F.mse_loss(x1, x2, reduction="sum")

(tensor(54.4145), torch.Size([]), tensor(54.4145))

In [1222]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 3)
mse_loss = nn.MSELoss(reduction="none")
y = mse_loss(x1, x2)
y.size(), F.mse_loss(x1, x2, reduction="none").size()

(torch.Size([2, 4, 3]), torch.Size([2, 4, 3]))

In [1224]:
torch.manual_seed(0)
input = torch.randn(2, 4)
target = torch.randn(2, 4).softmax(-1).argmax(1)
cross_ent_net = nn.CrossEntropyLoss(reduction="mean")
y = cross_ent_net(input, target)
y, F.cross_entropy(input, target, reduction="mean")

(tensor(2.1487), tensor(2.1487))

In [1226]:
torch.manual_seed(0)
input = torch.randn(4, 2)
target = torch.randn(4, 2).softmax(-1).argmax(1)
cross_ent_net = nn.CrossEntropyLoss(reduction="mean")
y = cross_ent_net(input, target)
y, F.cross_entropy(input, target, reduction="mean")

(tensor(0.8516), tensor(0.8516))

In [1253]:
torch.manual_seed(100)
logits = torch.randn(4, 2)
target = torch.randn(4, 2).softmax(-1).argmax(1)
print(target)
log_softmax = nn.LogSoftmax(dim=1)(logits)
nlloss = nn.NLLLoss()
y = nlloss(log_softmax, target)
y, F.nll_loss(log_softmax, target, reduction="mean"), F.cross_entropy(logits, target)

tensor([0, 1, 1, 0])


(tensor(0.6411), tensor(0.6411), tensor(0.6411))

In [1258]:
input_binary = logits.softmax(1).select(dim=1, index=1)
target_binary = target
bce_loss = nn.BCELoss(reduction="mean")
y = bce_loss(input_binary, target_binary.float())
y, F.binary_cross_entropy(input_binary, target_binary.float())

(tensor(0.6411), tensor(0.6411))

In [1267]:
logit_binary = logits.softmax(1).select(dim=1, index=1).logit()
target_binary = target
bce_log_loss = nn.BCEWithLogitsLoss(reduction="mean")
y = bce_log_loss(logit_binary, target_binary.float())
y, F.binary_cross_entropy_with_logits(logit_binary, target_binary.float())

(tensor(0.6411), tensor(0.6411))

In [1269]:
torch.manual_seed(0)
x1 = torch.randn(2, 4, 3)
x2 = torch.randn(2, 4, 3)
hubert_loss = nn.HuberLoss(delta=0.5, reduction="mean")
y = hubert_loss(x1, x2)
y, F.huber_loss(x1, x2, delta=0.5, reduction="mean")

(tensor(0.5245), tensor(0.5245))

## Vision Layers

In [1273]:
torch.manual_seed(0)
x = torch.randn(2, 9, 4, 4)
net = nn.PixelShuffle(3)
y = net(x)
x.size(), y.size()

(torch.Size([2, 9, 4, 4]), torch.Size([2, 1, 12, 12]))

In [1275]:
torch.manual_seed(0)
x = torch.randn(2, 1, 12, 12)
net = nn.PixelUnshuffle(3)
y = net(x)
x.size(), y.size()

(torch.Size([2, 1, 12, 12]), torch.Size([2, 9, 4, 4]))

In [1278]:
torch.manual_seed(0)
x = torch.randn(2, 9, 4, 4)
net = nn.Upsample((8, 8), mode="nearest")
y = net(x)
x.size(), y.size()

(torch.Size([2, 9, 4, 4]), torch.Size([2, 9, 8, 8]))

In [1279]:
torch.manual_seed(0)
x = torch.randn(2, 9, 4, 4)
net = nn.Upsample((8, 8), mode="bilinear")
y = net(x)
x.size(), y.size()

(torch.Size([2, 9, 4, 4]), torch.Size([2, 9, 8, 8]))

In [1281]:
torch.manual_seed(0)
x = torch.randn(2, 9, 4, 4)
net = nn.Upsample((8, 8), mode="bicubic")
y = net(x)
x.size(), y.size()

(torch.Size([2, 9, 4, 4]), torch.Size([2, 9, 8, 8]))

In [1282]:
torch.manual_seed(0)
x = torch.randn(2, 9, 4, 4)
net = nn.Upsample(scale_factor=4, mode="bicubic")
y = net(x)
x.size(), y.size()

(torch.Size([2, 9, 4, 4]), torch.Size([2, 9, 16, 16]))