In [1]:
import torch
import time
import numpy as np
from linear_atomic import *
from utilities import *

def timing_fwd(layer, x, n_iters=10000, warmup=100):
    """Benchmark the forward pass ``layer(x)`` and print mean +/- std in ms.

    Args:
        layer: Callable evaluated as ``layer(x)`` on every iteration.
        x: Input handed unchanged to ``layer``.
        n_iters: Total number of timed calls (default 10000).
        warmup: Number of leading measurements dropped before the statistics
            are computed (default 100).

    Raises:
        ValueError: If ``warmup`` is negative or ``n_iters`` does not exceed
            ``warmup`` (no samples would remain, giving NaN statistics).

    Prints:
        ``"<mean> +/- <std> ms"`` over the retained per-call durations.
    """
    if warmup < 0 or n_iters <= warmup:
        raise ValueError("n_iters must be greater than a non-negative warmup")

    eval_times = []
    for _ in range(n_iters):
        # perf_counter is monotonic and uses the highest-resolution clock
        # available; time.time() can be too coarse for microsecond-scale calls.
        start = time.perf_counter()
        layer(x)
        stop = time.perf_counter()
        eval_times.append(stop - start)

    # Discard the warm-up samples and convert seconds -> milliseconds.
    eval_times = np.array(eval_times)[warmup:] * 1_000
    print(f"{np.mean(eval_times)} +/- {np.std(eval_times)} ms")

def timing_bwd(layer, x, n_iters=10000, warmup=100):
    """Benchmark forward + cross-entropy loss + backward and print mean +/- std in ms.

    Every iteration evaluates ``layer(x)``, computes ``CrossEntropyLoss``
    against an all-zero class target and calls ``loss.backward()``.

    Args:
        layer: Callable evaluated as ``layer(x)``; its output must be
            differentiable and accepted by ``torch.nn.CrossEntropyLoss``.
        x: Input tensor; ``x.shape[0]`` is used as the batch size of the target.
        n_iters: Total number of timed iterations (default 10000).
        warmup: Number of leading measurements dropped before the statistics
            are computed (default 100).

    Raises:
        ValueError: If ``warmup`` is negative or ``n_iters`` does not exceed
            ``warmup`` (no samples would remain, giving NaN statistics).

    Prints:
        ``"<mean> +/- <std> ms"`` over the retained per-iteration durations.
    """
    if warmup < 0 or n_iters <= warmup:
        raise ValueError("n_iters must be greater than a non-negative warmup")

    criterion = torch.nn.CrossEntropyLoss()
    # Class-0 target for every batch row. Built once, outside the timed region,
    # and sized from the input instead of assuming a batch of exactly five.
    target = torch.zeros(x.shape[0], dtype=torch.long, device=x.device)

    eval_times = []
    for _ in range(n_iters):
        # perf_counter is monotonic and uses the highest-resolution clock
        # available; time.time() can be too coarse for sub-millisecond steps.
        start = time.perf_counter()
        y = layer(x)
        loss = criterion(y, target)
        # NOTE: gradients are never zeroed, so they accumulate across iterations.
        loss.backward()
        stop = time.perf_counter()
        eval_times.append(stop - start)

    # Discard the warm-up samples and convert seconds -> milliseconds.
    eval_times = np.array(eval_times)[warmup:] * 1_000
    print(f"{np.mean(eval_times)} +/- {np.std(eval_times)} ms")

def verify(m1, m2, x, rtol=1e-05, atol=1e-08, n_trials=100):
    """Assert that ``m1(x)`` and ``m2(x)`` agree within tolerance on repeated calls.

    Both callables are re-evaluated on every trial, so a non-deterministic
    implementation is checked ``n_trials`` times rather than once.

    Args:
        m1: First callable, evaluated as ``m1(x)``.
        m2: Second callable, evaluated as ``m2(x)``.
        x: Input handed unchanged to both callables.
        rtol: Relative tolerance passed to ``torch.allclose``. The default is
            the ``torch.allclose`` default; pass a smaller value for a
            stricter comparison.
        atol: Absolute tolerance passed to ``torch.allclose`` (default is the
            ``torch.allclose`` default).
        n_trials: Number of repeated comparisons (default 100).

    Raises:
        AssertionError: If the outputs differ by more than the tolerance on
            any trial; the message reports the trial and the max abs difference.
    """
    # Gradients are not needed for a pure output comparison.
    with torch.no_grad():
        for trial in range(n_trials):
            out1 = m1(x)
            out2 = m2(x)
            assert torch.allclose(out1, out2, rtol=rtol, atol=atol), (
                f"outputs differ on trial {trial}: "
                f"max abs diff = {(out1 - out2).abs().max().item():.3e}"
            )

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


In [19]:
# Batch of five independently drawn 50-feature random samples -> shape (5, 50).
x = torch.stack([torch.rand((50,)) for _ in range(5)])

# Reference layer from PyTorch.
torchlinear = torch.nn.Linear(50, 5)
assign_fixed_params(torchlinear)

# Layer under test; assign_fixed_params is applied to both layers
# (presumably so they share identical parameters -- confirm in utilities).
atomiclinear = AtomicLinear(50, 5)
assign_fixed_params(atomiclinear)

# Disabled alternative implementation, kept for reference:
#tatomiclinear = AtomicLinearTorch(50,5)
#verify(torchlinear, tatomiclinear, x)

# Check that the two layers produce matching outputs on x.
verify(torchlinear, atomiclinear, x)

In [8]:
# Forward-pass timing of the reference layer in training mode, four repeats.
torchlinear.train()
for _run in range(4):
    timing_fwd(torchlinear, x)

0.01688740470192649 +/- 0.0062367398067770825 ms
0.01637567173350941 +/- 0.005354101167111169 ms
0.015930045734752308 +/- 0.007992502326906412 ms
0.015571382310655382 +/- 0.02248797307509372 ms


In [9]:
# Forward-pass timing of the AtomicLinear layer in training mode, four repeats.
atomiclinear.train()
for _run in range(4):
    timing_fwd(atomiclinear, x)

0.06584102457219904 +/- 0.015801440864727973 ms
0.05933489462341925 +/- 0.055152096572485855 ms
0.05929672356807825 +/- 0.024729653906895428 ms
0.06663223709722962 +/- 0.0240427850455804 ms


In [10]:
# Forward + backward timing of the reference layer in training mode, four repeats.
torchlinear.train()
for _run in range(4):
    timing_bwd(torchlinear, x)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


0.19415275015012182 +/- 0.03382680670509419 ms
0.1822450425889757 +/- 0.025134561913912813 ms
0.1793385756136191 +/- 0.01995179039002832 ms
0.1976560823845141 +/- 0.039601250323909605 ms


In [11]:
# Forward + backward timing of the AtomicLinear layer in training mode, four repeats.
atomiclinear.train()
for _run in range(4):
    timing_bwd(atomiclinear, x)

0.3188059546730735 +/- 0.04396411505149049 ms
0.32028607647828383 +/- 0.0433615537967176 ms
0.32733820905589095 +/- 0.057573046510339905 ms
0.33203799315173216 +/- 0.07182746638358595 ms


In [4]:
# Reference output of atomiclinear on x; repeated evaluations are compared against it.
base = atomiclinear(x)

In [5]:
# Re-evaluate the layer 100 times and print the summed deviation from `base`;
# a non-zero value means repeated evaluations are not bit-identical.
for trial in range(100):
    deviation = atomiclinear(x) - base
    print(deviation.sum())

tensor(9.2387e-07, grad_fn=<SumBackward0>)
tensor(8.1956e-08, grad_fn=<SumBackward0>)
tensor(-1.6093e-06, grad_fn=<SumBackward0>)
tensor(1.6689e-06, grad_fn=<SumBackward0>)
tensor(2.9802e-07, grad_fn=<SumBackward0>)
tensor(5.9605e-07, grad_fn=<SumBackward0>)
tensor(3.4571e-06, grad_fn=<SumBackward0>)
tensor(1.7881e-07, grad_fn=<SumBackward0>)
tensor(1.6093e-06, grad_fn=<SumBackward0>)
tensor(1.6093e-06, grad_fn=<SumBackward0>)
tensor(-2.3842e-07, grad_fn=<SumBackward0>)
tensor(4.1723e-07, grad_fn=<SumBackward0>)
tensor(2.6226e-06, grad_fn=<SumBackward0>)
tensor(-3.5763e-07, grad_fn=<SumBackward0>)
tensor(3.4273e-06, grad_fn=<SumBackward0>)
tensor(-5.3644e-07, grad_fn=<SumBackward0>)
tensor(2.5928e-06, grad_fn=<SumBackward0>)
tensor(6.5565e-07, grad_fn=<SumBackward0>)
tensor(2.1458e-06, grad_fn=<SumBackward0>)
tensor(1.5795e-06, grad_fn=<SumBackward0>)
tensor(-2.6226e-06, grad_fn=<SumBackward0>)
tensor(1.7285e-06, grad_fn=<SumBackward0>)
tensor(5.9605e-07, grad_fn=<SumBackward0>)
tensor

In [7]:
# Batch of two independently drawn 10-feature random samples -> shape (2, 10).
x = torch.stack([torch.rand((10,)) for _ in range(2)])

# Two classifiers that differ only in the first constructor argument
# (presumably it toggles the atomic implementation -- confirm in Classifier).
nam, am = [Classifier(flag, 10, 2, 40) for flag in (False, True)]

In [8]:
# Output of nam (the Classifier built with first argument False) on x.
nam(x)

tensor([[-1.0121, -3.1367],
        [ 6.2277, 11.6180]], grad_fn=<AddmmBackward0>)

In [9]:
# Output of am (the Classifier built with first argument True) on x.
am(x)

tensor([[-1.0121, -3.1367],
        [ 6.2277, 11.6180]], grad_fn=<AddBackward0>)

In [10]:
# Take one evaluation of `am` as the reference, then print the summed
# deviation of 20 further evaluations from it.
base = am(x)
for trial in range(20):
    deviation = am(x) - base
    print(deviation.sum())

tensor(7.1526e-06, grad_fn=<SumBackward0>)
tensor(8.1062e-06, grad_fn=<SumBackward0>)
tensor(9.2983e-06, grad_fn=<SumBackward0>)
tensor(6.4373e-06, grad_fn=<SumBackward0>)
tensor(6.1989e-06, grad_fn=<SumBackward0>)
tensor(7.1526e-07, grad_fn=<SumBackward0>)
tensor(4.7684e-06, grad_fn=<SumBackward0>)
tensor(0., grad_fn=<SumBackward0>)
tensor(3.8147e-06, grad_fn=<SumBackward0>)
tensor(2.3842e-06, grad_fn=<SumBackward0>)
tensor(6.4373e-06, grad_fn=<SumBackward0>)
tensor(6.6757e-06, grad_fn=<SumBackward0>)
tensor(-3.8147e-06, grad_fn=<SumBackward0>)
tensor(7.6294e-06, grad_fn=<SumBackward0>)
tensor(8.5831e-06, grad_fn=<SumBackward0>)
tensor(5.4836e-06, grad_fn=<SumBackward0>)
tensor(6.4373e-06, grad_fn=<SumBackward0>)
tensor(8.8215e-06, grad_fn=<SumBackward0>)
tensor(3.0994e-06, grad_fn=<SumBackward0>)
tensor(9.0599e-06, grad_fn=<SumBackward0>)


## Try: `x[:, None, :]` vs. `x.unsqueeze(dim=1)`

Both expressions insert a new axis of size 1 at position 1.

Is there a timing difference between them?

In [13]:
import torch
import time

# Micro-benchmark: total time for 1000 insertions of a new axis via None-indexing.
x = torch.rand((100,100))


# perf_counter is monotonic and high resolution; time.time() can be too coarse
# for a measurement that only lasts a few milliseconds.
start = time.perf_counter()
for i in range(1000):
    y = x[:,None,:]

end = time.perf_counter()
print(end-start)

0.005891323089599609


In [14]:
import torch
import time

# Micro-benchmark: total time for 1000 insertions of a new axis via unsqueeze.
x = torch.rand((100,100))


# perf_counter is monotonic and high resolution; time.time() can be too coarse
# for a measurement that only lasts a few milliseconds.
start = time.perf_counter()
for i in range(1000):
    y = x.unsqueeze(dim=1)

end = time.perf_counter()
print(end-start)

0.0027556419372558594


In [15]:
# Shape of x after inserting a new axis at dim 1.
x.unsqueeze(dim=1).shape

torch.Size([100, 1, 100])

In [17]:
# Element-wise check that unsqueeze(dim=1) and [:, None, :] indexing give equal tensors.
(x.unsqueeze(dim=1) == x[:,None,:]).all()

tensor(True)