# Benchmark: dot product

# Benchmark an operation on a small rank-three tensor.

In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def r3py(a):
    """Set every entry of a rank 3 nested-list tensor to one, in place.

    The tensor is indexed as ``a[t][j][i]`` and is treated as rectangular:
    the extents are read from ``a``, ``a[0]`` and ``a[0][0]`` only. The loop
    order (``i`` outermost, ``t`` innermost) is left unchanged so timings of
    this function stay comparable.

    Args:
        a: Nested list of lists of lists; modified in place.

    Returns:
        None. A tensor with any zero-length leading axis is left untouched
        instead of raising ``IndexError``.
    """
    # Guard before probing a[0][0]: an empty axis means there is nothing to set.
    if not a or not a[0] or not a[0][0]:
        return
    I, J, T = len(a[0][0]), len(a[0]), len(a)
    for i in range(I):
        for j in range(J):
            for t in range(T):
                a[t][j][i] = 1

In [None]:
# Nested-list tensor indexed as a[t][j][i]: 10 outer lists, each holding 1024 lists of 1024 zeros.
a = [[[0 for _ in range(1024)] for _ in range(1024)] for _ in range(10)]

In [None]:
# Size of one innermost list times the number of innermost lists.
# sys.getsizeof reports only the list object itself, not the objects it refers to,
# and the two outer list levels are not counted here.
sys.getsizeof(a[0][0]) * len(a[0]) * len(a)

Around 8 bytes per entry for a pointer. An integer takes up 28 bytes.

In [None]:
# Object size in bytes for a few integers of increasing magnitude, from 0 up to sys.maxsize.
sys.getsizeof(0), sys.getsizeof(1), sys.getsizeof(2**3), sys.getsizeof(2**8), sys.getsizeof(sys.maxsize)

In [None]:
# Object size of each power of two 2**0 .. 2**127; the Series index is the exponent.
s = pd.Series([sys.getsizeof(1 << exponent) for exponent in range(128)])

In [None]:
# Plot the integer object sizes held in `s` against the Series index.
ax = s.plot(kind='line', style='b-', grid=True, title='integers in memory')
ax.set_xlabel('2^n')
ax.set_ylabel('sizeof')
# plt.plot() with no data draws nothing and only echoes an empty list;
# plt.show() is the call that renders the figure.
plt.show()

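The 1024x1024x10 pure python array holds 10485760 entries: about 92405760 bytes (around 88M) for the list pointers alone, plus up to 28 bytes per distinct integer object (around 280M more if every entry were its own object).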
Let's set the value of each element to some value.

In [None]:
%timeit r3py(a) # 2.22 s ± 45.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

How about floats?

> One of the major challenges in writing (somewhat) large-scale Python programs is to keep memory usage at a minimum.

In [None]:
# Object size in bytes of a very large float (2.0**1023 is close to the upper end of sys.float_info.max on IEEE 754 doubles).
sys.getsizeof(2.0**1023)

In [None]:
sys.float_info

### Numpy version

In [None]:
import numpy as np

In [None]:
# Zero-filled array with extents 1024 x 1024 x 10; dtype is left at NumPy's default.
ar = np.zeros(shape=(1024, 1024, 10))

In [None]:
ar.nbytes

In [None]:
ar.dtype

The numpy version is much more compact: 83886080 bytes (around 80M). We can set an explicit type.

In [None]:
# Same extents, but one unsigned byte per entry instead of the default dtype.
ar = np.zeros(shape=(1024, 1024, 10), dtype=np.uint8)

In [None]:
ar.nbytes

Numpy float information.

In [None]:
# Summarise each floating point type: dtype, exponent bits, mantissa bits and value range.
# np.longdouble is used instead of np.float128, which is not defined on every
# platform; where np.float128 exists it is the same type as np.longdouble.
# np.finfo is looked up once per row rather than once per column.
finfo_table = pd.DataFrame(
    [(info.dtype, info.nexp, info.nmant, info.min, info.max)
     for info in map(np.finfo, (np.float32, np.float64, np.longdouble, float))],
    columns=['name', 'nexp', 'nmant', 'min', 'max'])
finfo_table

In [None]:
# Let's work with ints.

In [None]:
# Fresh unsigned-byte array of zeros (1024 x 1024 x 10) for the timing that follows.
ar = np.zeros(shape=(1024, 1024, 10), dtype=np.uint8)

In [None]:
%timeit ar + 1 # 4.83 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

# PyTorch and GPU

In [None]:
import torch

In [None]:
# 5 x 3 tensor of 32-bit integer zeros; torch.zeros with an explicit dtype
# replaces the legacy IntTensor(...).zero_() constructor idiom.
torch.zeros(5, 3, dtype=torch.int32)

In [None]:
# Elementwise add on a 5 x 3 tensor of 32-bit integer zeros; torch.zeros with an
# explicit dtype replaces the legacy IntTensor(...).zero_() constructor idiom.
torch.zeros(5, 3, dtype=torch.int32) + 1

In [None]:
# 1024 x 1024 x 10 tensor of 32-bit integer zeros on the CPU; torch.zeros with an
# explicit dtype replaces the legacy IntTensor(...).zero_() constructor idiom.
t = torch.zeros(1024, 1024, 10, dtype=torch.int32)

In [None]:
%timeit t + 1 # 19.4 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
# t.cuda?

> Returns a copy of this object in CUDA memory.

> If this object is already in CUDA memory and on the correct device, then no copy is performed and the original object is returned.

In [None]:
# Copy the tensor to the GPU only when CUDA is usable. torch.cuda.is_available()
# is the documented availability check; without it, calling t.cuda() on a
# CPU-only build fails with "AssertionError: Torch not compiled with CUDA enabled".
if torch.cuda.is_available():
    gt = t.cuda()
else:
    print("GPU? 乁( ◔ ౪◔)「      ┑(￣Д ￣)┍ ")

Note: Next cells will **only** work on **GPU**.

In [None]:
# Unconditional copy of `t` into CUDA memory; this raises on a machine without a usable GPU.
gt = t.cuda()

In [None]:
type(gt)

In [None]:
gt.size()

In [None]:
gt.get_device()

In [None]:
gt.element_size()

In [None]:
gt.is_cuda

In [None]:
%timeit gt + 1 # 2.42 ms ± 8.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [None]:
# One-element 32-bit integer tensor on the CPU; torch.tensor with an explicit
# dtype replaces the legacy IntTensor([...]) constructor.
one = torch.tensor([1], dtype=torch.int32)

In [None]:
one

In [None]:
one.is_cuda

In [None]:
# CUDA copy of `one`, so that it can be added to the GPU tensor `gt` in the timing further down.
oneg = one.cuda()

In [None]:
oneg.is_cuda

In [None]:
%timeit gt + oneg # 2.5 ms ± 7.31 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)