# PyCLA

This notebook contains a quickstart for the `pycla` library.

In [1]:
# Core entities
from pycla import Vector, Matrix

# Utilities for CUDA device information
from pycla.core.cuda_device import Devices

# Contexts for intensive computation
from pycla.core import ShareDestionationVector, ShareDestionationMatrix

## Vector/Matrix Instantiation

In [2]:
# Vectors and matrices can be instantiated directly from Python lists/sequences
#   (a flat list for a Vector, a list of rows for a Matrix).
vector = Vector([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
matrix = Matrix([[1, 2, 3, 4], [1, 2, 3, 4]])

In [3]:
# All classes implement the __repr__ method, which provides a human-readable
#   format showing the values, the dimensions and the device.
vector

Vector([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dims=10, device=CPU)

In [4]:
# The Matrix repr shows its rows, its dims and the device it lives on.
matrix

Matrix([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], dims=(2, 4), device=CPU)

## CUDA Management

In [5]:
# The `Devices` class implements a Singleton that
#  exposes all available CUDA devices.
# `has_cuda` reports whether CUDA is available (True on the machine
#  that produced the recorded output).
devices = Devices()
devices.has_cuda

True

In [6]:
# We can easily find the number of available devices
devices.count

1

In [7]:
# Or list all available devices: iterating over `devices`
#  yields CUDADevice entries
list(devices)

[CUDADevice(id=0, name='NVIDIA GeForce GTX 1650', max_grid=(2147483647, 65535, 65535), max_block=(1024, 1024, 64), max_threads_per_block=1024)]

In [8]:
# Devices can be accessed by id or by name (here: by id)
devices[0]

CUDADevice(id=0, name='NVIDIA GeForce GTX 1650', max_grid=(2147483647, 65535, 65535), max_block=(1024, 1024, 64), max_threads_per_block=1024)

In [9]:
# Devices can also be looked up by name.
# All devices contain basic information about the GPU
#  (id, name, grid/block limits, max threads per block).
devices['NVIDIA GeForce GTX 1650']

CUDADevice(id=0, name='NVIDIA GeForce GTX 1650', max_grid=(2147483647, 65535, 65535), max_block=(1024, 1024, 64), max_threads_per_block=1024)

In [10]:
# Vectors and matrices can be moved back and forth between the CPU and the GPU
#    with the `.to(...)` and `.cpu()` methods.
# Once an object is on the GPU, we cannot directly read its data from the CPU;
#    however, we can still retrieve its metadata (i.e., shape, device name, etc.).
# NOTE(review): `.to(0)` targets the CUDA device with id 0, so this cell
#    presumably fails on a machine without CUDA -- check `devices.has_cuda` first.
vector.to(0)

Vector([<gpu>], dims=10, device=CUDADevice(id=0, name="NVIDIA GeForce GTX 1650"))

In [11]:
# Matrices are moved to a CUDA device the same way (here: device id 0).
matrix.to(0)

Matrix([<gpu>], dims=(2, 4), device=CUDADevice(id=0, name="NVIDIA GeForce GTX 1650"))

In [12]:
# We can bring an object back to the CPU with either
#   .to(None) or .cpu() calls; once there, its data is readable again.
vector.to(None)

Vector([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dims=10, device=CPU)

In [13]:
# .cpu() brings the matrix back to the CPU.
matrix.cpu()

Matrix([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], dims=(2, 4), device=CPU)

## Vector Operations

In [14]:
# The Vector class overrides the built-in operators
#  of Python. Most of the time, an operation
#  returns a new Vector instead of updating the current one
#  in place.
# `Vector.has_shared_data` shows this: the result of `**` does not
#  share data with the original vector (prints False).
result = vector ** 2
print(Vector.has_shared_data(result, vector))
result

False


Vector([1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0], dims=10, device=CPU)

In [15]:
# The add operator (+) performs element-wise addition.
result + vector

Vector([2.0, 6.0, 12.0, 20.0, 30.0, 42.0, 56.0, 72.0, 90.0, 110.0], dims=10, device=CPU)

In [16]:
# The sub operator (-) performs element-wise subtraction.
result - vector

Vector([0.0, 2.0, 6.0, 12.0, 20.0, 30.0, 42.0, 56.0, 72.0, 90.0], dims=10, device=CPU)

In [17]:
# The mul operator (*) implements element-wise multiplication
#  or multiplication by a scalar (the scalar case is shown here)
2 * vector

Vector([2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0], dims=10, device=CPU)

In [18]:
# Between two vectors, (*) multiplies element by element.
result * vector

Vector([1.0, 8.0, 27.0, 64.0, 125.0, 216.0, 343.0, 512.0, 729.0, 1000.0], dims=10, device=CPU)

In [19]:
# The matmul operator (@) implements the dot product,
#  which yields a single number rather than a Vector
vector @ vector

385.0

In [20]:
# The Vector class also exposes norms and some utilities.
# l2() is the Euclidean norm, i.e. sqrt(385) for this vector.
vector.l2()

19.621416870348583

In [21]:
# max() yields 10.0 for this vector.
# NOTE(review): unclear from here whether this is the largest element
#  or the max (infinity) norm -- confirm against the pycla docs.
vector.max()

10.0

In [22]:
# lp(p) is the general p-norm; here p = 3, i.e. the cube root of the sum of cubes.
vector.lp(3.0)

14.462447418811122

In [23]:
# Orthogonality check: these two vectors lie along different axes,
#  so their dot product is zero and the check returns True
Vector([2.0, 0.0]).is_orthogonal(Vector([0.0, 2.0]))

True

In [24]:
# We can retrieve the angle between two vectors in rad/deg;
#  the `unit` argument selects which one (degrees here)
Vector([2.0, 0.0]).angle_to(Vector([0.0, 2.0]), unit="deg")

90.0

In [25]:
# We can also directly release the memory allocated
#  for a vector with `.release()`.
# The name is then deleted from the namespace, so later
#  cells must not refer to `vector` anymore.
vector.release()
del vector

## Matrix Operations

In [26]:
# In the same way we can operate on vectors,
#  the Matrix class exposes similar functionality
#  (here: multiplication by a scalar).
matrix * 2

Matrix([[2.0, 4.0, 6.0, 8.0], [2.0, 4.0, 6.0, 8.0]], dims=(2, 4), device=CPU)

In [27]:
# Element-wise subtraction: a matrix minus itself is all zeros.
matrix - matrix

Matrix([[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], dims=(2, 4), device=CPU)

In [28]:
# The matmul operator (@) implements matrix multiplication:
#  a (2, 2) matrix times a (2, 1) matrix gives a (2, 1) matrix
Matrix([[2.0, 2.0], [2.0, 2.0]]) @ Matrix([[2.0], [2.0]])

Matrix([[8.0], [8.0]], dims=(2, 1), device=CPU)

In [29]:
# We can retrieve the trace (sum of the main diagonal)
#  for square matrices: 2.0 + (-1.0) = 1.0 here
Matrix([[2.0, 2.0], [0.0, -1.0]]).trace()

1.0

In [30]:
# The same goes for norms, e.g. the Frobenius norm
#  (square root of the sum of squared entries)
matrix.frobenius()

7.745966692414834

In [31]:
# We can also directly release the memory allocated
#  for a Matrix with `.release()`.
# The name is then deleted from the namespace, so later
#  cells must not refer to `matrix` anymore.
matrix.release()
del matrix

## Avoiding Vector/Matrix allocation

In [32]:
# Whenever we apply an operation on a Vector/Matrix,
#   a new object is allocated in memory to store the result.
# The only exceptions are the in-place ('i') operations (e.g., *=, +=, -=, etc.),
#   which edit the object in place.
# However, for some intensive computations, it is desirable to
#   waste as little memory and time as possible. Thus, the
#   ShareDestionation{Vector,Matrix} contexts (spelled exactly as they are
#   imported from pycla.core) allow for using a single shared object
#   for most operations with vectors and matrices.
a = Vector([1.0] * 10)
b = Vector([2.0] * 10)
with ShareDestionationVector(a, b) as result:
    op1 = a + b
    op2 = result * 2
    op3 = result / 2

# op1, op2 and op3 all share their data with `result`, so `result`
#  holds the outcome of the last operation: ((1 + 2) * 2) / 2 = 3.
result

Vector([3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], dims=10, device=CPU)

In [33]:
# op1 shares its data with `result`.
Vector.has_shared_data(op1, result)

True

In [34]:
# op2 shares its data with `result` as well.
Vector.has_shared_data(op2, result)

True

In [35]:
# And so does op3.
Vector.has_shared_data(op3, result)

True