# Performing SVD with Dask and CuPy

This is a communication-heavy computation. It currently runs faster on GPU than on CPU, and communication improvements in UCX should speed it up further.

## Use a DGX

In [None]:
import dask
import dask.config

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local cluster of GPU workers and connect a client to it.
# `dashboard_address` replaces the deprecated `diagnostics_port` keyword;
# the diagnostics dashboard is still served on port 9100.
cluster = LocalCUDACluster(dashboard_address=":9100")
client = Client(cluster)
client  # last expression in the cell: shows the client summary in the notebook

## Create Random Dataset

This is done on GPU.

In [None]:
import dask
import dask.array
import cupy
import numpy


# The `RandomState` argument selects the library that generates each chunk.
# To run on CPU instead, swap in the commented-out line:
# rs = dask.array.random.RandomState(RandomState=numpy.random.RandomState)
rs = dask.array.random.RandomState(RandomState=cupy.random.RandomState)

# 1,000,000 x 1,000 random matrix split into row blocks of 10,000 rows
# (each chunk spans all 1,000 columns). persist() asks the cluster to
# materialise the chunks and keep them in memory for the following cells.
x = rs.random((1000000, 1000), chunks=(10000, 1000)).persist()

## Compute SVD

This computes SVD on GPU and has some communication heavy steps.

In [None]:
import dask.array.linalg

# Build the SVD of `x` as three dask arrays. This only constructs the task
# graph; the work is triggered by the dask.persist call in a later cell.
u, s, v = dask.array.linalg.svd(x)

In [None]:
import time
# Wall-clock timestamp taken just before the SVD is executed; it is passed
# to client.profile(start=...) later in the notebook.
start = time.time()

In [None]:
# Submit the three SVD factors to the cluster and rebind the names to the
# persisted arrays.
u, s, v = dask.persist(u, s, v)
# Block until all three results are finished; the %time magic reports how
# long the wait took.
%time _ = wait([u, s, v])

In [None]:
# Show the shape of each SVD factor, one per line.
print(u.shape, s.shape, v.shape, sep="\n")

In [None]:
# Collect the cluster's profiling data recorded since `start` and save the
# plot to 'dask-cupy-svd.html'. The return value is assigned to `_` so the
# notebook does not display it.
_ = client.profile(start=start, filename='dask-cupy-svd.html')

## Inspect output

In [None]:
# Pull a small corner of each factor back to the client and print it.
# The slices are lazy; each one is computed right before it is printed.
for corner in (u[:10, :10], s[:10], v[:10, :10]):
    print(corner.compute())

In [None]:
from distributed.utils import format_bytes

In [None]:
# Human-readable memory footprint of each SVD factor, one per line.
print(*(format_bytes(factor.nbytes) for factor in (u, s, v)), sep="\n")