https://cupy.dev

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import cupy as cp
import time 

In [2]:
cp.__version__

'9.5.0'

In [3]:
cp.cuda.runtime.driverGetVersion(), cp.cuda.runtime.runtimeGetVersion()

(11040, 11040)

In [4]:
cp.cuda.runtime.getDevice()

0

In [5]:
cp.cuda.runtime.getDeviceProperties(0)

{'name': b'NVIDIA GeForce GTX 1060 6GB',
 'totalGlobalMem': 6370295808,
 'sharedMemPerBlock': 49152,
 'regsPerBlock': 65536,
 'warpSize': 32,
 'maxThreadsPerBlock': 1024,
 'maxThreadsDim': (1024, 1024, 64),
 'maxGridSize': (2147483647, 65535, 65535),
 'clockRate': 1847500,
 'totalConstMem': 65536,
 'major': 6,
 'minor': 1,
 'textureAlignment': 512,
 'texturePitchAlignment': 32,
 'multiProcessorCount': 10,
 'kernelExecTimeoutEnabled': 1,
 'integrated': 0,
 'canMapHostMemory': 1,
 'computeMode': 0,
 'maxTexture1D': 131072,
 'maxTexture2D': (131072, 65536),
 'maxTexture3D': (16384, 16384, 16384),
 'concurrentKernels': 1,
 'ECCEnabled': 0,
 'pciBusID': 1,
 'pciDeviceID': 0,
 'pciDomainID': 0,
 'tccDriver': 0,
 'memoryClockRate': 4004000,
 'memoryBusWidth': 192,
 'l2CacheSize': 1572864,
 'maxThreadsPerMultiProcessor': 2048,
 'isMultiGpuBoard': 0,
 'cooperativeLaunch': 1,
 'cooperativeMultiDeviceLaunch': 1,
 'deviceOverlap': 1,
 'maxTexture1DMipmap': 16384,
 'maxTexture1DLinear': 268435456,


---

In [6]:
### Numpy and CPU
s = time.time()
x_cpu = np.ones((500,1000,1000))
e = time.time()
print(e - s)

0.4086003303527832


In [7]:
### CuPy and GPU
s = time.time()
x_gpu = cp.ones((500,1000,1000))
cp.cuda.Stream.null.synchronize()
e = time.time()
print(e - s)

0.2104206085205078


In [8]:
del x_gpu

----

In [9]:
from numpy import random as rng

In [10]:
%%time

x_cpu = rng.normal(size=(10_000,5_000))
y_cpu = rng.uniform(size=(20_000,5_000))

CPU times: user 1.64 s, sys: 53 ms, total: 1.69 s
Wall time: 1.69 s


In [11]:
%%time

### Numpy and CPU
r = x_cpu.dot(y_cpu.T)

CPU times: user 40.9 s, sys: 646 ms, total: 41.5 s
Wall time: 10.8 s


---

In [12]:
%%time

x_gpu = cp.array(x_cpu)
y_gpu = cp.array(y_cpu)

CPU times: user 155 ms, sys: 467 ms, total: 623 ms
Wall time: 450 ms


In [13]:
%%time

### CuPy and GPU
r = x_gpu.dot(y_gpu.T)
# cp.cuda.Stream.null.synchronize()

CPU times: user 245 ms, sys: 60.1 ms, total: 305 ms
Wall time: 306 ms


In [14]:
del x_gpu
del y_gpu
del r