In [9]:
import pycuda.autoinit

from pycuda.tools import make_default_context
make_default_context().get_device().name()

'Tesla T4'

In [15]:
import numpy as np
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools
import time

# -- initialize the device
import pycuda.autoinit

MATRIX_SIZE = 300

def matmul(a_gpu,b_gpu,MATRIX_SIZE=MATRIX_SIZE):
    kernel_code_template = """
    __global__ void MatrixMulKernel(float *A, float *B, float *C)
    {

      const uint wA = %(MATRIX_SIZE)s;
      const uint wB = %(MATRIX_SIZE)s;

      const uint bx = blockIdx.x;
      const uint by = blockIdx.y;

      const uint tx = threadIdx.x;
      const uint ty = threadIdx.y;

      const uint aBegin = wA * %(BLOCK_SIZE)s * by;
      const uint aEnd = aBegin + wA - 1;

      const uint aStep = %(BLOCK_SIZE)s;

      const uint bBegin = %(BLOCK_SIZE)s * bx;
      const uint bStep = %(BLOCK_SIZE)s * wB;

      float Csub = 0;

      for (int a = aBegin, b = bBegin;
           a <= aEnd;
           a += aStep, b += bStep)
        {

          __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

          __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

          As[ty][tx] = A[a + wA * ty + tx];
          Bs[ty][tx] = B[b + wB * ty + tx];

          __syncthreads();
          for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
            Csub += As[ty][k] * Bs[k][tx];
          __syncthreads();
        }

      const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
      C[c + wB * ty + tx] = Csub;
    }
    """

    TILE_SIZE = 2
    BLOCK_SIZE = TILE_SIZE

    kernel_code = kernel_code_template % {
        'MATRIX_SIZE': MATRIX_SIZE,
        'BLOCK_SIZE': BLOCK_SIZE,
        }

    mod = compiler.SourceModule(kernel_code)
    
    c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

    matrixmul = mod.get_function("MatrixMulKernel")

    matrixmul(
        a_gpu, b_gpu,
        c_gpu,
        # Сетка для блоков
        grid = (MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE),
        # Блок из тредов
        block = (TILE_SIZE, TILE_SIZE, 1),
        )

    return c_gpu


def matmul_CPU(matrix1, matrix2):
    rmatrix = np.zeros(shape=(matrix1.shape[0], matrix2.shape[1]))
    for i in range(len(matrix1)):
        for j in range(len(matrix2[0])):
            for k in range(len(matrix2)):
                rmatrix[i][j] += matrix1[i][k] * matrix2[k][j]
    return rmatrix

Замер времени умножения матриц 128x128, 256x256, 512x512 и 1024x1024




In [17]:
cpu_time = []
gpu_time = []
diffs = []

for size in [128, 256, 512, 1024]:
    a_cpu = np.random.randn(size, size).astype(np.float32)
    b_cpu = np.random.randn(size, size).astype(np.float32)

    start=time.clock()
    c_cpu = matmul_CPU(a_cpu, b_cpu)
    end = time.clock()
    print("size:", size)
    
    time_ = end - start
    cpu_time.append(time_)
    print("CPU time:", time_)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    start = time.clock()
    c_gpu = matmul(a_gpu, b_gpu, size)
    end = time.clock()

    time_ = end - start
    gpu_time.append(time_)
    print("GPU time:", time_)

    print ("-" * 80)
    print ("CPU-GPU difference:")

    diff = sum(sum(c_cpu - c_gpu.get()))
    diffs.append(diff)
    print(diff)

size: 128
CPU time: 2.598510000000033
GPU time: 0.002058000000033644
--------------------------------------------------------------------------------
CPU-GPU difference:
9.301743054379585e-05
size: 256
CPU time: 20.652914999999894
GPU time: 0.005116000000043641
--------------------------------------------------------------------------------
CPU-GPU difference:
-0.001011993295183089
size: 512
CPU time: 165.90851799999996
GPU time: 0.022860000000036962
--------------------------------------------------------------------------------
CPU-GPU difference:
-0.006009181734427216
size: 1024
CPU time: 1285.541839
GPU time: 0.18476400000008653
--------------------------------------------------------------------------------
CPU-GPU difference:
0.014015535383305355


In [23]:
import pandas as pd

pd.DataFrame({'CPU time': cpu_time, 'GPU time': gpu_time, 'Difference': diffs}, index=[128, 256, 512, 1024])

Unnamed: 0,CPU time,GPU time,Difference
128,2.59851,0.002058,9.3e-05
256,20.652915,0.005116,-0.001012
512,165.908518,0.02286,-0.006009
1024,1285.541839,0.184764,0.014016


### Разница во времени

Матрица 128x128

In [25]:
cpu_time[0] - gpu_time[0]

2.5964519999999993

Матрица 256x256

In [26]:
cpu_time[1] - gpu_time[1]

20.64779899999985

Матрица 512x512

In [27]:
cpu_time[2] - gpu_time[2]

165.88565799999992

Матрица 1024x1024

In [28]:
cpu_time[3] - gpu_time[3]

1285.357075