In [None]:
import pycuda.autoinit

from pycuda.tools import make_default_context
make_default_context().get_device().name()

'Tesla T4'

In [49]:
import numpy as np
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools
import time

import pycuda.autoinit

MATRIX_SIZE = 300

def matmul_GPU(a_gpu,b_gpu,MATRIX_SIZE=MATRIX_SIZE):
    kernel_code_template = """
    __global__ void MatrixMulKernel(float *A, float *B, float *C)
    {

      const uint wA = %(MATRIX_SIZE)s;
      const uint wB = %(MATRIX_SIZE)s;

      const uint bx = blockIdx.x;
      const uint by = blockIdx.y;

      const uint tx = threadIdx.x;
      const uint ty = threadIdx.y;

      const uint aBegin = wA * %(BLOCK_SIZE)s * by;
      const uint aEnd = aBegin + wA - 1;

      const uint aStep = %(BLOCK_SIZE)s;

      const uint bBegin = %(BLOCK_SIZE)s * bx;
      const uint bStep = %(BLOCK_SIZE)s * wB;

      float Csub = 0;

      for (int a = aBegin, b = bBegin;
           a <= aEnd;
           a += aStep, b += bStep)
        {

          __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

          __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

          As[ty][tx] = A[a + wA * ty + tx];
          Bs[ty][tx] = B[b + wB * ty + tx];

          __syncthreads();
          for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
            Csub += As[ty][k] * Bs[k][tx];
          __syncthreads();
        }

      const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
      C[c + wB * ty + tx] = Csub;
    }
    """

    TILE_SIZE = 2
    BLOCK_SIZE = TILE_SIZE

    kernel_code = kernel_code_template % {
        'MATRIX_SIZE': MATRIX_SIZE,
        'BLOCK_SIZE': BLOCK_SIZE,
        }

    mod = compiler.SourceModule(kernel_code)
    
    c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

    matrixmul = mod.get_function("MatrixMulKernel")

    matrixmul(
        a_gpu, b_gpu,
        c_gpu,
        grid = (MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE),
        block = (TILE_SIZE, TILE_SIZE, 1),
        )

    return c_gpu


def matmul_CPU(matrix1, matrix2):
    rmatrix = np.zeros(shape=(matrix1.shape[0], matrix2.shape[1]))
    for i in range(len(matrix1)):
        for j in range(len(matrix2[0])):
            for k in range(len(matrix2)):
                rmatrix[i][j] += matrix1[i][k] * matrix2[k][j]
    return rmatrix

Замер времени умножения матриц 128x128, 256x256, 512x512 и 1024x1024




In [50]:
cpu_time = []
gpu_time = []
diffs = []

for size in [128, 256, 512, 1024]:
    a_cpu = np.random.randn(size, size).astype(np.float32)
    b_cpu = np.random.randn(size, size).astype(np.float32)

    print("Размерность матрицы:", size)
    
    startCPU = time.time()
    c_cpu = matmul_CPU(a_cpu, b_cpu)
    endCPU = time.time()
    timeCPU = endCPU -startCPU

    print("время на CPU:", timeCPU)
    cpu_time.append(timeCPU)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    startGPU = time.time()
    c_gpu = matmul_GPU(a_gpu, b_gpu, size)
    endGPU = time.time()
    timeGPU = endGPU-startGPU

    print("время на GPU:", timeGPU)
    gpu_time.append(timeGPU)

    differensetime = timeCPU-timeGPU
    print("CPU-GPU:", differensetime)
    diffs.append(differensetime)
    
    print ("___________________________")

Размерность матрицы: 128
время на CPU: 2.440417528152466
время на GPU: 0.30411243438720703
CPU-GPU: 2.136305093765259
___________________________
Размерность матрицы: 256
время на CPU: 19.03043556213379
время на GPU: 0.2987957000732422
CPU-GPU: 18.731639862060547
___________________________
Размерность матрицы: 512
время на CPU: 156.36971473693848
время на GPU: 0.3252756595611572
CPU-GPU: 156.04443907737732
___________________________
Размерность матрицы: 1024
время на CPU: 1287.555109500885
время на GPU: 0.47460508346557617
CPU-GPU: 1287.0805044174194
___________________________


In [51]:
import pandas as pd

pd.DataFrame({'Время на CPU': cpu_time, 'Время на GPU': gpu_time, 'Разница': diffs}, index=[128, 256, 512, 1024])

Unnamed: 0,Время на CPU,Время на GPU,Разница
128,2.440418,0.304112,2.136305
256,19.030436,0.298796,18.73164
512,156.369715,0.325276,156.044439
1024,1287.55511,0.474605,1287.080504
