In [13]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import time

# CUDA source for a block-wise sum reduction of an int vector.
#
# Each thread copies one element a[index] (or 0 when index >= size) into the
# block's shared array, the block then folds that array in halves, and thread 0
# atomically adds the block total into *result.
#
# Launch requirements implied by the kernel code:
#   * blockDim.x must not exceed 256, the fixed length of sharedSum;
#   * blockDim.x should be a power of two, otherwise the halving loop skips
#     some shared-memory slots;
#   * the grid must supply at least `size` threads, since every thread reads
#     at most one element;
#   * *result must be zeroed before the launch, because blocks only add to it.
# Sums are accumulated in `int`, so totals outside its range overflow.
vector_sum_kernel = """
__global__ void addKernel(int* result, int* a, unsigned int size) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Variable for local sum in block
    __shared__ int sharedSum[256];  // block size (can be changed)

    int localSum = 0;

    // Sum elements
    if (index < size) {
        localSum = a[index];
    }

    // Write local sum to shared memory
    sharedSum[threadIdx.x] = localSum;
    __syncthreads();  // Thread synchronization

    // Perform reduction in shared memory
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            sharedSum[threadIdx.x] += sharedSum[threadIdx.x + stride];
        }
        __syncthreads();
    }

    // Write block result to global memory
    if (threadIdx.x == 0) {
        atomicAdd(result, sharedSum[0]);  // Use atomic for safe result increment
    }
}
"""


# Function for summing vector elements on GPU using CUDA
def vector_sum_gpu(vector):
    '''
    Sum the elements of a vector on the GPU using the ``addKernel`` CUDA kernel.

    The input is converted to a contiguous ``int32`` array before it is
    uploaded, because the kernel reads its input as 32-bit ``int``; values
    of any other dtype are cast to ``int32``. Multi-dimensional input is
    summed over all of its elements. The sum itself is accumulated in
    32-bit integers on the device.

    :param vector: input vector (array-like of integers)
    :return: tuple ``(answer, elapsed)`` where
      answer - sum of the elements as ``numpy.int32`` (0 for empty input),
      elapsed - time in seconds spent in this function, including kernel
                compilation and host/device transfers
    '''

    start_time = time.perf_counter()

    # The kernel indexes its input as 32-bit ints, so uploading any other
    # dtype (e.g. NumPy's default int64) would be misread on the device.
    vector = np.ascontiguousarray(vector, dtype=np.int32)
    element_count = vector.size

    # Nothing to upload or launch: a zero-byte allocation and a zero-sized
    # grid are both invalid, and the sum of no elements is 0.
    if element_count == 0:
        return np.int32(0), time.perf_counter() - start_time

    # Allocate memory on GPU
    vector_gpu = cuda.mem_alloc(vector.nbytes)
    initial_value = np.zeros(1, dtype=np.int32)
    result_gpu = cuda.mem_alloc(initial_value.nbytes)  # Memory for result

    # The kernel only adds to the result, so it has to start at zero
    cuda.memcpy_htod(result_gpu, initial_value)

    # Copy data to GPU
    cuda.memcpy_htod(vector_gpu, vector)

    # Compile and load CUDA code
    mod = SourceModule(vector_sum_kernel)
    vector_sum = mod.get_function("addKernel")

    # One thread per element. The block size must match the length of the
    # kernel's shared array (256) and be a power of two for its reduction.
    block_size = 256
    grid_size = (element_count + block_size - 1) // block_size

    # Launch kernel on GPU; the size is packed as uint32 to match the
    # kernel's `unsigned int size` parameter.
    vector_sum(
        result_gpu,
        vector_gpu,
        np.uint32(element_count),
        block=(block_size, 1, 1),
        grid=(grid_size, 1),
    )

    # Copy result from GPU to CPU
    result = np.empty(1, dtype=np.int32)
    cuda.memcpy_dtoh(result, result_gpu)

    end_time = time.perf_counter()

    return result[0], end_time - start_time


def vector_sum_cpu(vector):
    '''
    Add up the elements of a vector on the CPU.

    The elements are accumulated one by one onto an integer zero.
    :param vector: iterable of numbers to add up
    :return: the sum of all elements (0 when the input is empty)
    '''
    total = 0
    for value in vector:
        total += value
    return total


if __name__ == "__main__":
    # Benchmark: sum the same random int32 vector on the CPU and on the GPU,
    # printing each result together with its elapsed time.

    # Generate vector with random values
    vector_size = 8194000
    vector = np.random.randint(1, 10, size=vector_size, dtype=np.int32)

    # Sum on CPU. perf_counter() is a monotonic, high-resolution clock meant
    # for measuring intervals; time.time() can jump if the system clock is
    # adjusted while the benchmark runs.
    start_time_cpu = time.perf_counter()
    answer_cpu = vector_sum_cpu(vector)
    end_time_cpu = time.perf_counter()

    time_cpu = end_time_cpu - start_time_cpu
    print(f"Sum on CPU: {answer_cpu}, Execution time on CPU: {time_cpu} seconds")

    # Sum on GPU (the function measures and returns its own elapsed time)
    answer_gpu, time_gpu = vector_sum_gpu(vector)
    print(f"Sum on GPU: {answer_gpu}, Execution time on GPU: {time_gpu} seconds")

Sum on CPU: 40969262, Execution time on CPU: 0.6395096778869629 seconds
Sum on GPU: 40969262, Execution time on GPU: 0.020003557205200195 seconds
