# CUDA Exercise 04
> Matrix summation example on GPU, only applied with single thread.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_04.ipynb)

## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
# !pip install git+git://github.com/depctg/nvcc4jupyter.git
# %load_ext nvcc_plugin
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
# Check the environment
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp5cnsj9t5".
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sat Jun  7 03:10:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persiste

## Matrix Summation

In [3]:
%%cuda
#include <stdio.h>
#include <assert.h>

#define M 10
#define N 10
#define MAX_ERR 1e-4

__global__ void matrix_summation(float* out, float *a, float *b, int m, int n)
{
  int index;
  for(int i = 0; i < m; i++)
  {
      for(int j = 0; j < n; j++)
      {
          index = i*n+j;
          out[index] = a[index] + b[index];
      }
  }
}

int main()
{
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    a = (float*)malloc(sizeof(float) * (M * N));
    b = (float*)malloc(sizeof(float) * (M * N));
    out = (float*)malloc(sizeof(float) * (M * N));

    // data initializtion
    for(int i = 0; i < M; i++)
    {
        for(int j = 0; j < N; j++)
        {
            int index = i*N+j;
            a[index] = i*3.14f;
            b[index] = j;
        }
    }
    printf("a[12] = %f\n", a[12]);
    printf("b[12] = %f\n", b[12]);

    // Allocate memory on GPU
    cudaMalloc((void**)&d_a, sizeof(float) * (M * N));
    cudaMalloc((void**)&d_b, sizeof(float) * (M * N));
    cudaMalloc((void**)&d_out, sizeof(float) * (M * N));

    // copy operator to GPU
    cudaMemcpy(d_a, a, sizeof(float) * (M * N), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * (M * N), cudaMemcpyHostToDevice);

    // GPU do the work, CPU waits
    matrix_summation<<<1,1>>>(d_out, d_a, d_b, M, N);

    // Get results from the GPU
    cudaMemcpy(out, d_out, sizeof(float) * (M * N),
               cudaMemcpyDeviceToHost);

    // Test the result
    for(int i = 0; i < M; i++)
    {
        for(int j = 0; j < N; j++)
        {
            int index = i*N+j;
            //assert(fabs(out[index] - a[index] - b[index]) < MAX_ERR);
        }
    }
    printf("out[12] = %f\n", out[12]);
    printf("PASSED\n");

    cudaDeviceSynchronize();
    // Free the memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    return 0;
}

a[12] = 3.140000
b[12] = 2.000000
out[12] = 0.000000
PASSED

