<a href="https://colab.research.google.com/github/maomaodedipan/GPU/blob/main/Assignment2_question2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can work with files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/DD2360/Assignment2/question2

/content/drive/MyDrive/DD2360/Assignment2/question2


In [None]:
%%writefile vectorMultiplication.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define DataType double

// Fill a row-major rows x columns matrix with pseudo-random values.
// Each element is rand() divided by RAND_MAX, i.e. a value in [0, 1].
// Elements are written in row-major order, one rand() call per element.
void initializeMatrix(DataType *matrix, int rows, int columns) {
    for (int r = 0; r < rows; ++r) {
        const int rowStart = r * columns;  // offset of the first element of row r
        for (int c = 0; c < columns; ++c) {
            const DataType sample = (DataType)(rand());
            matrix[rowStart + c] = sample / RAND_MAX;
        }
    }
}

// Reference matrix product computed on the host: C = A * B.
//   A is numRowsA x numColsA, B is numColsA x numColsB, C is numRowsA x numColsB.
// All three matrices are stored row-major in flat arrays.
void matrixMultiplicationCPU(DataType *A, DataType *B, DataType *C, int numRowsA, int numColsA, int numColsB) {
    for (int row = 0; row < numRowsA; ++row) {
        const int aRowStart = row * numColsA;  // start of row `row` in A
        const int cRowStart = row * numColsB;  // start of row `row` in C
        for (int col = 0; col < numColsB; ++col) {
            // Dot product of row `row` of A with column `col` of B.
            DataType acc = 0.0;
            for (int inner = 0; inner < numColsA; ++inner) {
                acc += A[aRowStart + inner] * B[inner * numColsB + col];
            }
            C[cRowStart + col] = acc;
        }
    }
}

// Compare two row-major numRows x numCols matrices element by element.
// Returns true when every pair of elements differs by at most 1e-5 (absolute).
// On the first mismatch the two offending values are printed and false is returned.
bool areMatricesEqual(DataType *matrix1, DataType *matrix2, int numRows, int numCols) {
    const DataType tolerance = 1e-5;
    for (int i = 0; i < numRows; ++i) {
        for (int j = 0; j < numCols; ++j) {
            const int idx = i * numCols + j;
            const DataType lhs = matrix1[idx];
            const DataType rhs = matrix2[idx];
            // Exactly equal values (including matching infinities) always pass.
            if (lhs == rhs) {
                continue;
            }
            const DataType diff = fabs(lhs - rhs);
            // Written as !(diff <= tolerance) rather than (diff > tolerance):
            // any comparison with NaN is false, so the original form silently
            // accepted NaN results as "equal". This form reports them as a mismatch.
            if (!(diff <= tolerance)) {
                printf("matrix1: %f,matrix2: %f", lhs, rhs);
                return false;
            }
        }
    }
    return true;
}

// Matrix-multiplication kernel: C = A * B with row-major flat arrays.
// Each thread computes at most one element of C, selected by its 2D
// block/thread index (x -> column, y -> row). Threads whose index falls
// outside the numARows x numBColumns output do nothing.
// numBRows is accepted for interface symmetry but is not read here.
__global__ void gemm(DataType *A, DataType *B, DataType *C, int numARows,
                      int numAColumns, int numBRows, int numBColumns) {
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: the grid may be larger than the output matrix.
    if (outRow >= numARows || outCol >= numBColumns) {
        return;
    }

    // Dot product of row outRow of A with column outCol of B.
    DataType acc = 0.0;
    for (int k = 0; k < numAColumns; ++k) {
        acc += A[outRow * numAColumns + k] * B[k * numBColumns + outCol];
    }
    C[outRow * numBColumns + outCol] = acc;
}

// Return the current time of day in seconds as a double, obtained from
// gettimeofday() (whole seconds plus the microsecond part scaled by 1e-6).
double CPUtimer(){
  struct timeval now;
  gettimeofday(&now, NULL);
  const double wholeSeconds = (double)now.tv_sec;
  const double microSeconds = (double)now.tv_usec;
  return (wholeSeconds + microSeconds * 1e-6);
}


// Abort with a diagnostic if a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Entry point.
// Usage: ./a.out numARows numAColumns numBRows numBColumns
// Builds random matrices A and B on the host, computes a CPU reference
// product, runs the gemm kernel on the GPU, reports the host-measured time of
// the H2D copy, the kernel, and the D2H copy, and finally compares the GPU
// result against the CPU reference.
int main(int argc, char **argv) {

    DataType *hostA;       // The A matrix
    DataType *hostB;       // The B matrix
    DataType *hostC;       // The output C matrix
    DataType *resultRef;   // The reference result
    DataType *deviceA;
    DataType *deviceB;
    DataType *deviceC;
    int numARows;          // number of rows in the matrix A
    int numAColumns;       // number of columns in the matrix A
    int numBRows;          // number of rows in the matrix B
    int numBColumns;       // number of columns in the matrix B
    int numCRows;
    int numCColumns;
    double start,end,duration;

    //@@ Insert code below to read in numARows, numAColumns, numBRows, numBColumns from args
    if (argc != 5) {
        printf("argument doesn't match");
        exit(EXIT_FAILURE);
    }

    numARows = atoi(argv[1]);
    numAColumns = atoi(argv[2]);
    numBRows = atoi(argv[3]);
    numBColumns = atoi(argv[4]);

    // atoi() yields 0 for non-numeric input; reject empty or negative shapes
    // before they reach the allocators or the grid-size computation.
    if (numARows <= 0 || numAColumns <= 0 || numBRows <= 0 || numBColumns <= 0) {
        fprintf(stderr, "All matrix dimensions must be positive integers\n");
        exit(EXIT_FAILURE);
    }

    if (numAColumns != numBRows) {
        printf("Dimension doesn't match!");
        exit(EXIT_FAILURE);
    }

    numCRows = numARows;
    numCColumns = numBColumns;

    printf("Input matrix dim (%d x %d) (%d x %d) (%d x %d)\n", numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

    // Byte sizes are computed in size_t so that rows * columns cannot overflow int.
    const size_t bytesA = (size_t)numARows * (size_t)numAColumns * sizeof(DataType);
    const size_t bytesB = (size_t)numBRows * (size_t)numBColumns * sizeof(DataType);
    const size_t bytesC = (size_t)numCRows * (size_t)numCColumns * sizeof(DataType);

    //@@ Insert code below to allocate Host memory for input and output
    hostA = (DataType *)malloc(bytesA);
    hostB = (DataType *)malloc(bytesB);
    hostC = (DataType *)malloc(bytesC);
    resultRef = (DataType *)malloc(bytesC);
    if (hostA == NULL || hostB == NULL || hostC == NULL || resultRef == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    //@@ Insert code below to initialize hostA and hostB to random numbers, and create reference result in CPU
    initializeMatrix(hostA, numARows, numAColumns);
    initializeMatrix(hostB, numBRows, numBColumns);
    matrixMultiplicationCPU(hostA, hostB, resultRef, numARows, numAColumns, numBColumns);

    //@@ Insert code below to allocate GPU memory here
    checkCuda(cudaMalloc((void **)&deviceA, bytesA), "cudaMalloc(deviceA)");
    checkCuda(cudaMalloc((void **)&deviceB, bytesB), "cudaMalloc(deviceB)");
    checkCuda(cudaMalloc((void **)&deviceC, bytesC), "cudaMalloc(deviceC)");

    //@@ Insert code to below to Copy memory to the GPU here
    start = CPUtimer();
    checkCuda(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A, H2D)");
    checkCuda(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B, H2D)");
    end = CPUtimer();
    duration = end - start;
    printf("copy time(H2D): %f.\n", duration);


    //@@ Initialize the grid and block dimensions here
    // 16 x 16 = 256 threads per block. The previous 64 x 64 (4096 threads)
    // exceeded the 1024 threads-per-block limit, so the launch failed and
    // deviceC was never written.
    dim3 blockDim(16, 16);
    // Round up so partial tiles at the right/bottom edges are still covered;
    // the kernel bounds-checks the extra threads.
    dim3 gridDim((numBColumns + blockDim.x - 1) / blockDim.x, (numARows + blockDim.y - 1) / blockDim.y);

    //@@ Launch the GPU Kernel here
    start = CPUtimer();
    gemm<<<gridDim, blockDim>>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "gemm kernel launch");
    // Wait for the kernel to finish so the host timer covers its execution,
    // and surface any error raised while it ran.
    checkCuda(cudaDeviceSynchronize(), "gemm kernel execution");
    end = CPUtimer();
    duration = end - start;
    printf("kernel time: %f.\n", duration);

    //@@ Copy the GPU memory back to the CPU here
    start = CPUtimer();
    checkCuda(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(C, D2H)");
    end = CPUtimer();
    duration = end - start;
    printf("copy time(D2H): %f.\n", duration);

    //@@ Insert code below to compare the output with the reference
    if(areMatricesEqual(resultRef, hostC, numCRows, numCColumns)){
      printf("The result is euqal");
    }else{
      printf("The result is not euqal");
    }

    //@@ Free the GPU memory here
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    //@@ Free the CPU memory here
    free(hostA);
    free(hostB);
    free(hostC);
    free(resultRef);

    return 0;
}




Overwriting vectorMultiplication.cu


In [None]:
!nvcc vectorMultiplication.cu
!ls

a.out  vectorMultiplication.cu


In [None]:
!./a.out 20 10 10 20

Input matrix dim (20 x 10) (10 x 20) (20 x 20)
copy time(H2D): 0.000276.
kernel time: 0.000036.
copy time(D2H): 0.000022.
The result is euqal

In [None]:
!./a.out 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
copy time(H2D): 0.000575.
kernel time: 0.000092.
copy time(D2H): 0.000131.
The result is euqal

In [None]:
!./a.out 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
copy time(H2D): 0.008174.
kernel time: 0.047718.
copy time(D2H): 0.011278.
The result is euqal

In [None]:
!ncu --set default --metrics sm__warps_active.avg.pct_of_peak_sustained_active ./a.out 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
==PROF== Connected to process 6931 (/content/drive/MyDrive/DD2360/Assignment2/question2/a.out)
copy time(H2D): 0.008287.
==PROF== Profiling "gemm" - 0: 0%....50%....100% - 8 passes
kernel time: 0.670145.
copy time(D2H): 0.012505.
==PROF== Disconnected from process 6931
The result is euqal[6931] a.out@127.0.0.1
  gemm(double *, double *, double *, int, int, int, int), 2023-Nov-26 16:04:25, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    sm__warps_active.avg.pct_of_peak_sustained_active                                    %                          99.18
    ---------------------------------------------------------------------- --------------- ------------------------------

    Section: GPU Speed Of Light Throughput
    ---------------------------------------------------------------------- ---

In [None]:
!./a.out 16 16 16 16

Input matrix dim (16 x 16) (16 x 16) (16 x 16)
copy time(H2D): 0.000324.
kernel time: 0.000031.
copy time(D2H): 0.000018.
The result is euqal

In [None]:
!./a.out 32 32 32 32

Input matrix dim (32 x 32) (32 x 32) (32 x 32)
copy time(H2D): 0.000292.
kernel time: 0.000036.
copy time(D2H): 0.000023.
The result is euqal

In [None]:
!./a.out 64 64 64 64

Input matrix dim (64 x 64) (64 x 64) (64 x 64)
copy time(H2D): 0.000755.
kernel time: 0.000054.
copy time(D2H): 0.000063.
The result is euqal

In [None]:
!./a.out 256 256 256 256

Input matrix dim (256 x 256) (256 x 256) (256 x 256)
copy time(H2D): 0.000591.
kernel time: 0.000448.
copy time(D2H): 0.000471.
The result is euqal

In [None]:
!./a.out 512 512 512 512

Input matrix dim (512 x 512) (512 x 512) (512 x 512)
copy time(H2D): 0.001356.
kernel time: 0.000071.
copy time(D2H): 0.001617.
matrix1: 135.774959,matrix2: 0.000000The result is not euqal