In [15]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6adi1eno
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6adi1eno
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [176]:
%%writefile sequential.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void printMatrix(int *M, int r, int c);

/*
 * Sequential baseline: multiplies an n x k matrix by a k x m matrix of random
 * integers in [0, 99] and prints the CPU time (seconds, two decimals) spent in
 * the triple loop only. Returns 0 on success, EXIT_FAILURE if allocation fails.
 */
int main() {

	const int n = 800;
	const int m = 800;
	const int k = 850;

	clock_t t;

	/* Byte counts are computed in size_t so the products cannot overflow int. */
	const size_t bytesA = (size_t)n * k * sizeof(int);
	const size_t bytesB = (size_t)k * m * sizeof(int);
	const size_t bytesC = (size_t)n * m * sizeof(int);

	/*
	 * The three matrices add up to 8,000,000 bytes. As automatic (stack)
	 * arrays that leaves almost no headroom under a typical 8 MiB stack
	 * limit, so they are allocated on the heap instead.
	 */
	int *a = (int*)malloc(bytesA);
	int *b = (int*)malloc(bytesB);
	int *c = (int*)malloc(bytesC);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "sequential: failed to allocate matrices\n");
		free(a);
		free(b);
		free(c);
		return EXIT_FAILURE;
	}

	for (int i = 0; i < n*k; i++) { a[i] = rand()%100; }
	for (int i = 0; i < k*m; i++) { b[i] = rand()%100; }

	// printMatrix(a, n, k);
	// printMatrix(b, k, m);

	/* Only the multiplication itself is timed; setup and cleanup are excluded. */
	t = clock();
	for (int i = 0; i < n; i++) {
		for (int j = 0; j < m; j++) {
			int cValue = 0;
			for (int l = 0; l < k; l++)
				cValue += a[i*k+l]*b[l*m+j];
			c[i*m+j] = cValue;
		}
	}
	t = clock()-t;
	double duration = (double)t/CLOCKS_PER_SEC;

	// printMatrix(c, n, m);
	printf("%.2f", duration);

	free(a);
	free(b);
	free(c);

	return 0;

}

/*
 * Writes the r x c matrix M to stdout, one row per line, each value followed
 * by a single space. M is read as a flat row-major array.
 */
void printMatrix(int *M, int r, int c) {
	int offset = 0;  /* flat index of the next element to print */
	int row = 0;
	while (row < r) {
		for (int col = 0; col < c; ++col, ++offset)
			printf("%d ", M[offset]);
		printf("\n");
		++row;
	}
}


Overwriting sequential.c


In [177]:
!gcc -o sequential sequential.c
!./sequential

2.07

In [216]:
%%cuda --name CUDA.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

void printMatrix(int *M, int r, int c);

// Naive matrix product: each thread computes one element of c.
//   a : n x k input, indexed row-major (a[row*k + l])
//   b : k x m input, indexed row-major (b[l*m + col])
//   c : n x m output, indexed row-major (c[row*m + col])
// The x dimension of the launch maps to columns of c and the y dimension to
// rows; threads that fall outside the n x m output do nothing.
__global__ void matrixMultiplicationKernel(int *a, int *b, int *c, int n, int m, int k) {
	const int col = threadIdx.x + blockDim.x * blockIdx.x;
	const int row = threadIdx.y + blockDim.y * blockIdx.y;

	// Guard against the partial blocks at the right/bottom edge of the grid.
	if (row >= n || col >= m)
		return;

	const int *aRow = a + row * k;  // start of this thread's row of a
	int acc = 0;
	for (int step = 0; step < k; ++step)
		acc += aRow[step] * b[step * m + col];

	c[row * m + col] = acc;
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Multiplies an n x k matrix by a k x m matrix of random integers in [0, 99]
// on the GPU with the naive kernel and prints the elapsed clock() time in
// seconds (two decimals). The timed region covers device allocation, both
// host-to-device copies, the kernel, the copy back and all cleanup.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main() {

	const int n = 800;
	const int m = 800;
	const int k = 850;

	// Byte counts are computed in size_t so the products cannot overflow int.
	const size_t bytesA = (size_t)n * k * sizeof(int);
	const size_t bytesB = (size_t)k * m * sizeof(int);
	const size_t bytesC = (size_t)n * m * sizeof(int);

	clock_t t;

	int *ha = (int*)malloc(bytesA);
	int *hb = (int*)malloc(bytesB);
	int *hc = (int*)malloc(bytesC);
	if (ha == NULL || hb == NULL || hc == NULL) {
		fprintf(stderr, "CUDA: failed to allocate host matrices\n");
		free(ha);
		free(hb);
		free(hc);
		return EXIT_FAILURE;
	}

	for (int i = 0; i < n*k; i++) { ha[i] = rand()%100; }
	// for (int i = 0; i < n*k; i++) { ha[i] = 1; }
	for (int i = 0; i < k*m; i++) { hb[i] = rand()%100; }
	// for (int i = 0; i < k*m; i++) { hb[i] = 1; }

	// printMatrix(ha, n, k);
	// printMatrix(hb, k, m);

	t = clock();

	int *da = NULL; int *db = NULL; int *dc = NULL;

	checkCuda(cudaMalloc((void**)&da, bytesA), "cudaMalloc(da)");
	checkCuda(cudaMalloc((void**)&db, bytesB), "cudaMalloc(db)");
	checkCuda(cudaMalloc((void**)&dc, bytesC), "cudaMalloc(dc)");

	checkCuda(cudaMemcpy(da, ha, bytesA, cudaMemcpyHostToDevice), "copy of a to device");
	checkCuda(cudaMemcpy(db, hb, bytesB, cudaMemcpyHostToDevice), "copy of b to device");

	// Ceil-divide so the grid covers sizes that are not multiples of the block.
	dim3 block(16, 16);
	dim3 grid((m + block.x - 1) / block.x, (n + block.y - 1) / block.y);

	matrixMultiplicationKernel<<<grid, block>>>(da, db, dc, n, m, k);
	// A launch does not return a status; configuration errors show up here.
	checkCuda(cudaGetLastError(), "kernel launch");

	// Blocking copy: also waits for the kernel and reports execution errors.
	checkCuda(cudaMemcpy(hc, dc, bytesC, cudaMemcpyDeviceToHost), "copy of c to host");

	// printMatrix(hc, n, m);

	checkCuda(cudaFree(da), "cudaFree(da)");
	checkCuda(cudaFree(db), "cudaFree(db)");
	checkCuda(cudaFree(dc), "cudaFree(dc)");
	free(ha);
	free(hb);
	free(hc);

	t = clock()-t;
	double duration = (double)t/CLOCKS_PER_SEC;

	printf("%.2f", duration);

	return 0;

}

// Prints the r x c matrix M (flat, row-major) to stdout: one line per row,
// every value followed by a single space.
void printMatrix(int *M, int r, int c) {
	for (int row = 0; row < r; ++row) {
		int col = 0;
		while (col < c) {
			printf("%d ", M[row * c + col]);
			++col;
		}
		printf("\n");
	}
}


'File written in /content/src/CUDA.cu'

In [217]:
!nvcc -o src/CUDA src/CUDA.cu
!./src/CUDA

0.10

In [196]:
%%cuda --name CUDA_TILED.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define tile_size 16

void printMatrix(int *M, int r, int c);

// Tiled matrix product using shared memory: each thread computes one element
// of c, and each block stages tile_size x tile_size pieces of a and b in
// shared memory before multiplying them.
//   a : n x k input, indexed row-major
//   b : k x m input, indexed row-major
//   c : n x m output, indexed row-major
// The indexing uses tile_size in place of blockDim, so the kernel assumes it
// is launched with tile_size x tile_size thread blocks.
__global__ void matrixMultiplicationKernel(int *a, int *b, int *c, int n, int m, int k) {
    __shared__ int tileA[tile_size][tile_size];
    __shared__ int tileB[tile_size][tile_size];

    const int localCol = threadIdx.x;
    const int localRow = threadIdx.y;
    const int row = localRow + blockIdx.y * tile_size;
    const int col = localCol + blockIdx.x * tile_size;

    // Number of tiles needed to cover the shared dimension k (rounded up).
    const int tileCount = (k + tile_size - 1) / tile_size;

    int acc = 0;
    for (int tile = 0; tile < tileCount; ++tile) {
        const int aCol = tile * tile_size + localCol;  // column of a this thread loads
        const int bRow = tile * tile_size + localRow;  // row of b this thread loads

        // Out-of-range positions are filled with 0 so they add nothing to the sum.
        tileA[localRow][localCol] = (row < n && aCol < k) ? a[row * k + aCol] : 0;
        tileB[localRow][localCol] = (col < m && bRow < k) ? b[bRow * m + col] : 0;

        // Every thread must finish loading before any thread reads the tiles.
        __syncthreads();

        for (int step = 0; step < tile_size; ++step)
            acc += tileA[localRow][step] * tileB[step][localCol];

        // Every thread must finish reading before the tiles are overwritten.
        __syncthreads();
    }

    // Only threads that map to a real element of c store a result.
    if (row >= n || col >= m)
        return;
    c[row * m + col] = acc;
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Multiplies an n x k matrix by a k x m matrix (both filled with ones) on the
// GPU with the tiled kernel and prints the elapsed clock() time in seconds
// (two decimals). The timed region covers device allocation, both
// host-to-device copies, the kernel, the copy back and all cleanup.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main() {

	const int n = 800;
	const int m = 800;
	const int k = 850;

	// Byte counts are computed in size_t so the products cannot overflow int.
	const size_t bytesA = (size_t)n * k * sizeof(int);
	const size_t bytesB = (size_t)k * m * sizeof(int);
	const size_t bytesC = (size_t)n * m * sizeof(int);

	clock_t t;

	int *ha = (int*)malloc(bytesA);
	int *hb = (int*)malloc(bytesB);
	int *hc = (int*)malloc(bytesC);
	if (ha == NULL || hb == NULL || hc == NULL) {
		fprintf(stderr, "CUDA_TILED: failed to allocate host matrices\n");
		free(ha);
		free(hb);
		free(hc);
		return EXIT_FAILURE;
	}

	// for (int i = 0; i < n*k; i++) { ha[i] = rand()%100; }
	// for (int i = 0; i < k*m; i++) { hb[i] = rand()%100; }
	for (int i = 0; i < n*k; i++) { ha[i] = 1; }
	for (int i = 0; i < k*m; i++) { hb[i] = 1; }

	// printMatrix(ha, n, k);
	// printMatrix(hb, k, m);

	t = clock();

	int *da = NULL; int *db = NULL; int *dc = NULL;

	checkCuda(cudaMalloc((void**)&da, bytesA), "cudaMalloc(da)");
	checkCuda(cudaMalloc((void**)&db, bytesB), "cudaMalloc(db)");
	checkCuda(cudaMalloc((void**)&dc, bytesC), "cudaMalloc(dc)");

	checkCuda(cudaMemcpy(da, ha, bytesA, cudaMemcpyHostToDevice), "copy of a to device");
	checkCuda(cudaMemcpy(db, hb, bytesB, cudaMemcpyHostToDevice), "copy of b to device");

	// The kernel indexes its shared tiles with tile_size, so the block shape
	// is derived from the same macro rather than repeated as a literal.
	dim3 block(tile_size, tile_size);
	dim3 grid((m + tile_size - 1) / tile_size, (n + tile_size - 1) / tile_size);

	matrixMultiplicationKernel<<<grid, block>>>(da, db, dc, n, m, k);
	// A launch does not return a status; configuration errors show up here.
	checkCuda(cudaGetLastError(), "kernel launch");

	// Blocking copy: also waits for the kernel and reports execution errors.
	checkCuda(cudaMemcpy(hc, dc, bytesC, cudaMemcpyDeviceToHost), "copy of c to host");

	// printMatrix(hc, n, m);

	checkCuda(cudaFree(da), "cudaFree(da)");
	checkCuda(cudaFree(db), "cudaFree(db)");
	checkCuda(cudaFree(dc), "cudaFree(dc)");
	free(ha);
	free(hb);
	free(hc);

	t = clock()-t;
	double duration = (double)t/CLOCKS_PER_SEC;

	printf("%.2f", duration);

	return 0;

}

// Dumps the r x c matrix M to stdout. M is read as a flat row-major array;
// each value is followed by one space and each row ends with a newline.
void printMatrix(int *M, int r, int c) {
	int next = 0;  // flat index of the next element to print
	for (int row = 0; row < r; ++row) {
		for (int col = 0; col < c; ++col) {
			printf("%d ", M[next]);
			++next;
		}
		printf("\n");
	}
}

'File written in /content/src/CUDA_TILED.cu'

In [199]:
!nvcc -o src/CUDA_TILED src/CUDA_TILED.cu
!./src/CUDA_TILED

0.11