In [1]:
!/usr/local/cuda/bin/nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin
!cuda-install-samples-11.2.sh ~ & cd /root/NVIDIA_CUDA-11.2_Samples/
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-7hdt72o6
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-7hdt72o6
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=ee5132bb214ec78a626ed40f2dcc0bc16487f304d9eab641881fcedbc075cb51
  Stored in directory: /tmp/pip-ephem-wheel-cache-av9tg_vm/wheels/

In [6]:
%%cuda --name my_curand.cu
#include <cstdlib>
#include <curand.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <random>

#define IDX2C(i,j,ld) (((j)*(ld))+(i))

void gpu_blas_mmul(const double *A, const double *B, double *C, const int m, const int k, const int n) {
	int lda=m,ldb=k,ldc=m;
	const double alf = 1;
	const double bet = 0;
	const double *alpha = &alf;
	const double *beta = &bet;

	cublasHandle_t handle;
	cublasCreate(&handle);

	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

	cublasDestroy(handle);
}

void print_matrix(double* matrix, int rows, int cols) {
	for(int i = 0; i < 3; ++i){
		for(int j = 0; j < 3; ++j){
			printf("%f ", matrix[IDX2C(i, j, 3)]);
		}
		printf("\n");
	}
	printf("\n");

}

void GPU_fill_rand(double* A, int nr_rows_A, int nr_cols_A) {
    curandGenerator_t prng;
    curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
    curandGenerateUniformDouble(prng, A, nr_rows_A * nr_cols_A);
}

void consistent(const double* A, const double* B, double*C, const int m, const int k, const int n) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            C[IDX2C(i, j, n)] = 0.0;
            for (int r = 0; r < n; ++r) {
                C[IDX2C(i, j, n)] += A[IDX2C(i, r, k)] * B[IDX2C(r, j, n)];
            }
        }
    }
}


int main() {
		for (int n = 50;n<=2000;n*=2){
				int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;

		nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = n;
		double *h_A = (double *)malloc(nr_rows_A * nr_cols_A * sizeof(double));
		double *h_B = (double *)malloc(nr_rows_B * nr_cols_B * sizeof(double));
		double *h_C = (double *)malloc(nr_rows_C * nr_cols_C * sizeof(double));

		double *d_A, *d_B, *d_C;
		cudaMalloc(&d_A, nr_rows_A * nr_cols_A * sizeof(double));
		cudaMalloc(&d_B, nr_rows_B * nr_cols_B * sizeof(double));
		cudaMalloc(&d_C, nr_rows_C * nr_cols_C * sizeof(double));

		for (int i = 0; i < nr_rows_A * nr_rows_A; i++) {
				h_A[i] = i;
				h_B[i] = i+1;
			}
		cudaMemcpy(d_A, h_A, nr_rows_A * nr_cols_A * sizeof(double), cudaMemcpyHostToDevice);
		cudaMemcpy(d_B, h_B, nr_rows_B * nr_cols_B * sizeof(double), cudaMemcpyHostToDevice);

		srand(time(0));
		double begin1 = clock();
		gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);
		double end1 = (clock() - begin1) / CLOCKS_PER_SEC;
		cudaMemcpy(h_C, d_C, nr_rows_C * nr_cols_C * sizeof(double), cudaMemcpyDeviceToHost);
		double *cpu_C = (double *)malloc(nr_rows_C * nr_cols_C * sizeof(double));
		double begin2 = clock();
		consistent(h_A, h_B, cpu_C, nr_rows_A, nr_cols_A, nr_cols_B);
		double end2 = (clock() - begin2) / CLOCKS_PER_SEC;

		printf("\nn = %d\n", n);
		printf("A:\n");
		print_matrix(h_A, nr_rows_A, nr_cols_A);
		printf("B:\n");
		print_matrix(h_B, nr_rows_B, nr_cols_B);
		printf("C(gpu):\n");
		print_matrix(h_C, nr_rows_C, nr_cols_C);
		printf("Time %f\n ", end1);
		printf("C(cpu):\n");
		print_matrix(cpu_C, nr_rows_C, nr_cols_C);
		printf("Time %f\n ", end2);
		cudaFree(d_A);
		cudaFree(d_B);
		cudaFree(d_C);
		free(h_A);
		free(h_B);
		free(h_C);
		free(cpu_C);
		}
		return 0;
}


'File written in /content/src/my_curand.cu'

In [7]:
!nvcc -o /content/src/my_curand /content/src/my_curand.cu -lcurand -lcublas
!/content/src/my_curand


n = 50
A:
0.000000 3.000000 6.000000 
1.000000 4.000000 7.000000 
2.000000 5.000000 8.000000 

B:
1.000000 4.000000 7.000000 
2.000000 5.000000 8.000000 
3.000000 6.000000 9.000000 

C(gpu):
2082500.000000 2086325.000000 2090150.000000 
2083775.000000 2087600.000000 2091425.000000 
2085050.000000 2088875.000000 2092700.000000 

Time 0.565709
 C(cpu):
2082500.000000 2086325.000000 2090150.000000 
2083775.000000 2087600.000000 2091425.000000 
2085050.000000 2088875.000000 2092700.000000 

Time 0.000591
 
n = 100
A:
0.000000 3.000000 6.000000 
1.000000 4.000000 7.000000 
2.000000 5.000000 8.000000 

B:
1.000000 4.000000 7.000000 
2.000000 5.000000 8.000000 
3.000000 6.000000 9.000000 

C(gpu):
33330000.000000 33345150.000000 33360300.000000 
33335050.000000 33350200.000000 33365350.000000 
33340100.000000 33355250.000000 33370400.000000 

Time 0.000623
 C(cpu):
33330000.000000 33345150.000000 33360300.000000 
33335050.000000 33350200.000000 33365350.000000 
33340100.000000 33355250.00000