In [20]:
!nvidia-smi

Tue Oct  8 05:47:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [21]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [22]:
!git clone https://github.com/NVIDIA/cuda-samples.git

fatal: destination path 'cuda-samples' already exists and is not an empty directory.


In [23]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && make

make: Nothing to be done for 'all'.


In [24]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && ls
!cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery

deviceQuery	 deviceQuery_vs2017.sln      deviceQuery_vs2019.vcxproj  Makefile
deviceQuery.cpp  deviceQuery_vs2017.vcxproj  deviceQuery_vs2022.sln	 NsightEclipse.xml
deviceQuery.o	 deviceQuery_vs2019.sln      deviceQuery_vs2022.vcxproj  README.md
cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "Tesla T4"
  CUDA Driver Version / Runtime Version          12.2 / 12.2
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 15102 MBytes (15835660288 bytes)
  (040) Multiprocessors, (064) CUDA Cores/MP:    2560 CUDA Cores
  GPU Max Clock rate:                            1590 MHz (1.59 GHz)
  Memory Clock rate:                             5001 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 4194304 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=

In [25]:
!pip install nvcc4jupyter



In [26]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [27]:
%%cuda

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

__device__ long long factCalc_gpu(int n) {
    if (n == 0) return 1;
    long long fact = 1;
    for (int i = 1; i <= n; i++) {
        fact *= i;
    }
    return fact;
}

__device__ float sinApprox_gpu(float x, int p) {
    float res = 0.0;
    for (int i = 0; i < p; i++) {
        int exp = 2 * i + 1;
        float term = powf(-1, i) * powf(x, exp) / factCalc_gpu(exp);
        res += term;
    }
    return res;
}

__global__ void gpu_findSin(float* arr, float* res, int N, int p) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        res[i] = sinApprox_gpu(arr[i], p);
    }
}

void testTiming(int N, int p) {

    float *arr, *result, *d_arr, *d_result;
    size_t size = N * sizeof(float);

    arr = (float *)malloc(size);
    result = (float *)malloc(size);

    for (int i = 0; i < N; i++) {
        arr[i] = (float)i / N;
    }

    cudaMalloc((void **)&d_arr, size);
    cudaMalloc((void **)&d_result, size);

    cudaMemcpy(d_arr, arr, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    gpu_findSin<<<blocksPerGrid, threadsPerBlock>>>(d_arr, d_result, N, p);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    cudaMemcpy(result, d_result, size, cudaMemcpyDeviceToHost);

    float time = 0;
    cudaEventElapsedTime(&time, start, stop);
    time /= 1000;

    printf("%d\t%d\t%f\n", N, p, time);

    cudaFree(d_arr);
    cudaFree(d_result);
    free(arr);
    free(result);
}

int main() {
    int min_p = 3, max_p = 100;
    int min_N = 1 << 2;
    int max_N = 1 << 15;

    printf("N\tp\tTime\n");
    for (int p = min_p; p <= max_p; p += 10) {
        for (int N = min_N; N <= max_N; N *= 2) {
            testTiming(N, p);
        }
    }

    return 0;
}

N	p	Time
4	3	0.000234
8	3	0.000016
16	3	0.000013
32	3	0.000012
64	3	0.000012
128	3	0.000012
256	3	0.000013
512	3	0.000012
1024	3	0.000012
2048	3	0.000013
4096	3	0.000012
8192	3	0.000013
16384	3	0.000010
32768	3	0.000017
4	13	0.000032
8	13	0.000031
16	13	0.000031
32	13	0.000031
64	13	0.000033
128	13	0.000034
256	13	0.000037
512	13	0.000037
1024	13	0.000037
2048	13	0.000036
4096	13	0.000036
8192	13	0.000033
16384	13	0.000035
32768	13	0.000060
4	23	0.000055
8	23	0.000057
16	23	0.000060
32	23	0.000062
64	23	0.000063
128	23	0.000065
256	23	0.000068
512	23	0.000065
1024	23	0.000063
2048	23	0.000063
4096	23	0.000068
8192	23	0.000067
16384	23	0.000077
32768	23	0.000142
4	33	0.000089
8	33	0.000094
16	33	0.000096
32	33	0.000097
64	33	0.000100
128	33	0.000097
256	33	0.000099
512	33	0.000097
1024	33	0.000097
2048	33	0.000099
4096	33	0.000095
8192	33	0.000098
16384	33	0.000136
32768	33	0.000246
4	43	0.000117
8	43	0.000121
16	43	0.000125
32	43	0.000129
64	43	0.000130
128	43	0.000126
256	43	0.000138
