<a href="https://colab.research.google.com/github/mentalMint/gpu-programming-fit/blob/main/Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
%%writefile cuda_sin_test.cu

#include <iostream>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>

#define PI M_PI

const size_t N = 1000000000ULL;

template <typename T>
__global__ void kernel_sin(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = sin((T)(i % 360) * PI / 180.0);
}

template <typename T>
__global__ void kernel_sinf(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = sinf((T)(i % 360) * (T)PI / 180.0f);
}

template <typename T>
__global__ void kernel___sinf(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = __sinf((T)(i % 360) * (T)PI / 180.0f);
}

template<typename T>
T compute_error(const T* arr) {
    T err = 0.0;
    for (size_t i = 0; i < N; ++i) {
        T ref = sin((i % 360) * PI / 180.0);
        err += fabs(ref - (T)arr[i]);
    }
    return err / N;
}

template<typename T>
void test() {
    T *d_arr, *h_arr;
    h_arr = new T[N];
    cudaMalloc(&d_arr, N * sizeof(T));

    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);

    {
        std::cout << "Запуск sin() ..." << std::endl;
        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
        kernel_sin<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Время: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Ошибка: " << err << std::endl << std::endl;
    }

    {
        std::cout << "Запуск sinf() ..." << std::endl;
        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
        kernel_sinf<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Время: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Ошибка: " << err << std::endl << std::endl;
    }

    {
        std::cout << "Запуск __sinf() ..." << std::endl;
        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
        kernel___sinf<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Время: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Ошибка: " << err << std::endl << std::endl;
    }

    cudaFree(d_arr);
    delete[] h_arr;
}

int main() {
    std::cout << "CUDA sin/sinf/__sinf experiment" << std::endl;
    std::cout << "Array size: " << N << std::endl;

    std::cout << "\n=== FLOAT ===" << std::endl;
    test<float>();
    std::cout << "\n=== DOUBLE ===" << std::endl;
    test<double>();

    return 0;
}

Overwriting cuda_sin_test.cu


In [31]:
!nvcc -arch=sm_75 --use_fast_math cuda_sin_test.cu -o cuda_sin_test
!./cuda_sin_test

CUDA sin/sinf/__sinf experiment
Array size: 1000000000

=== FLOAT ===
Запуск sin() ...
Время: 420794[µs]
Ошибка: 0

Запуск sinf() ...
Время: 16804[µs]
Ошибка: 1.6e-08

Запуск __sinf() ...
Время: 16820[µs]
Ошибка: 1.6e-08


=== DOUBLE ===
Запуск sin() ...
Время: 309773[µs]
Ошибка: 8.77963e-18

Запуск sinf() ...
Время: 256302[µs]
Ошибка: 1.30149e-07

Запуск __sinf() ...
Время: 263361[µs]
Ошибка: 1.30149e-07



In [32]:
!nvidia-smi

Tue Oct 14 04:58:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P0             34W /   70W |       0MiB /  15360MiB |     62%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [33]:
%%writefile cuda_specs.cu

#include <iostream>
#include <cuda_runtime.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount); // Get the number of CUDA-capable devices

    if (deviceCount == 0) {
        std::cerr << "No CUDA devices found." << std::endl;
        return 1;
    }

    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp prop{}; // Initialize a cudaDeviceProp structure
        cudaGetDeviceProperties(&prop, i); // Get properties for device 'i'

        std::cout << "--- Device Number: " << i << " ---" << std::endl;
        std::cout << "  Device Name: " << prop.name << std::endl;
        std::cout << "  Compute Capability: " << prop.major << "." << prop.minor << std::endl;
        std::cout << "  Total Global Memory (bytes): " << prop.totalGlobalMem << std::endl;
        std::cout << "  Max Threads per Block: " << prop.maxThreadsPerBlock << std::endl;
        std::cout << "  Multiprocessor Count: " << prop.multiProcessorCount << std::endl;
        std::cout << "  Clock Rate (kHz): " << prop.clockRate << std::endl;
        std::cout << "  Shared Memory per Block (bytes): " << prop.sharedMemPerBlock << std::endl;
        std::cout << "  Warp Size: " << prop.warpSize << std::endl;
        std::cout << "  ECC Enabled: " << (prop.ECCEnabled ? "Yes" : "No") << std::endl;
        std::cout << std::endl;
    }

    return 0;
}

Overwriting cuda_specs.cu


In [34]:
!nvcc -arch=sm_75 cuda_specs.cu -o cuda_specs
!./cuda_specs

--- Device Number: 0 ---
  Device Name: Tesla T4
  Compute Capability: 7.5
  Total Global Memory (bytes): 15828320256
  Max Threads per Block: 1024
  Multiprocessor Count: 40
  Clock Rate (kHz): 1590000
  Shared Memory per Block (bytes): 49152
  Warp Size: 32
  ECC Enabled: Yes

