<a href="https://colab.research.google.com/github/manojg24/A-complete-optimizing-compiler/blob/main/GPU_Kernel_Optimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi
!nvcc --version


Sat Oct 11 14:09:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   53C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
%%writefile vector_add.cu
#include <stdio.h>

__global__ void vecAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main() {
    int N = 1 << 20; //1 mil elements
    size_t size = N * sizeof(float);

    float *hA, *hB, *hC, *dA, *dB, *dC;
    hA = (float*)malloc(size);
    hB = (float*)malloc(size);
    hC = (float*)malloc(size);

    for (int i=0; i<N; i++) {
      hA[i] = i;
      hB[i] = i;
    }

    cudaMalloc(&dA, size); cudaMalloc(&dB, size); cudaMalloc(&dC, size);
    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    vecAdd<<<numBlocks, blockSize>>>(dA, dB, dC, N);

    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < 5; i++)
        std::cout << h_C[i] << " ";
    std::cout << "..." << std::endl;

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    free(hA);
    free(hB);
    free(hC);
    printf("Done.\\n");
}


Writing vector_add.cu


In [7]:
!nvcc vector_add.cu -o vector_add
!./vector_add

Done.\n

In [9]:
!apt install -y nvidia-nsight-compute

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package nvidia-nsight-compute[0m


In [11]:
!nvprof --version
!nvprof ./vector_add

nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2024 NVIDIA Corporation
Release version 12.5.82 (21)
==8072== NVPROF is profiling process 8072, command: ./vector_add
Done.\n==8072== Profiling application: ./vector_add
==8072== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   50.84%  1.5683ms         2  784.17us  774.54us  793.81us  [CUDA memcpy HtoD]
                   49.16%  1.5164ms         1  1.5164ms  1.5164ms  1.5164ms  [CUDA memcpy DtoH]
      API calls:   87.38%  92.467ms         3  30.822ms  105.61us  92.125ms  cudaMalloc
                    7.37%  7.7950ms         1  7.7950ms  7.7950ms  7.7950ms  cudaLaunchKernel
                    4.55%  4.8149ms         3  1.6050ms  969.73us  2.8368ms  cudaMemcpy
                    0.51%  536.77us         3  178.92us  135.28us  200.79us  cudaFree
                    0.17%  176.44us       114  1.5470us     107ns  89.309us  cuDeviceGetAttribute
        

In [24]:
%%writefile vector_add.cu
#include <stdio.h>

__global__ void vectorAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main() {
    int N = 1 << 20; // 1 million elements
    int bytes_transferred = N * 4 *3;
    size_t size = N * sizeof(float);

    // Host allocation
    float *hA = (float*)malloc(size);
    float *hB = (float*)malloc(size);
    float *hC = (float*)malloc(size);
    for (int i = 0; i < N; i++) { hA[i] = 1.0f; hB[i] = 2.0f; }

    // Device allocation
    float *dA, *dB, *dC;
    cudaMalloc(&dA, size); cudaMalloc(&dB, size); cudaMalloc(&dC, size);

    // Copy host → device
    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    // Start timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    vectorAdd<<<blocks, threads>>>(dA, dB, dC, N);
    cudaDeviceSynchronize();

    // Stop timer
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel execution time: %.4f ms\n", milliseconds);

    float throughput_GBps = (bytes_transferred / 1e9) / (milliseconds / 1e3);
    printf("Approx. memory throughput: {throughput_GBps:.2f} GB/s");

    // Copy device → host
    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    // Print first few results
    for (int i = 0; i < 5; i++) printf("%.1f ", hC[i]);
    printf("...\n");

    // Cleanup
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    free(hA); free(hB); free(hC);
    return 0;
}


Overwriting vector_add.cu


In [25]:
!nvcc vector_add.cu -o vector_add
!./vector_add

Kernel execution time: 8.3415 ms
Approx. memory throughput: {throughput_GBps:.2f} GB/s0.0 0.0 0.0 0.0 0.0 ...
