In [None]:
# How to use basic profiling tools

In [None]:
%%writefile test.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <unistd.h>

// Minimal kernel used purely as a profiling target: every thread spins
// through a one-iteration loop with an empty body and touches no memory.
__global__ void profiler()
{
    int step = 0;
    while (step < 1) {
        ++step;
    }
}

// Host entry point for the profiling demo.
//
// Launches the `profiler` kernel once with a 40-block x 1024-thread
// configuration, checks the launch, pauses for one second, then waits for the
// device to finish.
//
// Returns 0 on success and -1 if the launch or the synchronization reports a
// CUDA error (diagnostics go to stderr).
int main() {
    printf("Starting test on T4 GPU...\n");

    profiler<<<dim3(40, 1, 1), dim3(1024, 1, 1)>>>();

    // A kernel launch returns no status itself; cudaGetLastError() is how a
    // rejected launch (e.g. an invalid configuration) is detected.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Error launching profiler kernel: %s\n", cudaGetErrorString(err));
        return -1;
    }

    // NOTE(review): the purpose of this pause is not evident from the code;
    // kept so the program's timing is unchanged.
    sleep(1);

    // Errors raised while the kernel executes are reported by the
    // synchronizing call, so a separate post-sleep cudaGetLastError() check
    // is unnecessary. A failure here must produce a non-zero exit status.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Sync error: %s\n", cudaGetErrorString(err));
        return -1;
    }

    return 0;
}


Writing test.cu


In [None]:
# Compile test.cu for the sm_75 architecture, then profile the resulting
# binary with the Nsight Compute CLI using the "full" section set.
# NOTE(review): "--set full" presumably already contains the SchedulerStats and
# WarpStateStats sections, which would make the two explicit --section flags
# redundant — confirm against the ncu documentation before removing them.
!nvcc -arch=sm_75 -o test test.cu
!ncu --set full --section SchedulerStats --section WarpStateStats ./test

Starting test on T4 GPU...
==PROF== Connected to process 325 (/content/test)
==PROF== Profiling "profiler()" - 0: 0%....50%....100% - 30 passes
==PROF== Disconnected from process 325
[325] test@127.0.0.1
  profiler() (40, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.80
    SM Frequency                    Mhz       572.27
    Elapsed Cycles                cycle        1,465
    Memory Throughput                 %         0.96
    DRAM Throughput                   %         0.35
    Duration                         us         2.56
    L1/TEX Cache Throughput           %         1.95
    L2 Cache Throughput               %         0.96
    SM Active Cycles              cycle       307.95
    Compute (SM) Throughput           %  

In [None]:
# Profile again, this time collecting only the WarpStateStats and
# SchedulerStats sections and printing the "details" page. The recorded output
# shows this narrower run taking 8 passes, versus 30 for the "--set full" run.
!ncu --section WarpStateStats --section SchedulerStats --page details ./test

Starting test on T4 GPU...
==PROF== Connected to process 1924 (/content/test)
==PROF== Profiling "profiler()" - 0: 0%....50%....100% - 8 passes
==PROF== Disconnected from process 1924
[1924] test@127.0.0.1
  profiler() (40, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: Scheduler Statistics
    ---------------------------- ----------- ------------
    Metric Name                  Metric Unit Metric Value
    ---------------------------- ----------- ------------
    One or More Eligible                   %        11.23
    Issued Warp Per Scheduler                        0.11
    No Eligible                            %        88.77
    Active Warps Per Scheduler          warp         6.88
    Eligible Warps Per Scheduler        warp         0.13
    ---------------------------- ----------- ------------

    OPT   Est. Local Speedup: 88.77%                                                                                    
          Every scheduler is capable of 