In [1]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime
print("Colab GPU available:", torch.cuda.is_available())
# !nvidia-smi

Colab GPU available: True


In [2]:
!nvidia-smi

Tue Dec  2 11:11:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!ls

sample_data


In [4]:
!cmake --version

cmake version 3.31.10

CMake suite maintained and supported by Kitware (kitware.com/cmake).


# Always re-run from this cell onward after any change in the Git repo

In [5]:
!rm -rf ./*

In [6]:
!ls

# Make a new clone from the git repo and then run it

In [7]:
# Always start from /content so the relative paths below resolve the same way on every run
%cd /content

# Remove any old checkout so a re-run does not clone into (or nest inside) a stale copy
!rm -rf GPU_mode

# Clone the repo into ./GPU_mode (note the exact repo name in the URL)
!git clone https://github.com/parthshinde1221/GPU_mode.git

# Enter the repo root; the later cells use paths relative to it (build/, profiles/)
%cd GPU_mode

# Sanity check: you MUST see CMakeLists.txt here, otherwise the cmake configure step has nothing to read
!ls


/content
Cloning into 'GPU_mode'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 92 (delta 48), reused 65 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (92/92), 43.76 KiB | 814.00 KiB/s, done.
Resolving deltas: 100% (48/48), done.
/content/GPU_mode
CMakeLists.txt	matrix_multiplication  reduction		vector_add
include		README.md	       test_colab_server.ipynb


# Building all CUDA kernels

In [8]:
# Configure the project: read the top-level CMakeLists.txt from the current
# directory (-S .) and generate an out-of-source Release build tree in ./build (-B build)
!cmake -S . -B build -DCMAKE_BUILD_TYPE=Release

# Build all configured targets (matmul, vec_add, reduction, ...) with up to 4 parallel jobs
!cmake --build build -j 4


-- The CXX compiler identification is GNU 11.4.0
-- The CUDA compiler identification is NVIDIA 12.5.82 with host compiler GNU 11.4.0
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Adding matmul target: matmul_naive from /content/GPU_mode/matrix_multiplication/matmul_naive.cu
-- Adding matmul target: matmul_tiled from /content/GPU_mode/matrix_multiplication/matmul_tiled.cu
-- Adding vec_add target: vec_add_naive from /content/GPU_mode/vector_add/vec_add_naive.cu
-- Adding vec_add target: vec_add_opt from /content/GPU_mode/vector_add/vec_add_opt.cu
-- Adding reduction target: reduce_naive_add from /con

# Optional: build single kernels

In [9]:
# kernels = ["matmul", "vec_add"]  # target names from each CMakeLists.txt

# for k in kernels:
#     print(f"\n=== Building {k} ===")
#     !cmake --build build --target {k} -j 4


# Running all Kernels

In [10]:
# Global kernels
# kernels = ["reduce_naive_add"]
# kernels = ["vec_add_naive","vec_add_opt"]  # add more later: "softmax", "conv", etc.

In [11]:
kernels = ["matmul_naive", "vec_add_naive","matmul_tiled","vec_add_opt","reduce_naive_add"]  # add more later: "softmax", "conv", etc.

for k in kernels:
    print(f"\n=== Running {k} ===")
    !./build/bin/{k}


=== Running matmul_naive ===
C[0] = 1024 (expected 1024)

=== Running vec_add_naive ===
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)

=== Running matmul_tiled ===
[tiled] C[0] = 1024 (expected 1024)

=== Running vec_add_opt ===
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)

=== Running reduce_naive_add ===
CPU sum: 1.57286e+06 | GPU sum: 1.57286e+06 | diff: 0


# Run compute-sanitizer (memcheck + racecheck) on all kernels

In [12]:
!mkdir -p profiles

In [13]:
!ls

build		matrix_multiplication  reduction
CMakeLists.txt	profiles	       test_colab_server.ipynb
include		README.md	       vector_add


In [14]:
# !which cuda-memcheck
# !which compute-sanitizer

In [15]:
import os

os.makedirs("profiles", exist_ok=True)

tools   = ["memcheck", "racecheck"]
# kernels = ["matmul_naive", "vec_add_naive", "matmul_tiled", "vec_add_opt"]

for k in kernels:
    for t in tools:
        print(f"\n=== compute-sanitizer ({t}) on {k} ===")
        log = f"profiles/{k}_{t}.txt"
        !compute-sanitizer --tool {t} ./build/bin/{k} > {log} 2>&1
        print(f"Saved {log}")
        !tail -n 20 {log}



=== compute-sanitizer (memcheck) on matmul_naive ===
Saved profiles/matmul_naive_memcheck.txt
C[0] = 1024 (expected 1024)

=== compute-sanitizer (racecheck) on matmul_naive ===
Saved profiles/matmul_naive_racecheck.txt
C[0] = 1024 (expected 1024)

=== compute-sanitizer (memcheck) on vec_add_naive ===
Saved profiles/vec_add_naive_memcheck.txt
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)

=== compute-sanitizer (racecheck) on vec_add_naive ===
Saved profiles/vec_add_naive_racecheck.txt
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)

=== compute-sanitizer (memcheck) on matmul_tiled ===
Saved profiles/matmul_tiled_memcheck.txt
[tiled] C[0] = 1024 (expected 1024)

=== compute-sanitizer (racecheck) on matmul_tiled ===
Saved profiles/matmul_tiled_racecheck.txt
[tiled] C[0] = 1024 (expected 1024)

=== compute-sanitizer (memcheck) on vec_add_opt ===
Saved profiles/vec_add_opt_memcheck.txt
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)

=== compute-sanitizer (racecheck) on vec_add_opt ===
Saved p

# Profile all kernels with Nsight Compute (ncu)

In [16]:
!ls

build		matrix_multiplication  reduction
CMakeLists.txt	profiles	       test_colab_server.ipynb
include		README.md	       vector_add


In [17]:
!ls build

bin		CMakeFiles	     Makefile		    reduction
CMakeCache.txt	cmake_install.cmake  matrix_multiplication  vector_add


In [18]:
!ls build/bin

matmul_naive  matmul_tiled  reduce_naive_add  vec_add_naive  vec_add_opt


In [19]:
# from IPython.display import IFrame
# import os

# os.makedirs("profiles", exist_ok=True)

# kernels = ["matmul", "vec_add"]

# for k in kernels:
#     print(f"\n=== Profiling {k} with ncu ===")
#     # NCU_DEFAULTS="" clears any default --export that Colab may set
#     !NCU_DEFAULTS="" ncu -f --set full --export html -o profiles/{k} ./build/bin/{k}
#     display(IFrame(f"profiles/{k}.html", width=1024, height=600))


In [20]:
import os

os.makedirs("profiles", exist_ok=True)

# kernels = ["matmul_naive", "matmul_tiled","vec_add_naive","vec_add_opt"]

# -------- First loop: run profiling & save reports --------
print("=== Running Nsight Compute and saving reports ===")
for k in kernels:
    print(f"\n[1] Profiling {k} with ncu ...")
    !NCU_DEFAULTS="" ncu -f --set full -o profiles/{k} ./build/bin/{k}
    print(f"--> Saved report: profiles/{k}.ncu-rep")
    
    # Print the kernel Duration from the report
    print(f"--> Duration for {k}:")
    !NCU_DEFAULTS="" ncu --import profiles/{k}.ncu-rep --page details | grep "Duration"

print("\nAll reports saved:")
!ls -lh profiles


=== Running Nsight Compute and saving reports ===

[1] Profiling matmul_naive with ncu ...
==PROF== Connected to process 1797 (/content/GPU_mode/build/bin/matmul_naive)
==PROF== Profiling "matmul_kernel" - 0: 0%....50%....100% - 30 passes
C[0] = 1024 (expected 1024)
==PROF== Disconnected from process 1797
==PROF== Report: /content/GPU_mode/profiles/matmul_naive.ncu-rep
--> Saved report: profiles/matmul_naive.ncu-rep
--> Duration for matmul_naive:
    Duration                         ms         2.08

[1] Profiling vec_add_naive with ncu ...
==PROF== Connected to process 1894 (/content/GPU_mode/build/bin/vec_add_naive)
==PROF== Profiling "vec_add_kernel" - 0: 0%....50%....100% - 30 passes
c[0] = 3 (expected 3)
c[N-1] = 3 (expected 3)
==PROF== Disconnected from process 1894
==PROF== Report: /content/GPU_mode/profiles/vec_add_naive.ncu-rep
--> Saved report: profiles/vec_add_naive.ncu-rep
--> Duration for vec_add_naive:
    Duration                         us        49.02

[1] Profiling mat

In [21]:
# -------- Second loop: re-import reports and show details --------
print("\n=== Printing Nsight Compute details for each kernel ===")
for k in kernels:
    print(f"\n[2] Nsight Compute details page for {k}")
    !NCU_DEFAULTS="" ncu --import profiles/{k}.ncu-rep --page details


=== Printing Nsight Compute details for each kernel ===

[2] Nsight Compute details page for matmul_naive
[1797] matmul_naive@127.0.0.1
  matmul_kernel(const float *, const float *, float *, int) (32, 32, 1)x(16, 16, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.99
    SM Frequency                    Mhz       584.98
    Elapsed Cycles                cycle    1,218,168
    Memory Throughput                 %        51.76
    DRAM Throughput                   %         0.98
    Duration                         ms         2.08
    L1/TEX Cache Throughput           %        95.72
    L2 Cache Throughput               %        34.24
    SM Active Cycles              cycle 1,195,125.77
    Compute (SM) Throughput           %        51.76
    ----

In [22]:
!ls

build		matrix_multiplication  reduction
CMakeLists.txt	profiles	       test_colab_server.ipynb
include		README.md	       vector_add


# Create a Zip File

In [23]:
# import os
# import shutil

# # Adjust this if your repo root is different
# repo_root = "/content/GPU_mode"
# profiles_dir = os.path.join(repo_root, "profiles")

# # Make sure the profiles folder exists
# if os.path.isdir(profiles_dir):
#     # Create profiles.zip next to the repo root
#     zip_path = os.path.join(repo_root, "profiles")
#     shutil.make_archive(zip_path, "zip", profiles_dir)
#     print(f"Created ZIP: {zip_path}.zip")
# else:
#     print(f"No profiles folder found at: {profiles_dir}")

In [24]:
import os
import shutil
import datetime

repo_root = "/content/GPU_mode"
profiles_dir = os.path.join(repo_root, "profiles")

# Timestamped archive name so repeated runs never overwrite an earlier ZIP
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
zip_basename = f"profiles_{timestamp}"             # without .zip
zip_path = os.path.join(repo_root, zip_basename)   # /content/GPU_mode/profiles_YYYYMMDD_HHMMSS

# Always bind zip_file: later cells read it, and leaving it unset when the
# profiles folder is missing would raise NameError there (or, on a re-run,
# silently reuse the path of a previous archive).
zip_file = None

if os.path.isdir(profiles_dir):
    # make_archive appends ".zip" itself and returns the path of the archive it
    # wrote: /content/GPU_mode/profiles_YYYYMMDD_HHMMSS.zip
    zip_file = shutil.make_archive(zip_path, "zip", profiles_dir)
    print(f"Created ZIP: {zip_file}")
else:
    print(f"No profiles folder found at: {profiles_dir}")


Created ZIP: /content/GPU_mode/profiles_20251202_111330.zip


In [25]:
!ls

build		       profiles			     test_colab_server.ipynb
CMakeLists.txt	       profiles_20251202_111330.zip  vector_add
include		       README.md
matrix_multiplication  reduction


# File downloader widget (provided by Colab) for the profiler ZIP file

In [26]:
import anywidget
import traitlets
import os
import base64
import mimetypes

class FileDownloader(anywidget.AnyWidget):
    """
    An anywidget that renders a button. When clicked, the kernel reads
    'file_path' from its own disk and sends the content to the browser, which
    saves it as a download.

    The button starts disabled and only enables after a successful handshake
    with the Python kernel, ensuring it doesn't appear active in a dead notebook.

    Custom messages exchanged (dicts keyed by "type"):
      browser -> kernel: "check_connection", "request_download"
      kernel -> browser: "connection_verified", "file_content", "error"

    The whole file is read into memory and sent Base64-encoded in one message,
    so memory use grows with the file size in both the kernel and the browser.
    """

    # The path to the file on the server/local disk that you want to download
    file_path = traitlets.Unicode(help="Path to the file to be downloaded").tag(sync=True)

    # Label for the button
    button_text = traitlets.Unicode("Download File").tag(sync=True)

    _esm = """
    export function render({ model, el }) {
      // How long to wait for a reply from Python before giving the button back
      const REPLY_TIMEOUT_MS = 5000;

      // Create the button element
      const btn = document.createElement("button");
      btn.classList.add("jupyter-widgets", "jupyter-button", "widget-button");
      btn.style.width = "100%";

      // Initial state: Disabled and waiting
      btn.innerText = "Waiting for Kernel...";
      btn.disabled = true;

      // Safety timer for the in-flight download request (null when idle)
      let replyTimer = null;

      // Make the button clickable again and cancel the safety timer, so a
      // stale timer can never re-enable the button during a later request.
      const makeReady = () => {
        if (replyTimer !== null) {
          clearTimeout(replyTimer);
          replyTimer = null;
        }
        btn.innerText = model.get("button_text");
        btn.disabled = false;
      };

      // Update button text if the Python trait changes
      model.on("change:button_text", () => {
        // Only update visually if we are already connected/enabled
        if (!btn.disabled) {
            btn.innerText = model.get("button_text");
        }
      });

      // Handle the click event
      btn.addEventListener("click", () => {
        if (!model.get("file_path")) {
            alert("No file path set in the Python widget!");
            return;
        }

        // Disable button and show loading state
        btn.innerText = "Downloading...";
        btn.disabled = true;

        // Send a request message to the Python backend
        model.send({ type: "request_download" });

        // Timeout safety to restore the button if Python doesn't respond in time
        replyTimer = setTimeout(makeReady, REPLY_TIMEOUT_MS);
      });

      el.appendChild(btn);

      // Listen for messages coming from Python
      model.on("msg:custom", (msg) => {
        if (msg.type === "connection_verified") {
            // HANDSHAKE COMPLETE: Kernel is alive.
            makeReady();
        }
        else if (msg.type === "file_content") {
            // 1. Decode the Base64 payload straight into a typed array
            //    (no intermediate Array of boxed numbers)
            const binary = atob(msg.content);
            const bytes = new Uint8Array(binary.length);
            for (let i = 0; i < binary.length; i++) {
                bytes[i] = binary.charCodeAt(i);
            }
            const blob = new Blob([bytes], { type: msg.mime_type });

            // 2. Create a temporary link to trigger the download
            const url = window.URL.createObjectURL(blob);
            const a = document.createElement("a");
            a.style.display = "none";
            a.href = url;
            a.download = msg.filename;
            document.body.appendChild(a);
            a.click();

            // 3. Cleanup. The URL is revoked on a later tick: revoking it
            //    synchronously can cancel the download the click just started.
            document.body.removeChild(a);
            setTimeout(() => window.URL.revokeObjectURL(url), 0);

            makeReady();

        } else if (msg.type === "error") {
            alert(`Error: ${msg.message}`);
            makeReady();
        }
      });

      // INITIATE HANDSHAKE
      // Send a message to Python to check if the kernel is listening.
      // If the kernel is dead (saved notebook), this message goes nowhere,
      // and the button remains disabled.
      setTimeout(() => {
        model.send({ type: "check_connection" });
      }, 500);
    }
    """

    def __init__(self, file_path=None, **kwargs):
        """
        Create the widget.

        file_path: optional path of the file to offer; it can also be set
            later through the `file_path` trait.
        **kwargs: forwarded unchanged to the anywidget base class.
        """
        super().__init__(**kwargs)
        if file_path:
            self.file_path = file_path

        # Register through the public on_msg API. The handler is deliberately
        # NOT named `_handle_custom_msg`: that name is ipywidgets' internal
        # dispatcher, and overriding it stops every on_msg callback from firing.
        self.on_msg(self._on_frontend_msg)

    def _on_frontend_msg(self, _widget, content, _buffers):
        """
        Callback for when the frontend sends a custom message to Python.

        ipywidgets calls on_msg callbacks as callback(widget, content, buffers);
        `content` is the object the browser passed to model.send().
        """
        # Ignore anything that is not one of our {"type": ...} messages
        if not isinstance(content, dict):
            return

        msg_type = content.get("type")

        if msg_type == "check_connection":
            # Reply to the frontend to confirm we are alive
            self.send({"type": "connection_verified"})

        elif msg_type == "request_download":
            self._process_download()

    def _process_download(self):
        """
        Reads the file from disk and sends it to the frontend.

        Sends a "file_content" message (filename, mime_type, Base64 content)
        on success, or an "error" message describing what went wrong.
        """
        target_path = self.file_path

        # Basic validation
        if not target_path:
            self.send({"type": "error", "message": "File path is not defined."})
            return

        if not os.path.exists(target_path):
            self.send({"type": "error", "message": f"File not found: {target_path}"})
            return

        # A directory (or other non-file) passes exists() but cannot be read
        if not os.path.isfile(target_path):
            self.send({"type": "error", "message": f"Not a regular file: {target_path}"})
            return

        try:
            # Guess the MIME type so the browser handles it correctly
            mime_type, _ = mimetypes.guess_type(target_path)
            if mime_type is None:
                mime_type = 'application/octet-stream'

            # Read and encode the file (Base64 output is plain ASCII)
            with open(target_path, "rb") as f:
                b64_content = base64.b64encode(f.read()).decode("ascii")

            # Send back to JS
            self.send({
                "type": "file_content",
                "filename": os.path.basename(target_path),
                "mime_type": mime_type,
                "content": b64_content
            })

        except Exception as e:
            # Best effort: report any failure to the browser instead of
            # leaving the user with a button that silently did nothing
            self.send({"type": "error", "message": str(e)})

# dummy_filename = "example_data.txt"
# with open(dummy_filename, "w") as f:
#     f.write("Hello! This is a file dynamically read from the kernel disk.\n")
#     f.write("If you are reading this, the widget worked.")



In [27]:
print(zip_file)

/content/GPU_mode/profiles_20251202_111330.zip


In [28]:
!ls

build		       profiles			     test_colab_server.ipynb
CMakeLists.txt	       profiles_20251202_111330.zip  vector_add
include		       README.md
matrix_multiplication  reduction


In [None]:
# Use the timestamped zip with FileDownloader - Used to Download the profiles zip folder
# FileDownloader(file_path=zip_file)

<__main__.FileDownloader object at 0x7f4246627ad0>

In [33]:
# from IPython.display import FileLink

# zip_path = "/content/GPU_mode/profiles_run_2025-11-28T19-12-00.zip"
# FileLink(zip_file)
