<a href="https://colab.research.google.com/github/nyck33/mlir-python-extras-copy/blob/main/mlirAsStringPythonRunSm75.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CUDA/NVGPU/NVVM E2E

In [None]:
!pip install -q  mlir_python_bindings==19.0.0.2024033101+cuda.a67b9326 -f https://makslevental.github.io/wheels
!pip install -q git+https://github.com/makslevental/mlir-python-extras.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for mlir-python-extras (pyproject.toml) ... [?25l[?25hdone


# Boilerplate

In [None]:
from pathlib import Path

import mlir.extras.types as T
from mlir.dialects import builtin
from mlir.dialects.transform import any_op_t
from mlir.dialects.transform.extras import named_sequence
from mlir.dialects.transform.structured import MatchInterfaceEnum
from mlir.ir import StringAttr, UnitAttr, Module

from mlir import _mlir_libs
from mlir.extras.ast.canonicalize import canonicalize
from mlir.extras.context import RAIIMLIRContext, ExplicitlyManagedModule
from mlir.extras.dialects.ext import arith, memref, scf, gpu
from mlir.extras.dialects.ext import linalg
from mlir.extras.dialects.ext import transform
from mlir.extras.dialects.ext.func import func
from mlir.extras.runtime.passes import Pipeline, run_pipeline
from mlir.extras.runtime.refbackend import LLVMJITBackend
from mlir.extras.util import find_ops

# Location of `libmlir_cuda_runtime.so`, expected in the same directory as the
# `_mlir_libs` package file. The path is passed to the JIT backend further down.
CUDA_RUNTIME_LIB_PATH = Path(_mlir_libs.__file__).parent / "libmlir_cuda_runtime.so"
# Fail early with a readable message when the library is missing (for example when
# a wheel without CUDA support is installed).
assert CUDA_RUNTIME_LIB_PATH.exists(), (
    f"CUDA runtime library not found at {CUDA_RUNTIME_LIB_PATH}; "
    "install the CUDA-enabled mlir_python_bindings wheel"
)

# Context

In [None]:
# Create the MLIR context object used by the rest of the notebook.
# NOTE(review): RAIIMLIRContext presumably keeps a context active for as long as
# this object is alive, so `ctx` should stay referenced — confirm.
ctx = RAIIMLIRContext()

# Textual MLIR for the end-to-end example:
#   * allocate a 2x6 i32 buffer (%data) and a 2-element i32 buffer (%sum), and pass
#     both (cast to unranked memrefs) to gpu.host_register;
#   * fill row 0 of %data with 0, 1, 2, 4, 8, 16 and row 1 with 2, 3, 6, 7, 10, 11;
#   * launch a 2x1x1 grid of 6x1x1-thread blocks; every thread loads
#     %data[block, thread], the values are combined with gpu.all_reduce `and`, and
#     the reduced value is stored to %sum[block];
#   * print %sum through the externally declared @printMemrefI32.
# Bitwise AND yields 0 for row 0 and 2 for row 1, so the expected printout is [0, 2].
src = """\
func.func @main() {
  %data = memref.alloc() : memref<2x6xi32>
  %sum = memref.alloc() : memref<2xi32>
  %cst0 = arith.constant 0 : i32
  %cst1 = arith.constant 1 : i32
  %cst2 = arith.constant 2 : i32
  %cst4 = arith.constant 4 : i32
  %cst8 = arith.constant 8 : i32
  %cst16 = arith.constant 16 : i32

  %cst3 = arith.constant 3 : i32
  %cst6 = arith.constant 6 : i32
  %cst7 = arith.constant 7 : i32
  %cst10 = arith.constant 10 : i32
  %cst11 = arith.constant 11 : i32

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index
  %c5 = arith.constant 5 : index
  %c6 = arith.constant 6 : index

  %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
  gpu.host_register %cast_data : memref<*xi32>
  %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
  gpu.host_register %cast_sum : memref<*xi32>

  memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
  memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
  memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
  memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
  memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
  memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>

  memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
  memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
  memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
  memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
  memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
  memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
    %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
    %reduced = gpu.all_reduce and %val uniform {} : (i32) -> (i32)
    memref.store %reduced, %sum[%bx] : memref<2xi32>
    gpu.terminator
  }

  call @printMemrefI32(%cast_sum) : (memref<*xi32>) -> ()

  return
}

func.func private @printMemrefI32(memref<*xi32>)
"""

# Parse the textual IR into a Module. No context is passed explicitly, so this
# relies on a context already being active when the call is made.
module = Module.parse(src)

# Lower to NVVM (and LLVM)

In [None]:
# JIT backend, constructed with the CUDA runtime library path.
backend = LLVMJITBackend([CUDA_RUNTIME_LIB_PATH])

# Options forwarded to the `gpu-lower-to-nvvm-pipeline` pass: target chip,
# PTX feature string and the binary format used for the compiled kernel.
nvvm_options = {
    "cubin-chip": "sm_75",
    "cubin-features": "+ptx75",
    "cubin-format": "fatbin",
}
lowering_pipeline = Pipeline().add_pass("gpu-lower-to-nvvm-pipeline", **nvvm_options)

# Run the lowering pipeline over the parsed module and show the resulting IR.
compiled_module = backend.compile(module, lowering_pipeline)
print(compiled_module)

module attributes {gpu.container_module} {
  llvm.func @malloc(i64) -> !llvm.ptr
  llvm.func @main() attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(0 : i32) : i32
    %1 = llvm.mlir.constant(1 : i32) : i32
    %2 = llvm.mlir.constant(2 : i32) : i32
    %3 = llvm.mlir.constant(4 : i32) : i32
    %4 = llvm.mlir.constant(8 : i32) : i32
    %5 = llvm.mlir.constant(16 : i32) : i32
    %6 = llvm.mlir.constant(3 : i32) : i32
    %7 = llvm.mlir.constant(6 : i32) : i32
    %8 = llvm.mlir.constant(7 : i32) : i32
    %9 = llvm.mlir.constant(10 : i32) : i32
    %10 = llvm.mlir.constant(11 : i32) : i32
    %11 = llvm.mlir.constant(0 : index) : i64
    %12 = llvm.mlir.constant(1 : index) : i64
    %13 = llvm.mlir.constant(2 : index) : i64
    %14 = llvm.mlir.constant(3 : index) : i64
    %15 = llvm.mlir.constant(4 : index) : i64
    %16 = llvm.mlir.constant(5 : index) : i64
    %17 = llvm.mlir.constant(6 : index) : i64
    %18 = llvm.mlir.zero : !llvm.ptr
    %19 = llvm.getelementp

# Load and run

In [None]:
!pip install -q wurlitzer
from wurlitzer import pipes

In [None]:
# Load the compiled module and call its `main` wrapper while stdout/stderr are
# redirected into the `out` / `err` buffers, which are read in a later cell.
with pipes() as (out, err):
    loaded_module = backend.load(compiled_module)
    loaded_module.main_capi_wrapper()

In [None]:
# Show what the program printed to stdout (the contents of the %sum memref).
print(out.read())

# stderr was captured as well; surface it so that anything the run reported
# there is not silently discarded. Nothing extra is printed when it is empty.
captured_stderr = err.read()
if captured_stderr:
    print("--- captured stderr ---")
    print(captured_stderr)

Unranked Memref base@ = 0x567703fb08a0 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
[0,  2]

