# Fusing graphblas.matrix_multiply with graphblas.matrix_apply

This example will go over how to use the `--graphblas-optimize` pass from `graphblas-opt` to fuse `graphblas.matrix_multiply` ops with `graphblas.matrix_apply` ops into `graphblas.matrix_multiply` ops with a region attached.

Let's first import some necessary libraries.

In [1]:
import tempfile
from mlir_graphblas.cli import GRAPHBLAS_OPT_EXE

Since [sparse tensor encodings](https://mlir.llvm.org/docs/Dialects/SparseTensorOps/#sparsetensorencodingattr) can be very verbose in MLIR, let's write a helper to make the MLIR code more readable.

In [2]:
def tersify_mlir(input_string: str) -> str:
    """Return *input_string* with verbose sparse tensor encodings abbreviated.

    Every occurrence of the fully spelled-out 64-bit ("dense", "compressed")
    encoding attribute is swapped for a short alias: ``#CSR64`` when the
    dimension ordering is ``(d0, d1)`` and ``#CSC64`` when it is ``(d1, d0)``.
    Text that does not contain either encoding is returned unchanged.
    """
    # The two encodings differ only in the result of the dimOrdering map, so
    # the surrounding text is shared between them.
    encoding_head = (
        '#sparse_tensor.encoding<{ '
        'dimLevelType = [ "dense", "compressed" ], '
        'dimOrdering = affine_map<(d0, d1) -> '
    )
    encoding_tail = (
        '>, '
        'pointerBitWidth = 64, '
        'indexBitWidth = 64 '
        '}>'
    )
    ordering_aliases = (
        ("(d0, d1)", "#CSR64"),
        ("(d1, d0)", "#CSC64"),
    )

    condensed = input_string
    for ordering, alias in ordering_aliases:
        verbose_encoding = encoding_head + ordering + encoding_tail
        condensed = condensed.replace(verbose_encoding, alias)
    return condensed

## Fusion Details

Recall that `graphblas.matrix_multiply` can take an optional region, e.g. this code squares each element of the matrix multiply product:
```
%answer = graphblas.matrix_multiply %argA, %argB { semiring = "plus_times" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64> {
        ^bb0(%value: f64):
            %result = std.mulf %value, %value: f64
            graphblas.yield %result : f64
    }
```

Since `graphblas.matrix_apply` ops only change tensors in an element-wise fashion, we can perform these element-wise changes in the region of a `graphblas.matrix_multiply` op if the `graphblas.matrix_apply` op is run on the result of a `graphblas.matrix_multiply` op.

## Simple Fusion

Here, we'll show the simplest example of how we can fuse a `graphblas.matrix_multiply` op with a `graphblas.matrix_apply` op.

In [3]:
# MLIR input for the simple fusion case: the result of a graphblas.matrix_multiply
# is consumed only by a graphblas.matrix_apply ("min" against %thunk).
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (i,j)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

func @fuse_adjacent(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>, %thunk: f64) -> tensor<?x?xf64, #CSR64> {
    %C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64> 
    %apply_result = graphblas.matrix_apply %C, %thunk { apply_operator = "min" } : (tensor<?x?xf64, #CSR64>, f64) to tensor<?x?xf64, #CSR64>
    return %apply_result : tensor<?x?xf64, #CSR64>
}
"""

# Stage the MLIR text in a temporary file so it can be piped into graphblas-opt.
with tempfile.NamedTemporaryFile() as temp:
    temp_file_name = temp.name
    # The text is written through a second handle opened by name; that handle
    # is closed when this inner `with` exits, before the shell command runs.
    with open(temp_file_name, 'w') as f:
        f.write(mlir_text)
    temp.flush()

    # IPython shell capture: run the --graphblas-optimize pass on the file contents.
    output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-optimize
    # Join the captured output with newlines, then abbreviate the sparse
    # tensor encodings for readability.
    output_mlir = "\n".join(output_mlir)
    output_mlir = tersify_mlir(output_mlir)

print(output_mlir)

module  {
  func @fuse_adjacent(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>, %arg2: f64) -> tensor<?x?xf64, #CSR64> {
    %0 = graphblas.matrix_multiply %arg0, %arg1 {semiring = "plus_plus"} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>  {
    ^bb0(%arg3: f64):  // no predecessors
      %1 = cmpf olt, %arg3, %arg2 : f64
      %2 = select %1, %arg3, %arg2 : f64
      graphblas.yield %2 : f64
    }
    return %0 : tensor<?x?xf64, #CSR64>
  }
}



The code in the region attached to the `graphblas.matrix_multiply` in the optimized MLIR here may seem confusing at first, but it's simply calculating the minimum of each element (i.e. `%arg3`) and the thunk (i.e. `%thunk` or `%arg2`).

## Simple Fusion with Mask

Fusion of `graphblas.matrix_multiply` ops with `graphblas.matrix_apply` ops also works if the `graphblas.matrix_multiply` op takes a mask.

In [4]:
# MLIR input for the masked fusion case: the graphblas.matrix_multiply takes a
# third operand (%A) and its result is consumed only by a graphblas.matrix_apply.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (i,j)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

func @fuse_adjacent_with_mask(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>, %thunk: f64) -> tensor<?x?xf64, #CSR64> {
    %C = graphblas.matrix_multiply %A, %B, %A { semiring = "plus_pair" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>, tensor<?x?xf64, #CSR64>) to tensor<?x?xf64, #CSR64> 
    %apply_result = graphblas.matrix_apply %C, %thunk { apply_operator = "min" } : (tensor<?x?xf64, #CSR64>, f64) to tensor<?x?xf64, #CSR64>
    return %apply_result : tensor<?x?xf64, #CSR64>
}
"""

# Stage the MLIR text in a temporary file so it can be piped into graphblas-opt.
with tempfile.NamedTemporaryFile() as temp:
    temp_file_name = temp.name
    # The text is written through a second handle opened by name; that handle
    # is closed when this inner `with` exits, before the shell command runs.
    with open(temp_file_name, 'w') as f:
        f.write(mlir_text)
    temp.flush()

    # IPython shell capture: run the --graphblas-optimize pass on the file contents.
    output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-optimize
    # Join the captured output with newlines, then abbreviate the sparse
    # tensor encodings for readability.
    output_mlir = "\n".join(output_mlir)
    output_mlir = tersify_mlir(output_mlir)

print(output_mlir)

module  {
  func @fuse_adjacent_with_mask(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>, %arg2: f64) -> tensor<?x?xf64, #CSR64> {
    %0 = graphblas.matrix_multiply %arg0, %arg1, %arg0 {semiring = "plus_pair"} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>, tensor<?x?xf64, #CSR64>) to tensor<?x?xf64, #CSR64>  {
    ^bb0(%arg3: f64):  // no predecessors
      %1 = cmpf olt, %arg3, %arg2 : f64
      %2 = select %1, %arg3, %arg2 : f64
      graphblas.yield %2 : f64
    }
    return %0 : tensor<?x?xf64, #CSR64>
  }
}



## Non-applicable Fusion

One thing to note is that if any intermediate value produced by the ops being fused, e.g. the result of a `graphblas.matrix_multiply`, is used elsewhere, the fusion cannot and will not be applied, as shown here.

In [5]:
# MLIR input for the non-fusable case: the graphblas.matrix_multiply result %C
# is used both by the graphblas.matrix_apply and directly in the return.
mlir_text = """
#CSR64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (i,j)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

#CSC64 = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
  dimOrdering = affine_map<(i,j) -> (j,i)>,
  pointerBitWidth = 64,
  indexBitWidth = 64
}>

func @nofuse_multi_use(%A: tensor<?x?xf64, #CSR64>, %B: tensor<?x?xf64, #CSC64>, %thunk: f64) -> (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSR64>) {
    %C = graphblas.matrix_multiply %A, %B { semiring = "plus_plus" } : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64> 
    %apply_result = graphblas.matrix_apply %C, %thunk { apply_operator = "min" } : (tensor<?x?xf64, #CSR64>, f64) to tensor<?x?xf64, #CSR64>
    return %apply_result, %C : tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSR64>
}
"""

# Stage the MLIR text in a temporary file so it can be piped into graphblas-opt.
with tempfile.NamedTemporaryFile() as temp:
    temp_file_name = temp.name
    # The text is written through a second handle opened by name; that handle
    # is closed when this inner `with` exits, before the shell command runs.
    with open(temp_file_name, 'w') as f:
        f.write(mlir_text)
    temp.flush()

    # IPython shell capture: run the --graphblas-optimize pass on the file contents.
    output_mlir = ! cat $temp_file_name | $GRAPHBLAS_OPT_EXE --graphblas-optimize
    # Join the captured output with newlines, then abbreviate the sparse
    # tensor encodings for readability.
    output_mlir = "\n".join(output_mlir)
    output_mlir = tersify_mlir(output_mlir)

print(output_mlir)

module  {
  func @nofuse_multi_use(%arg0: tensor<?x?xf64, #CSR64>, %arg1: tensor<?x?xf64, #CSC64>, %arg2: f64) -> (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSR64>) {
    %0 = graphblas.matrix_multiply %arg0, %arg1 {semiring = "plus_plus"} : (tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSC64>) to tensor<?x?xf64, #CSR64>
    %1 = graphblas.matrix_apply %0, %arg2 {apply_operator = "min"} : (tensor<?x?xf64, #CSR64>, f64) to tensor<?x?xf64, #CSR64>
    return %1, %0 : tensor<?x?xf64, #CSR64>, tensor<?x?xf64, #CSR64>
  }
}

