In [1]:
import islpy as isl
from latex_op import display_latex
from islplot_support import print_before_after

# Classical Loop Transformations

### Setup AST generation infrastructure

## Loop Reversal

Loop reversal changes the direction in which the iterations of a loop are visited. After loop reversal, the former first iteration is executed last and the former last iteration is executed first.

**Benefits**:
- Can be used to shorten dependences

In [2]:
# Iteration domain: one instance of S for every i in [0, n), n symbolic.
domain = isl.UnionSet("[n] -> {S[i] : 0 <= i < n}")

# Reversal negates the only schedule dimension, so larger i values get
# smaller time stamps and therefore run earlier.
transformation = isl.UnionMap("{[i] -> [-i]}")

# Identity schedule: S[i] is executed at time step i.
original = isl.UnionMap("{S[i] -> [i]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 < n; c0 += 1)
  S(c0);

After Transform:

for (int c0 = -n + 1; c0 <= 0; c0 += 1)
  S(-c0);



## Loop Fusion

After loop fusion, two statements that were previously enumerated by different loops are
enumerated by a single loop.

**Benefits:**
  - Improves data-locality


In [3]:
# S and T iterate over the same range 0 <= i <= n.
domain = isl.UnionSet("[n] -> {S[i] : 0 <= i <= n; T[i] : 0 <= i <= n}")

# Fusion moves the loop index to the outer schedule dimension; the constant
# (0 for S, 1 for T) becomes the inner dimension and only orders S before T
# inside one iteration.
transformation = isl.UnionMap("{[0, i] -> [i,0]; [1, i] -> [i, 1]}")

# Before: the leading constant places every S instance ahead of every T instance.
original = isl.UnionMap("{S[i] -> [0, i]; T[i] -> [1, i]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

{
  for (int c1 = 0; c1 <= n; c1 += 1)
    S(c1);
  for (int c1 = 0; c1 <= n; c1 += 1)
    T(c1);
}

After Transform:

for (int c0 = 0; c0 <= n; c0 += 1) {
  S(c0);
  T(c0);
}



## Loop Fission (Loop Distribution)

Loop fission takes two statements that were originally executed in the same
loop and distributes them to two separate loops.

**Benefits:**
 - Reduces register pressure
 - Enables other transformations, e.g. SIMDization in case only one of
   the two statements in a loop body allows for parallel execution.

In [4]:
# S and T iterate over the same range 0 <= i <= n.
domain = isl.UnionSet("[n] -> {S[i] : 0 <= i <= n; T[i] : 0 <= i <= n}")

# Fission is the inverse of fusion: the statement constant (0 for S, 1 for T)
# moves to the outer schedule dimension, giving each statement its own loop.
transformation = isl.UnionMap("{[i, 0] -> [0, i]; [i, 1] -> [1, i]}")

# Before: both statements share loop i; the trailing constant orders S before T.
original = isl.UnionMap("{S[i] -> [i, 0]; T[i] -> [i, 1]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 <= n; c0 += 1) {
  S(c0);
  T(c0);
}

After Transform:

{
  for (int c1 = 0; c1 <= n; c1 += 1)
    S(c1);
  for (int c1 = 0; c1 <= n; c1 += 1)
    T(c1);
}



## Loop Interchange

In [5]:
# Swapping the two schedule dimensions makes j the outer loop and i the inner one.
transformation = isl.UnionMap("{[i, j] -> [j, i]}")

# Rectangular iteration space with symbolic, inclusive upper bounds n and m.
domain = isl.UnionSet("[n,m] -> {S[i,j] : 0 <= i <= n and 0 <= j <= m }")

# Identity schedule: i outer, j inner.
original = isl.UnionMap("{S[i,j] -> [i, j]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 <= n; c0 += 1)
  for (int c1 = 0; c1 <= m; c1 += 1)
    S(c0, c1);

After Transform:

for (int c0 = 0; c0 <= m; c0 += 1)
  for (int c1 = 0; c1 <= n; c1 += 1)
    S(c1, c0);



## Strip Mining

Strip mining partitions a single loop into chunks that are enumerated by two loops.
An outer loop enumerates the individual blocks, whereas the inner loop enumerates
the individual iterations that belong to each block.

**Benefits:**
 - Building block for loop tiling and unroll-and-jam.

In [6]:
# Split the single time dimension into a block index (floor(i/4)) and the
# position inside that block (i % 4); the block size is 4.
transformation = isl.UnionMap("{[i] -> [floor(i/4), i % 4]}")

# Fixed-size iteration space: 1024 instances of S.
domain = isl.UnionSet("{S[i] : 0 <= i < 1024 }")

# Identity schedule: S[i] is executed at time step i.
original = isl.UnionMap("{S[i] -> [i]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 <= 1023; c0 += 1)
  S(c0);

After Transform:

for (int c0 = 0; c0 <= 255; c0 += 1)
  for (int c1 = 0; c1 <= 3; c1 += 1)
    S(4 * c0 + c1);



## Loop Tiling

Loop tiling partitions the execution of a multi-dimensional loop nest into groups, the tiles.
First, a set of outer loops, the tile loops, enumerates all tiles that must be executed; then, for each tile,
a set of inner loops, the point loops, enumerates the individual points of that tile.

**Benefits:**
 - Increased data-locality
 - More coarse-grained parallelism

In [7]:
domain = isl.UnionSet("{S[i,j] : 0 <= i,j < 1024 }")
original = isl.UnionMap("{S[i,j] -> [i,j]}")
transformation = isl.UnionMap("{[i,j] -> [floor(i/4), i % 4, floor(j/4), j % 4]}")

transformed = original.apply_range(transformation)
print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 <= 1023; c0 += 1)
  for (int c1 = 0; c1 <= 1023; c1 += 1)
    S(c0, c1);

After Transform:

for (int c0 = 0; c0 <= 255; c0 += 1)
  for (int c1 = 0; c1 <= 3; c1 += 1)
    for (int c2 = 0; c2 <= 255; c2 += 1)
      for (int c3 = 0; c3 <= 3; c3 += 1)
        S(4 * c0 + c1, 4 * c2 + c3);



## Unroll-and-jam

Unroll-and-jam is a combination of strip-mining of the outer loop into a
tile and point loop and then an interchange of the new point loop with
the innermost loop dimension.

**Benefits:**
 - Enables outer loop vectorization

In [8]:
# Strip-mine i by 4, then sink the resulting point dimension (i % 4) below j
# so that it becomes the innermost loop.
transformation = isl.UnionMap("{[i,j] -> [floor(i/4), j, i % 4] }")

# 1024 x 1024 iteration space.
domain = isl.UnionSet("{S[i,j] : 0 <= i,j < 1024 }")

# Identity schedule: i outer, j inner.
original = isl.UnionMap("{S[i,j] -> [i,j]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 <= 1023; c0 += 1)
  for (int c1 = 0; c1 <= 1023; c1 += 1)
    S(c0, c1);

After Transform:

for (int c0 = 0; c0 <= 255; c0 += 1)
  for (int c1 = 0; c1 <= 1023; c1 += 1)
    for (int c2 = 0; c2 <= 3; c2 += 1)
      S(4 * c0 + c2, c1);



## Skewing


In [9]:
# Skew the inner dimension by the outer one: the inner time stamp becomes i + j,
# while the outer dimension is left unchanged.
transformation = isl.UnionMap("{[i,j] -> [i, i + j]}")

# Square n x n iteration space, n symbolic.
domain = isl.UnionSet("[n] -> {S[i,j] : 0 <= i,j < n }")

# Identity schedule: i outer, j inner.
original = isl.UnionMap("{S[i,j] -> [i,j]}")
transformed = original.apply_range(transformation)

print_before_after(domain, original, transformed)

Before Transform:

for (int c0 = 0; c0 < n; c0 += 1)
  for (int c1 = 0; c1 < n; c1 += 1)
    S(c0, c1);

After Transform:

for (int c0 = 0; c0 < n; c0 += 1)
  for (int c1 = c0; c1 < n + c0; c1 += 1)
    S(c0, -c0 + c1);

