Skip to content

[AArch64] No unrolling happens even when it's performant #168021

@ShikharjQUIC

Description

@ShikharjQUIC

Compile command: -Ofast -Rpass=loop-unroll -mcpu=cortex-a57
For kernels like the following:

for (i = 0; i < n; i++) {
  if (e[i] >= t) 
    a[i] = a[i] + c[i] * d[i];               
  } 

where a, b, c, d, and e are arrays, loop unrolling doesn't happen (the unroll count comes out as 0 or 1). However, if unrolling is forced using -unroll-count=UF, then UF=4 provides optimal performance (~25% gain).

Currently, the IR (without forced unrolling) looks like this:

%8 = icmp sgt i32 %0, 0, !dbg !29
 br i1 %8, label %9, label %28, !dbg !32

9:
 %10 = zext nneg i32 %0 to i64, !dbg !29
 br label %11, !dbg !32

11:
 %12 = phi i64 [ 0, %9 ], [ %26, %25 ]
   #dbg_value(i64 %12, !27, !DIExpression(), !28)
 %13 = getelementptr inbounds nuw i8, ptr %5, i64 %12, !dbg !33
 %14 = load i8, ptr %13, align 1, !dbg !33
 %15 = icmp ult i8 %14, %6, !dbg !39
 br i1 %15, label %25, label %16, !dbg !39

16:
 %17 = getelementptr inbounds nuw i8, ptr %1, i64 %12, !dbg !40
 %18 = load i8, ptr %17, align 1, !dbg !40
 %19 = getelementptr inbounds nuw i8, ptr %3, i64 %12, !dbg !42
 %20 = load i8, ptr %19, align 1, !dbg !42
 %21 = getelementptr inbounds nuw i8, ptr %4, i64 %12, !dbg !43
 %22 = load i8, ptr %21, align 1, !dbg !43
 %23 = mul i8 %22, %20, !dbg !44
 %24 = add i8 %23, %18, !dbg !45
 store i8 %24, ptr %17, align 1, !dbg !46
 br label %25, !dbg !47

25:
 %26 = add nuw nsw i64 %12, 1, !dbg !48
   #dbg_value(i64 %26, !27, !DIExpression(), !28)
 %27 = icmp eq i64 %26, %10, !dbg !29
 br i1 %27, label %28, label %11, !dbg !32

After forcing unrolling via -unroll-count=UF (here UF=4), the IR looks like this:

%8 = icmp sgt i32 %0, 0, !dbg !29
  br i1 %8, label %9, label %98, !dbg !32

9:
  %10 = zext nneg i32 %0 to i64, !dbg !29
  %11 = and i64 %10, 3, !dbg !32
  %12 = icmp ult i32 %0, 4, !dbg !32
  br i1 %12, label %76, label %13, !dbg !32

13:
  %14 = and i64 %10, 2147483644, !dbg !32
  br label %15, !dbg !32

15:
  %16 = phi i64 [ 0, %13 ], [ %73, %72 ]
  %17 = phi i64 [ 0, %13 ], [ %74, %72 ]
    #dbg_value(i64 %16, !27, !DIExpression(), !28)
  %18 = getelementptr inbounds nuw i8, ptr %5, i64 %16, !dbg !33
  %19 = load i8, ptr %18, align 1, !dbg !33
  %20 = icmp ult i8 %19, %6, !dbg !39
  br i1 %20, label %30, label %21, !dbg !39

21:
  %22 = getelementptr inbounds nuw i8, ptr %1, i64 %16, !dbg !40
  %23 = load i8, ptr %22, align 1, !dbg !40
  %24 = getelementptr inbounds nuw i8, ptr %3, i64 %16, !dbg !42
  %25 = load i8, ptr %24, align 1, !dbg !42
  %26 = getelementptr inbounds nuw i8, ptr %4, i64 %16, !dbg !43
  %27 = load i8, ptr %26, align 1, !dbg !43
  %28 = mul i8 %27, %25, !dbg !44
  %29 = add i8 %28, %23, !dbg !45
  store i8 %29, ptr %22, align 1, !dbg !46
  br label %30, !dbg !47

30:
  %31 = or disjoint i64 %16, 1, !dbg !48
    #dbg_value(i64 %31, !27, !DIExpression(), !28)
  %32 = getelementptr inbounds nuw i8, ptr %5, i64 %31, !dbg !33
  %33 = load i8, ptr %32, align 1, !dbg !33
  %34 = icmp ult i8 %33, %6, !dbg !39
  br i1 %34, label %44, label %35, !dbg !39

35:
  %36 = getelementptr inbounds nuw i8, ptr %1, i64 %31, !dbg !40
  %37 = load i8, ptr %36, align 1, !dbg !40
  %38 = getelementptr inbounds nuw i8, ptr %3, i64 %31, !dbg !42
  %39 = load i8, ptr %38, align 1, !dbg !42
  %40 = getelementptr inbounds nuw i8, ptr %4, i64 %31, !dbg !43
  %41 = load i8, ptr %40, align 1, !dbg !43
  %42 = mul i8 %41, %39, !dbg !44
  %43 = add i8 %42, %37, !dbg !45
  store i8 %43, ptr %36, align 1, !dbg !46
  br label %44, !dbg !47

44:
  %45 = or disjoint i64 %16, 2, !dbg !48
    #dbg_value(i64 %45, !27, !DIExpression(), !28)
  %46 = getelementptr inbounds nuw i8, ptr %5, i64 %45, !dbg !33
  %47 = load i8, ptr %46, align 1, !dbg !33
  %48 = icmp ult i8 %47, %6, !dbg !39
  br i1 %48, label %58, label %49, !dbg !39

49:
  %50 = getelementptr inbounds nuw i8, ptr %1, i64 %45, !dbg !40
  %51 = load i8, ptr %50, align 1, !dbg !40
  %52 = getelementptr inbounds nuw i8, ptr %3, i64 %45, !dbg !42
  %53 = load i8, ptr %52, align 1, !dbg !42
  %54 = getelementptr inbounds nuw i8, ptr %4, i64 %45, !dbg !43
  %55 = load i8, ptr %54, align 1, !dbg !43
  %56 = mul i8 %55, %53, !dbg !44
  %57 = add i8 %56, %51, !dbg !45
  store i8 %57, ptr %50, align 1, !dbg !46
  br label %58, !dbg !47

58:
  %59 = or disjoint i64 %16, 3, !dbg !48
    #dbg_value(i64 %59, !27, !DIExpression(), !28)
  %60 = getelementptr inbounds nuw i8, ptr %5, i64 %59, !dbg !33
  %61 = load i8, ptr %60, align 1, !dbg !33
  %62 = icmp ult i8 %61, %6, !dbg !39
  br i1 %62, label %72, label %63, !dbg !39

63:
  %64 = getelementptr inbounds nuw i8, ptr %1, i64 %59, !dbg !40
  %65 = load i8, ptr %64, align 1, !dbg !40
  %66 = getelementptr inbounds nuw i8, ptr %3, i64 %59, !dbg !42
  %67 = load i8, ptr %66, align 1, !dbg !42
  %68 = getelementptr inbounds nuw i8, ptr %4, i64 %59, !dbg !43
  %69 = load i8, ptr %68, align 1, !dbg !43
  %70 = mul i8 %69, %67, !dbg !44
  %71 = add i8 %70, %65, !dbg !45
  store i8 %71, ptr %64, align 1, !dbg !46
  br label %72, !dbg !47

Link to reproducer.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions