-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Open
Labels
Description
Compile command: -Ofast -Rpass=loop-unroll -mcpu=cortex-a57
For kernels like the following:
for (i = 0; i < n; i++) {
if (e[i] >= t)
a[i] = a[i] + c[i] * d[i];
}
Here a, c, d, and e are arrays. For such loops, loop unrolling doesn't happen (the computed unroll count comes out as 0 or 1). However, if unrolling is forced using -unroll-count=UF, then UF=4 gives the best performance (~25% gain).
Currently, the IR (without forced unrolling) looks like this:
%8 = icmp sgt i32 %0, 0, !dbg !29
br i1 %8, label %9, label %28, !dbg !32
9:
%10 = zext nneg i32 %0 to i64, !dbg !29
br label %11, !dbg !32
11:
%12 = phi i64 [ 0, %9 ], [ %26, %25 ]
#dbg_value(i64 %12, !27, !DIExpression(), !28)
%13 = getelementptr inbounds nuw i8, ptr %5, i64 %12, !dbg !33
%14 = load i8, ptr %13, align 1, !dbg !33
%15 = icmp ult i8 %14, %6, !dbg !39
br i1 %15, label %25, label %16, !dbg !39
16:
%17 = getelementptr inbounds nuw i8, ptr %1, i64 %12, !dbg !40
%18 = load i8, ptr %17, align 1, !dbg !40
%19 = getelementptr inbounds nuw i8, ptr %3, i64 %12, !dbg !42
%20 = load i8, ptr %19, align 1, !dbg !42
%21 = getelementptr inbounds nuw i8, ptr %4, i64 %12, !dbg !43
%22 = load i8, ptr %21, align 1, !dbg !43
%23 = mul i8 %22, %20, !dbg !44
%24 = add i8 %23, %18, !dbg !45
store i8 %24, ptr %17, align 1, !dbg !46
br label %25, !dbg !47
25:
%26 = add nuw nsw i64 %12, 1, !dbg !48
#dbg_value(i64 %26, !27, !DIExpression(), !28)
%27 = icmp eq i64 %26, %10, !dbg !29
br i1 %27, label %28, label %11, !dbg !32
After forcing unrolling via -unroll-count=UF (with UF=4), the IR looks like this:
%8 = icmp sgt i32 %0, 0, !dbg !29
br i1 %8, label %9, label %98, !dbg !32
9:
%10 = zext nneg i32 %0 to i64, !dbg !29
%11 = and i64 %10, 3, !dbg !32
%12 = icmp ult i32 %0, 4, !dbg !32
br i1 %12, label %76, label %13, !dbg !32
13:
%14 = and i64 %10, 2147483644, !dbg !32
br label %15, !dbg !32
15:
%16 = phi i64 [ 0, %13 ], [ %73, %72 ]
%17 = phi i64 [ 0, %13 ], [ %74, %72 ]
#dbg_value(i64 %16, !27, !DIExpression(), !28)
%18 = getelementptr inbounds nuw i8, ptr %5, i64 %16, !dbg !33
%19 = load i8, ptr %18, align 1, !dbg !33
%20 = icmp ult i8 %19, %6, !dbg !39
br i1 %20, label %30, label %21, !dbg !39
21:
%22 = getelementptr inbounds nuw i8, ptr %1, i64 %16, !dbg !40
%23 = load i8, ptr %22, align 1, !dbg !40
%24 = getelementptr inbounds nuw i8, ptr %3, i64 %16, !dbg !42
%25 = load i8, ptr %24, align 1, !dbg !42
%26 = getelementptr inbounds nuw i8, ptr %4, i64 %16, !dbg !43
%27 = load i8, ptr %26, align 1, !dbg !43
%28 = mul i8 %27, %25, !dbg !44
%29 = add i8 %28, %23, !dbg !45
store i8 %29, ptr %22, align 1, !dbg !46
br label %30, !dbg !47
30:
%31 = or disjoint i64 %16, 1, !dbg !48
#dbg_value(i64 %31, !27, !DIExpression(), !28)
%32 = getelementptr inbounds nuw i8, ptr %5, i64 %31, !dbg !33
%33 = load i8, ptr %32, align 1, !dbg !33
%34 = icmp ult i8 %33, %6, !dbg !39
br i1 %34, label %44, label %35, !dbg !39
35:
%36 = getelementptr inbounds nuw i8, ptr %1, i64 %31, !dbg !40
%37 = load i8, ptr %36, align 1, !dbg !40
%38 = getelementptr inbounds nuw i8, ptr %3, i64 %31, !dbg !42
%39 = load i8, ptr %38, align 1, !dbg !42
%40 = getelementptr inbounds nuw i8, ptr %4, i64 %31, !dbg !43
%41 = load i8, ptr %40, align 1, !dbg !43
%42 = mul i8 %41, %39, !dbg !44
%43 = add i8 %42, %37, !dbg !45
store i8 %43, ptr %36, align 1, !dbg !46
br label %44, !dbg !47
44:
%45 = or disjoint i64 %16, 2, !dbg !48
#dbg_value(i64 %45, !27, !DIExpression(), !28)
%46 = getelementptr inbounds nuw i8, ptr %5, i64 %45, !dbg !33
%47 = load i8, ptr %46, align 1, !dbg !33
%48 = icmp ult i8 %47, %6, !dbg !39
br i1 %48, label %58, label %49, !dbg !39
49:
%50 = getelementptr inbounds nuw i8, ptr %1, i64 %45, !dbg !40
%51 = load i8, ptr %50, align 1, !dbg !40
%52 = getelementptr inbounds nuw i8, ptr %3, i64 %45, !dbg !42
%53 = load i8, ptr %52, align 1, !dbg !42
%54 = getelementptr inbounds nuw i8, ptr %4, i64 %45, !dbg !43
%55 = load i8, ptr %54, align 1, !dbg !43
%56 = mul i8 %55, %53, !dbg !44
%57 = add i8 %56, %51, !dbg !45
store i8 %57, ptr %50, align 1, !dbg !46
br label %58, !dbg !47
58:
%59 = or disjoint i64 %16, 3, !dbg !48
#dbg_value(i64 %59, !27, !DIExpression(), !28)
%60 = getelementptr inbounds nuw i8, ptr %5, i64 %59, !dbg !33
%61 = load i8, ptr %60, align 1, !dbg !33
%62 = icmp ult i8 %61, %6, !dbg !39
br i1 %62, label %72, label %63, !dbg !39
63:
%64 = getelementptr inbounds nuw i8, ptr %1, i64 %59, !dbg !40
%65 = load i8, ptr %64, align 1, !dbg !40
%66 = getelementptr inbounds nuw i8, ptr %3, i64 %59, !dbg !42
%67 = load i8, ptr %66, align 1, !dbg !42
%68 = getelementptr inbounds nuw i8, ptr %4, i64 %59, !dbg !43
%69 = load i8, ptr %68, align 1, !dbg !43
%70 = mul i8 %69, %67, !dbg !44
%71 = add i8 %70, %65, !dbg !45
store i8 %71, ptr %64, align 1, !dbg !46
br label %72, !dbg !47
Link to the reproducer.