[flang][openmp] inefficient code generation for openmp parallel loops?

Consider the following OpenMP program (reduced from 362.fma3d in the SPEC OMP2012 suite):
```
module submod
      ! Derived types and module-level data shared with SUBROUTINE SUB.
      ! Every component is REAL(KIND(0D0)), i.e. the kind of a
      ! double-precision literal.

      ! NT: a single scalar component, Mi.
      TYPE :: NT
        REAL(KIND(0D0))  Mi
      END TYPE

      ! MT: three components (Ax, Ay, Az); these are the values written
      ! by the loop in SUB.
      TYPE :: MT
        REAL(KIND(0D0))  Ax
        REAL(KIND(0D0))  Ay
        REAL(KIND(0D0))  Az
      END TYPE

      ! FT: six components; SUB reads them in pairs (Xe/Xi, Ye/Yi, Ze/Zi).
      TYPE :: FT
        REAL(KIND(0D0))  Xi
        REAL(KIND(0D0))  Yi
        REAL(KIND(0D0))  Zi
        REAL(KIND(0D0))  Xe
        REAL(KIND(0D0))  Ye
        REAL(KIND(0D0))  Ze
      END TYPE

      ! Rank-1 allocatable arrays of the types above. Their allocation and
      ! initialization are not shown in this reproducer.
      TYPE (FT), DIMENSION(:), ALLOCATABLE :: FF
      TYPE (MT), DIMENSION(:), ALLOCATABLE :: MM
      TYPE (NT), DIMENSION(:), ALLOCATABLE :: NN
      ! N is used as the loop index in SUB; NR is that loop's upper bound.
      INTEGER :: N, NR
end module

SUBROUTINE SUB()
! Fills MM(1:NR) from NN and FF inside an OpenMP parallel loop, then
! prints MM. All arrays, N and NR come from module submod.
USE submod
! N (the module-level loop index) is made PRIVATE; every other variable
! is SHARED via DEFAULT(SHARED).
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(N)
      DO N = 1,NR
        ! Each MM(N) component is NN(N)%Mi times the difference between
        ! the matching "e" and "i" components of FF(N).
        MM(N)%Ax = NN(N)%Mi * (FF(N)%Xe-FF(N)%Xi)
        MM(N)%Ay = NN(N)%Mi * (FF(N)%Ye-FF(N)%Yi)
        MM(N)%Az = NN(N)%Mi * (FF(N)%Ze-FF(N)%Zi)
      ENDDO
!$OMP END PARALLEL DO
      ! List-directed output of the whole MM array.
      PRINT *, MM

END SUBROUTINE
```

The x86 code generated for the OpenMP loop (shown below) is suboptimal. It was compiled with:

`$ flang -O3 -fopenmp omp_p.f90`

```
.LBB1_2:
         movslq  %r10d, %r11
         movq    %r11, %r14
         movq    %r11, %r15
         leal    1(%r11), %r10d
         movq    %r11, %r12
         subq    %rdx, %r12
         movsd   (%rcx,%r12,8), %xmm0
         subq    %rdi, %r14
         leaq    (%r14,%r14,2), %r14
         shlq    $4, %r14
         subq    %r9, %r15
         leaq    (%r15,%r15,2), %r15
         movupd  (%rsi,%r14), %xmm1
         movupd  24(%rsi,%r14), %xmm2
         subpd   %xmm1, %xmm2
         movapd  %xmm0, %xmm1
         unpcklpd        %xmm0, %xmm1
         mulpd   %xmm2, %xmm1
         movupd  %xmm1, (%r8,%r15,8)
         movsd   40(%rsi,%r14), %xmm1
         subsd   16(%rsi,%r14), %xmm1
         addl    %eax, %r11d
         incl    %r11d
         mulsd   %xmm0, %xmm1
         movsd   %xmm1, 16(%r8,%r15,8)
         cmpl    $2, %r11d
         jne     .LBB1_2
```
The generated code contains many index-calculation instructions. When `-fopenmp` is not used, the generated code does not have nearly as many index calculations.

By comparison, the code generated by classic flang is better; it is shown below.
```
.LBB2_5:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        .loc    1 53 1 is_stmt 1                # t3.f90:53:1
        movupd  80(%r15,%rbp,2), %xmm2
        movupd  64(%r15,%rbp,2), %xmm3
        movupd  (%r15,%rbp,2), %xmm4
        movupd  16(%r15,%rbp,2), %xmm0
        movupd  32(%r15,%rbp,2), %xmm1
        movupd  48(%r15,%rbp,2), %xmm5
        movapd  %xmm4, %xmm6
        unpcklpd        %xmm5, %xmm6                    # xmm6 = xmm6[0],xmm5[0]
        unpckhpd        %xmm5, %xmm4                    # xmm4 = xmm4[1],xmm5[1]
        movapd  %xmm0, %xmm5
        unpcklpd        %xmm3, %xmm5                    # xmm5 = xmm5[0],xmm3[0]
        unpckhpd        %xmm3, %xmm0                    # xmm0 = xmm0[1],xmm3[1]
        subpd   %xmm6, %xmm0
        movapd  %xmm1, %xmm3
        unpcklpd        %xmm2, %xmm3                    # xmm3 = xmm3[0],xmm2[0]
        .loc    1 54 1                          # t3.f90:54:1
        subpd   %xmm4, %xmm3
        .loc    1 53 1                          # t3.f90:53:1
        unpckhpd        %xmm2, %xmm1                    # xmm1 = xmm1[1],xmm2[1]
        .loc    1 55 1                          # t3.f90:55:1
        subpd   %xmm5, %xmm1
        .loc    1 53 1                          # t3.f90:53:1
        movupd  (%r12,%rcx,8), %xmm2
        mulpd   %xmm2, %xmm0
        .loc    1 54 1                          # t3.f90:54:1
        mulpd   %xmm2, %xmm3
        .loc    1 55 1                          # t3.f90:55:1
        mulpd   %xmm2, %xmm1
        movapd  %xmm0, %xmm2
        unpcklpd        %xmm3, %xmm2                    # xmm2 = xmm2[0],xmm3[0]
        unpckhpd        %xmm1, %xmm3                    # xmm3 = xmm3[1],xmm1[1]
        shufpd  $2, %xmm0, %xmm1                # xmm1 = xmm1[0],xmm0[1]
        movupd  %xmm1, 16(%r13,%rbp)
        movupd  %xmm3, 32(%r13,%rbp)
        movupd  %xmm2, (%r13,%rbp)
        addq    $2, %rcx
        addq    $48, %rbp
        cmpq    %rcx, %r14
        jne     .LBB2_5
```

As we can see, there are fewer index calculations and more compute instructions in the code generated by classic flang. Any idea why the code generated by LLVM flang contains so many index calculations?


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][openmp] inefficient code generation for openmp parallel loops? #168682

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[flang][openmp] inefficient code generation for openmp parallel loops? #168682

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions