Skip to content

[flang][openmp] inefficient code generation for openmp parallel loops? #168682

@shivaramaarao

Description

@shivaramaarao

Consider the following OpenMP program (reduced from 362.fma3d in the SPEC OMP2012 suite):

! Shared data for the reproducer: three derived types made up solely of
! double-precision components, one allocatable rank-1 array of each type,
! and two default integers used by SUB as loop index and loop bound.
module submod
      ! One double-precision component; read as the common factor in SUB.
      TYPE :: NT
        REAL(KIND(0D0))  Mi
      END TYPE

      ! Three double-precision components; these are the values SUB writes.
      TYPE :: MT
        REAL(KIND(0D0))  Ax
        REAL(KIND(0D0))  Ay
        REAL(KIND(0D0))  Az
      END TYPE

      ! Six double-precision components; SUB reads them in pairs
      ! (Xe-Xi, Ye-Yi, Ze-Zi).
      TYPE :: FT
        REAL(KIND(0D0))  Xi
        REAL(KIND(0D0))  Yi
        REAL(KIND(0D0))  Zi
        REAL(KIND(0D0))  Xe
        REAL(KIND(0D0))  Ye
        REAL(KIND(0D0))  Ze
      END TYPE

      ! Module-level allocatable arrays. Nothing in this snippet allocates
      ! them -- presumably the caller does so before invoking SUB (TODO confirm).
      TYPE (FT), DIMENSION(:), ALLOCATABLE :: FF
      TYPE (MT), DIMENSION(:), ALLOCATABLE :: MM
      TYPE (NT), DIMENSION(:), ALLOCATABLE :: NN
      ! N is the loop index used in SUB; NR is the loop's upper bound.
      INTEGER :: N, NR
end module

! For every index N in 1..NR, store into MM(N) the product of NN(N)%Mi with
! the component-wise differences (Xe-Xi, Ye-Yi, Ze-Zi) of FF(N), then print
! the whole MM array with list-directed output.
! All data comes from module submod; the subroutine takes no arguments.
! NOTE(review): MM, NN and FF are not allocated anywhere in this snippet --
! presumably they are allocated with at least NR elements before the call.
SUBROUTINE SUB()
USE submod
! The loop index N is a module variable; it is listed PRIVATE on the
! directive below, while everything else defaults to SHARED.
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(N)
      DO N = 1,NR
        MM(N)%Ax = NN(N)%Mi * (FF(N)%Xe-FF(N)%Xi)
        MM(N)%Ay = NN(N)%Mi * (FF(N)%Ye-FF(N)%Yi)
        MM(N)%Az = NN(N)%Mi * (FF(N)%Ze-FF(N)%Zi)
      ENDDO
!$OMP END PARALLEL DO
      ! Printing MM uses the results after the parallel loop has finished.
      PRINT *, MM

END SUBROUTINE

The x86 code generated for the OpenMP loop (shown below) is suboptimal.
$ flang -O3 -fopenmp omp_p.f90

.LBB1_2:
         movslq  %r10d, %r11
         movq    %r11, %r14
         movq    %r11, %r15
         leal    1(%r11), %r10d
         movq    %r11, %r12
         subq    %rdx, %r12
         movsd   (%rcx,%r12,8), %xmm0
         subq    %rdi, %r14
         leaq    (%r14,%r14,2), %r14
         shlq    $4, %r14
         subq    %r9, %r15
         leaq    (%r15,%r15,2), %r15
         movupd  (%rsi,%r14), %xmm1
         movupd  24(%rsi,%r14), %xmm2
         subpd   %xmm1, %xmm2
         movapd  %xmm0, %xmm1
         unpcklpd        %xmm0, %xmm1
         mulpd   %xmm2, %xmm1
         movupd  %xmm1, (%r8,%r15,8)
         movsd   40(%rsi,%r14), %xmm1
         subsd   16(%rsi,%r14), %xmm1
         addl    %eax, %r11d
         incl    %r11d
         mulsd   %xmm0, %xmm1
         movsd   %xmm1, 16(%r8,%r15,8)
         cmpl    $2, %r11d
         jne     .LBB1_2

The generated code contains many index-calculation instructions. When -fopenmp is not used, the generated code does not have nearly as many index calculations.

By comparison, the code generated by classic flang (shown below) is better.

.LBB2_5:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        .loc    1 53 1 is_stmt 1                # t3.f90:53:1
        movupd  80(%r15,%rbp,2), %xmm2
        movupd  64(%r15,%rbp,2), %xmm3
        movupd  (%r15,%rbp,2), %xmm4
        movupd  16(%r15,%rbp,2), %xmm0
        movupd  32(%r15,%rbp,2), %xmm1
        movupd  48(%r15,%rbp,2), %xmm5
        movapd  %xmm4, %xmm6
        unpcklpd        %xmm5, %xmm6                    # xmm6 = xmm6[0],xmm5[0]
        unpckhpd        %xmm5, %xmm4                    # xmm4 = xmm4[1],xmm5[1]
        movapd  %xmm0, %xmm5
        unpcklpd        %xmm3, %xmm5                    # xmm5 = xmm5[0],xmm3[0]
        unpckhpd        %xmm3, %xmm0                    # xmm0 = xmm0[1],xmm3[1]
        subpd   %xmm6, %xmm0
        movapd  %xmm1, %xmm3
        unpcklpd        %xmm2, %xmm3                    # xmm3 = xmm3[0],xmm2[0]
        .loc    1 54 1                          # t3.f90:54:1
        subpd   %xmm4, %xmm3
        .loc    1 53 1                          # t3.f90:53:1
        unpckhpd        %xmm2, %xmm1                    # xmm1 = xmm1[1],xmm2[1]
        .loc    1 55 1                          # t3.f90:55:1
        subpd   %xmm5, %xmm1
        .loc    1 53 1                          # t3.f90:53:1
        movupd  (%r12,%rcx,8), %xmm2
        mulpd   %xmm2, %xmm0
        .loc    1 54 1                          # t3.f90:54:1
        mulpd   %xmm2, %xmm3
        .loc    1 55 1                          # t3.f90:55:1
        mulpd   %xmm2, %xmm1
        movapd  %xmm0, %xmm2
        unpcklpd        %xmm3, %xmm2                    # xmm2 = xmm2[0],xmm3[0]
        unpckhpd        %xmm1, %xmm3                    # xmm3 = xmm3[1],xmm1[1]
        shufpd  $2, %xmm0, %xmm1                # xmm1 = xmm1[0],xmm0[1]
        movupd  %xmm1, 16(%r13,%rbp)
        movupd  %xmm3, 32(%r13,%rbp)
        movupd  %xmm2, (%r13,%rbp)
        addq    $2, %rcx
        addq    $48, %rbp
        cmpq    %rcx, %r14
        jne     .LBB2_5

As we can see, the code generated by classic flang has fewer index calculations and more compute instructions. Any idea why the code generated by LLVM flang contains so many index calculations?

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions