-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Open
Labels
Description
Consider the following OpenMP program (reduced from 362.fma3d in the OMP2012 suite):
! Module holding the derived types and the module-level data used by
! subroutine SUB in this reduced reproducer.
module submod
! NT: one double-precision component (Mi), used as the multiplier in SUB.
TYPE :: NT
REAL(KIND(0D0)) Mi
END TYPE
! MT: three double-precision components (Ax, Ay, Az); these are the
! values assigned inside the loop in SUB.
TYPE :: MT
REAL(KIND(0D0)) Ax
REAL(KIND(0D0)) Ay
REAL(KIND(0D0)) Az
END TYPE
! FT: six double-precision components. SUB reads them pairwise as the
! differences Xe-Xi, Ye-Yi and Ze-Zi.
TYPE :: FT
REAL(KIND(0D0)) Xi
REAL(KIND(0D0)) Yi
REAL(KIND(0D0)) Zi
REAL(KIND(0D0)) Xe
REAL(KIND(0D0)) Ye
REAL(KIND(0D0)) Ze
END TYPE
! Rank-1 allocatable arrays of each derived type. No ALLOCATE statement
! appears in this snippet.
TYPE (FT), DIMENSION(:), ALLOCATABLE :: FF
TYPE (MT), DIMENSION(:), ALLOCATABLE :: MM
TYPE (NT), DIMENSION(:), ALLOCATABLE :: NN
! N is used as the loop index and NR as the loop upper bound in SUB.
INTEGER :: N, NR
end module
! For each N in 1..NR, sets the three components of MM(N) to NN(N)%Mi
! multiplied by the matching component difference of FF(N), using an
! OpenMP parallel loop, and then prints MM.
! All data (MM, NN, FF, N, NR) comes from module submod; the subroutine
! takes no arguments.
SUBROUTINE SUB()
USE submod
! The loop index N is a module variable; it is explicitly listed as
! PRIVATE while everything else defaults to SHARED.
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(N)
DO N = 1,NR
! Each iteration reads and writes only element N of the arrays.
MM(N)%Ax = NN(N)%Mi * (FF(N)%Xe-FF(N)%Xi)
MM(N)%Ay = NN(N)%Mi * (FF(N)%Ye-FF(N)%Yi)
MM(N)%Az = NN(N)%Mi * (FF(N)%Ze-FF(N)%Zi)
ENDDO
!$OMP END PARALLEL DO
! List-directed output of the whole MM array after the parallel loop.
PRINT *, MM
END SUBROUTINE
The x86 code generation (shown below) for the OpenMP loop is suboptimal.
$ flang -O3 -fopenmp omp_p.f90
.LBB1_2:
movslq %r10d, %r11
movq %r11, %r14
movq %r11, %r15
leal 1(%r11), %r10d
movq %r11, %r12
subq %rdx, %r12
movsd (%rcx,%r12,8), %xmm0
subq %rdi, %r14
leaq (%r14,%r14,2), %r14
shlq $4, %r14
subq %r9, %r15
leaq (%r15,%r15,2), %r15
movupd (%rsi,%r14), %xmm1
movupd 24(%rsi,%r14), %xmm2
subpd %xmm1, %xmm2
movapd %xmm0, %xmm1
unpcklpd %xmm0, %xmm1
mulpd %xmm2, %xmm1
movupd %xmm1, (%r8,%r15,8)
movsd 40(%rsi,%r14), %xmm1
subsd 16(%rsi,%r14), %xmm1
addl %eax, %r11d
incl %r11d
mulsd %xmm0, %xmm1
movsd %xmm1, 16(%r8,%r15,8)
cmpl $2, %r11d
jne .LBB1_2
There are many index calculation expressions in the generated code. When -fopenmp is not used, the generated code doesn't have as many index calculations.
Compared to the above, the code generated by classic Flang is better; it is shown below.
.LBB2_5: # %vector.body
# =>This Inner Loop Header: Depth=1
.loc 1 53 1 is_stmt 1 # t3.f90:53:1
movupd 80(%r15,%rbp,2), %xmm2
movupd 64(%r15,%rbp,2), %xmm3
movupd (%r15,%rbp,2), %xmm4
movupd 16(%r15,%rbp,2), %xmm0
movupd 32(%r15,%rbp,2), %xmm1
movupd 48(%r15,%rbp,2), %xmm5
movapd %xmm4, %xmm6
unpcklpd %xmm5, %xmm6 # xmm6 = xmm6[0],xmm5[0]
unpckhpd %xmm5, %xmm4 # xmm4 = xmm4[1],xmm5[1]
movapd %xmm0, %xmm5
unpcklpd %xmm3, %xmm5 # xmm5 = xmm5[0],xmm3[0]
unpckhpd %xmm3, %xmm0 # xmm0 = xmm0[1],xmm3[1]
subpd %xmm6, %xmm0
movapd %xmm1, %xmm3
unpcklpd %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0]
.loc 1 54 1 # t3.f90:54:1
subpd %xmm4, %xmm3
.loc 1 53 1 # t3.f90:53:1
unpckhpd %xmm2, %xmm1 # xmm1 = xmm1[1],xmm2[1]
.loc 1 55 1 # t3.f90:55:1
subpd %xmm5, %xmm1
.loc 1 53 1 # t3.f90:53:1
movupd (%r12,%rcx,8), %xmm2
mulpd %xmm2, %xmm0
.loc 1 54 1 # t3.f90:54:1
mulpd %xmm2, %xmm3
.loc 1 55 1 # t3.f90:55:1
mulpd %xmm2, %xmm1
movapd %xmm0, %xmm2
unpcklpd %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0]
unpckhpd %xmm1, %xmm3 # xmm3 = xmm3[1],xmm1[1]
shufpd $2, %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[1]
movupd %xmm1, 16(%r13,%rbp)
movupd %xmm3, 32(%r13,%rbp)
movupd %xmm2, (%r13,%rbp)
addq $2, %rcx
addq $48, %rbp
cmpq %rcx, %r14
jne .LBB2_5
As we can see, there are fewer index calculations and more compute instructions in the code generated by classic Flang. Any idea why there are so many index calculations in the code generated by LLVM Flang?