Skip to content

Commit

Permalink
Slightly faster prefetching perf.
Browse files Browse the repository at this point in the history
  • Loading branch information
paboyle committed Jun 13, 2016
1 parent 55f65b8 commit 87418e7
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 11 deletions.
3 changes: 3 additions & 0 deletions lib/qcd/action/fermion/WilsonKernelsAsmBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,10 @@
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;

PREFETCH_CHIMU(basex);
SAVE_RESULT(&out._odata[ss]);


ss++;
}
sU++;
Expand Down
45 changes: 34 additions & 11 deletions lib/simd/Intel512wilson.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) );

//define PREFETCH_CHIMU(A)
/*
 * PREFETCH_CHIMU(A)
 *
 * Expands to two pieces, in order:
 *   1. LOAD64(%r9,A)   - hands A to the LOAD64 helper together with %r9.
 *   2. One __asm__ block made of twelve VPREFETCHG operations, all naming
 *      %r9, with first arguments 12, 13, ..., 23.
 *
 * The expansion already ends in ");", so a call site written as
 * "PREFETCH_CHIMU(x);" produces an extra empty statement.
 *
 * NOTE(review): LOAD64 and VPREFETCHG are defined outside this hunk.
 *   - Presumably LOAD64 moves the pointer A into %r9 and VPREFETCHG(n,%r9)
 *     prefetches at an offset derived from n relative to %r9; the unit of n
 *     (bytes / vectors / cache lines) and the target cache level are not
 *     visible here - confirm against their definitions.
 *   - %r9 is set up in a separate statement from the __asm__ block that
 *     uses it; verify that nothing can reuse %r9 between the two.
 *   - The 12..23 range is hard-coded; confirm it matches the layout of the
 *     object A points at.
 */
#define PREFETCH_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCHG(12,%r9)\
VPREFETCHG(13,%r9)\
VPREFETCHG(14,%r9)\
VPREFETCHG(15,%r9)\
VPREFETCHG(16,%r9)\
VPREFETCHG(17,%r9)\
VPREFETCHG(18,%r9)\
VPREFETCHG(19,%r9)\
VPREFETCHG(20,%r9)\
VPREFETCHG(21,%r9)\
VPREFETCHG(22,%r9)\
VPREFETCHG(23,%r9));

#define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_00,Chi_00) \
Expand Down Expand Up @@ -598,7 +612,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) \
VPREFETCH2(9,%r8) \
VPREFETCH2(10,%r8) \
VPREFETCH2(11,%r8) \
VPREFETCH2(12,%r8) \
VPREFETCH2(13,%r8) \
VPREFETCH2(14,%r8) \
VPREFETCH2(15,%r8) \
VPREFETCH2(16,%r8) \
VPREFETCH2(17,%r8) \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
Expand Down Expand Up @@ -650,6 +673,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
VPREFETCHG(9,%r8) \
VPREFETCHG(10,%r8) \
VPREFETCHG(11,%r8) \
VPREFETCHG(12,%r8) \
VPREFETCHG(13,%r8) \
VPREFETCHG(14,%r8) \
VPREFETCHG(15,%r8) \
VPREFETCHG(16,%r8) \
VPREFETCHG(17,%r8) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
Expand All @@ -668,15 +700,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
VPREFETCHG(9,%r8) \
VPREFETCHG(10,%r8) \
VPREFETCHG(11,%r8) \
VPREFETCHG(12,%r8) \
VPREFETCHG(13,%r8) \
VPREFETCHG(14,%r8) \
VPREFETCHG(15,%r8) \
VPREFETCHG(16,%r8) \
VPREFETCHG(17,%r8) \
/*61 insns*/ );


Expand Down

0 comments on commit 87418e7

Please sign in to comment.