Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coalesce some writes in RespWriteUtils #197

Merged
merged 3 commits into microsoft:main from coalesce-stores
Apr 1, 2024

Conversation

PaulusParssinen
Copy link
Contributor

@PaulusParssinen PaulusParssinen commented Mar 29, 2024

Help the JIT coalesce some word- and dword-sized stores in RespWriteUtils.

Sample diffs

Garnet.common.RespWriteUtils:WriteNull
 ; Assembly listing for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Windows
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
+; 1 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T00] (  9,  6   )   byref  ->  rcx         single-def
+;  V00 arg0         [V00,T00] (  7,  5   )   byref  ->  rcx         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )    long  ->  rdx         single-def
-;  V02 loc0         [V02,T02] (  9,  4.50)    long  ->   r8        
-;  V03 OutArgs      [V03    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V04 cse0         [V04,T03] (  3,  2.50)    long  ->   r8         "CSE #01: aggressive"
+;  V02 loc0         [V02,T04] (  3,  1.50)    long  ->   r8         single-def
+;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;* V04 tmp1         [V04    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V05 tmp2         [V05    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V06 tmp3         [V06,T03] (  2,  2   )    long  ->  rax         "impAppendStmt"
+;* V07 tmp4         [V07    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V08 tmp5         [V08    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V04._reference (fldOffset=0x0)" P-INDEP
+;* V09 tmp6         [V09    ] (  0,  0   )     int  ->  zero-ref    single-def "field V04._length (fldOffset=0x8)" P-INDEP
+;* V10 tmp7         [V10    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V05._reference (fldOffset=0x0)" P-INDEP
+;* V11 tmp8         [V11    ] (  0,  0   )     int  ->  zero-ref    single-def "field V05._length (fldOffset=0x8)" P-INDEP
+;* V12 tmp9         [V12    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V07._reference (fldOffset=0x0)" P-INDEP
+;* V13 tmp10        [V13    ] (  0,  0   )     int  ->  zero-ref    "field V07._length (fldOffset=0x8)" P-INDEP
+;  V14 cse0         [V14,T02] (  3,  2.50)    long  ->   r8         "CSE #01: aggressive"
 ;
-; Lcl frame size = 40
+; Lcl frame size = 0
 
 G_M4163_IG01:  ;; offset=0x0000
-       sub      rsp, 40
-						;; size=4 bbWeight=1 PerfScore 0.25
-G_M4163_IG02:  ;; offset=0x0004
+						;; size=0 bbWeight=1 PerfScore 0.00
+G_M4163_IG02:  ;; offset=0x0000
        mov      r8, qword ptr [rcx]
        sub      rdx, r8
        cmp      edx, 5
        jge      SHORT G_M4163_IG05
 						;; size=11 bbWeight=1 PerfScore 3.50
-G_M4163_IG03:  ;; offset=0x000F
+G_M4163_IG03:  ;; offset=0x000B
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
-G_M4163_IG04:  ;; offset=0x0011
-       add      rsp, 40
+G_M4163_IG04:  ;; offset=0x000D
        ret      
-						;; size=5 bbWeight=0.50 PerfScore 0.62
-G_M4163_IG05:  ;; offset=0x0016
+						;; size=1 bbWeight=0.50 PerfScore 0.50
+G_M4163_IG05:  ;; offset=0x000E
        lea      rax, [r8+0x01]
        mov      qword ptr [rcx], rax
        mov      byte  ptr [r8], 36
-       mov      r8, qword ptr [rcx]
-       lea      rax, [r8+0x01]
-       mov      qword ptr [rcx], rax
-       mov      byte  ptr [r8], 45
-       mov      r8, qword ptr [rcx]
-       lea      rax, [r8+0x01]
-       mov      qword ptr [rcx], rax
-       mov      byte  ptr [r8], 49
-       call     [Garnet.common.RespWriteUtils:WriteNewline(byref)]
+       mov      rax, qword ptr [rcx]
+       mov      dword ptr [rax], 0xA0D312D
+       add      qword ptr [rcx], 4
        mov      eax, 1
-						;; size=50 bbWeight=0.50 PerfScore 7.38
-G_M4163_IG06:  ;; offset=0x0048
-       add      rsp, 40
+						;; size=29 bbWeight=0.50 PerfScore 4.38
+G_M4163_IG06:  ;; offset=0x002B
        ret      
-						;; size=5 bbWeight=0.50 PerfScore 0.62
+						;; size=1 bbWeight=0.50 PerfScore 0.50
 
-; Total bytes of code 77, prolog size 4, PerfScore 12.50, instruction count 23, allocated bytes for code 77 (MethodHash=bc12efbc) for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
+; Total bytes of code 44, prolog size 0, PerfScore 9.00, instruction count 14, allocated bytes for code 44 (MethodHash=bc12efbc) for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
 ; ============================================================
Garnet.common.RespWriteUtils:WriteIntegerAsBulkString
 ; Assembly listing for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Windows
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
+; 2 inlinees with PGO data; 8 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T02] (  5,  4.50)     int  ->  rsi         single-def
-;  V01 arg1         [V01,T00] ( 14,  8.50)   byref  ->  rbx         single-def
-;  V02 arg2         [V02,T03] (  3,  3   )    long  ->  rdi         single-def
-;  V03 loc0         [V03,T04] (  5,  4   )     int  ->  rbp         single-def
-;  V04 loc1         [V04,T05] (  4,  3.50)   ubyte  ->  r14         single-def
-;  V05 loc2         [V05,T06] (  3,  2.50)     int  ->  rdx         single-def
-;  V06 loc3         [V06,T01] ( 15,  7.50)    long  ->  r10        
+;  V00 arg0         [V00,T01] (  5,  4.50)     int  ->  rsi         single-def
+;  V01 arg1         [V01,T00] ( 12,  7.50)   byref  ->  rbx         single-def
+;  V02 arg2         [V02,T02] (  3,  3   )    long  ->  rdi         single-def
+;  V03 loc0         [V03,T03] (  4,  3.50)     int  ->  rbp         single-def
+;  V04 loc1         [V04,T04] (  3,  3   )   ubyte  ->  r14         single-def
+;  V05 loc2         [V05,T05] (  3,  2.50)     int  ->  rdx         single-def
+;  V06 loc3         [V06,T10] (  3,  1.50)    long  ->  rcx         single-def
 ;  V07 OutArgs      [V07    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V08 cse0         [V08,T07] (  3,  2.50)    long  ->  r10         "CSE #01: aggressive"
+;* V08 tmp1         [V08    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V09 tmp2         [V09    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V10 tmp3         [V10,T08] (  2,  2   )    long  ->  rcx         "impAppendStmt"
+;* V11 tmp4         [V11    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V12 tmp5         [V12    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V13 tmp6         [V13    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V14 tmp7         [V14,T09] (  2,  2   )    long  ->  rax         "impAppendStmt"
+;* V15 tmp8         [V15    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V16 tmp9         [V16    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V08._reference (fldOffset=0x0)" P-INDEP
+;* V17 tmp10        [V17    ] (  0,  0   )     int  ->  zero-ref    single-def "field V08._length (fldOffset=0x8)" P-INDEP
+;* V18 tmp11        [V18    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V09._reference (fldOffset=0x0)" P-INDEP
+;* V19 tmp12        [V19    ] (  0,  0   )     int  ->  zero-ref    single-def "field V09._length (fldOffset=0x8)" P-INDEP
+;* V20 tmp13        [V20    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V11._reference (fldOffset=0x0)" P-INDEP
+;* V21 tmp14        [V21    ] (  0,  0   )     int  ->  zero-ref    "field V11._length (fldOffset=0x8)" P-INDEP
+;* V22 tmp15        [V22    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V12._reference (fldOffset=0x0)" P-INDEP
+;* V23 tmp16        [V23    ] (  0,  0   )     int  ->  zero-ref    single-def "field V12._length (fldOffset=0x8)" P-INDEP
+;* V24 tmp17        [V24    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V13._reference (fldOffset=0x0)" P-INDEP
+;* V25 tmp18        [V25    ] (  0,  0   )     int  ->  zero-ref    single-def "field V13._length (fldOffset=0x8)" P-INDEP
+;* V26 tmp19        [V26    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V15._reference (fldOffset=0x0)" P-INDEP
+;* V27 tmp20        [V27    ] (  0,  0   )     int  ->  zero-ref    "field V15._length (fldOffset=0x8)" P-INDEP
+;  V28 cse0         [V28,T06] (  3,  2.50)    long  ->  rcx         "CSE #01: aggressive"
+;  V29 cse1         [V29,T07] (  3,  2.50)     int  ->  r15         "CSE #02: aggressive"
 ;
-; Lcl frame size = 32
+; Lcl frame size = 40
 
 G_M4598_IG01:  ;; offset=0x0000
+       push     r15
        push     r14
        push     rdi
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 32
+       sub      rsp, 40
        mov      esi, ecx
        mov      rbx, rdx
        mov      rdi, r8
-						;; size=18 bbWeight=1 PerfScore 6.00
-G_M4598_IG02:  ;; offset=0x0012
+						;; size=20 bbWeight=1 PerfScore 7.00
+G_M4598_IG02:  ;; offset=0x0014
        movsxd   rcx, esi
        call     [Garnet.common.NumUtils:NumDigitsInLong(long):int]
        mov      ebp, eax
        mov      r14d, esi
        shr      r14d, 31
-       lea      ecx, [r14+rbp]
+       lea      r15d, [r14+rbp]
+       mov      ecx, r15d
        call     [Garnet.common.NumUtils:NumDigits(int):int]
        mov      edx, eax
-       lea      eax, [rdx+r14]
-       lea      eax, [rax+rbp+0x05]
-       mov      r10, qword ptr [rbx]
-       sub      rdi, r10
+       add      r14d, edx
+       lea      eax, [r14+rbp+0x05]
+       mov      rcx, qword ptr [rbx]
+       sub      rdi, rcx
        cmp      eax, edi
        jle      SHORT G_M4598_IG05
-						;; size=48 bbWeight=1 PerfScore 13.00
-G_M4598_IG03:  ;; offset=0x0042
+						;; size=51 bbWeight=1 PerfScore 13.00
+G_M4598_IG03:  ;; offset=0x0047
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
-G_M4598_IG04:  ;; offset=0x0044
-       add      rsp, 32
+G_M4598_IG04:  ;; offset=0x0049
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
+       pop      r15
        ret      
-						;; size=11 bbWeight=0.50 PerfScore 1.88
-G_M4598_IG05:  ;; offset=0x004F
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 36
-       lea      ecx, [r14+rbp]
+						;; size=13 bbWeight=0.50 PerfScore 2.12
+G_M4598_IG05:  ;; offset=0x0056
+       lea      r8, [rcx+0x01]
+       mov      qword ptr [rbx], r8
+       mov      byte  ptr [rcx], 36
+       mov      ecx, r15d
        mov      r8, rbx
        call     [Garnet.common.NumUtils:IntToBytes(int,int,byref)]
-       mov      r10, qword ptr [rbx]
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 13
-       mov      r10, qword ptr [rbx]
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 10
+       mov      rcx, qword ptr [rbx]
+       mov      word  ptr [rcx], 0xA0D
+       add      qword ptr [rbx], 2
        mov      ecx, esi
        mov      edx, ebp
        mov      r8, rbx
        call     [Garnet.common.NumUtils:IntToBytes(int,int,byref)]
-       mov      r10, qword ptr [rbx]
-       lea      rax, [r10+0x01]
-       mov      qword ptr [rbx], rax
-       mov      byte  ptr [r10], 13
-       mov      r10, qword ptr [rbx]
-       lea      rax, [r10+0x01]
-       mov      qword ptr [rbx], rax
-       mov      byte  ptr [r10], 10
+       mov      rax, qword ptr [rbx]
+       mov      word  ptr [rax], 0xA0D
+       add      qword ptr [rbx], 2
        mov      eax, 1
-						;; size=98 bbWeight=0.50 PerfScore 14.12
-G_M4598_IG06:  ;; offset=0x00B1
-       add      rsp, 32
+						;; size=64 bbWeight=0.50 PerfScore 11.00
+G_M4598_IG06:  ;; offset=0x0096
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
+       pop      r15
        ret      
-						;; size=11 bbWeight=0.50 PerfScore 1.88
+						;; size=13 bbWeight=0.50 PerfScore 2.12
 
-; Total bytes of code 188, prolog size 10, PerfScore 37.00, instruction count 65, allocated bytes for code 188 (MethodHash=cb87ee09) for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
+; Total bytes of code 163, prolog size 12, PerfScore 35.38, instruction count 59, allocated bytes for code 163 (MethodHash=cb87ee09) for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
+; ============================================================

@PaulusParssinen PaulusParssinen changed the title Coalesce some writes in RespWriteUtils.cs Coalesce some writes in RespWriteUtils Mar 29, 2024
@badrishc
Copy link
Contributor

Very cool!!

@badrishc badrishc merged commit 484c576 into microsoft:main Apr 1, 2024
21 checks passed
@PaulusParssinen PaulusParssinen deleted the coalesce-stores branch April 1, 2024 10:48
@github-actions github-actions bot locked and limited conversation to collaborators Jun 1, 2024
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants