Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PPC] Movemasks should be emulated without resorting to bit-by-bit logic #90554

Open
Validark opened this issue Apr 30, 2024 · 3 comments
Open

Comments

@Validark
Copy link

Godbolt link: https://zig.godbolt.org/z/MxTW9d6Kr

export fn maskForUnderscores(src: [*]const u8) u16 {
    return @bitCast(src[0..@sizeOf(Chunk)].* == @as(@Vector(@sizeOf(Chunk), u8), @splat('_')));
}

Gives:

maskForUnderscores:
        stwu 1, -48(1)
        lxv 34, 0(3)
        xxspltib 35, 95
        vcmpequb 2, 2, 3
        stxv 34, 16(1)
        lbz 3, 31(1)
        lbz 4, 30(1)
        clrlwi  3, 3, 31
        rlwimi 3, 4, 1, 30, 30
        lbz 4, 29(1)
        rlwimi 3, 4, 2, 29, 29
        lbz 4, 28(1)
        rlwimi 3, 4, 3, 28, 28
        lbz 4, 27(1)
        rlwimi 3, 4, 4, 27, 27
        lbz 4, 26(1)
        rlwimi 3, 4, 5, 26, 26
        lbz 4, 25(1)
        rlwimi 3, 4, 6, 25, 25
        lbz 4, 24(1)
        rlwimi 3, 4, 7, 24, 24
        lbz 4, 23(1)
        rlwimi 3, 4, 8, 23, 23
        lbz 4, 22(1)
        rlwimi 3, 4, 9, 22, 22
        lbz 4, 21(1)
        rlwimi 3, 4, 10, 21, 21
        lbz 4, 20(1)
        rlwimi 3, 4, 11, 20, 20
        lbz 4, 19(1)
        rlwimi 3, 4, 12, 19, 19
        lbz 4, 18(1)
        rlwimi 3, 4, 13, 18, 18
        lbz 4, 17(1)
        rlwimi 3, 4, 14, 17, 17
        lbz 4, 16(1)
        rlwimi 3, 4, 15, 16, 16
        addi 1, 1, 48
        blr

Here is one way of compiling a movemask on PowerPC: https://godbolt.org/z/shhcqE7G9

bar(float __vector(4)):
.LCF0:
0:      addis 2,12,.TOC.-.LCF0@ha
        addi 2,2,.TOC.-.LCF0@l
        addis 9,2,.LC0@toc@ha
        addi 9,9,.LC0@toc@l
        lvx 0,0,9
        vbpermq 2,2,0
        mfvsrd 3,34
        extsw 3,3
        blr
        .long 0
        .byte 0,9,0,0,0,0,0,0
.LC0:
        .byte   120
        .byte   112
        .byte   104
        .byte   96
        .byte   88
        .byte   80
        .byte   72
        .byte   64
        .byte   56
        .byte   48
        .byte   40
        .byte   32
        .byte   24
        .byte   16
        .byte   8
        .byte   0
@llvmbot
Copy link
Collaborator

llvmbot commented Apr 30, 2024

@llvm/issue-subscribers-backend-powerpc

Author: Niles Salter (Validark)

[Godbolt link](https://zig.godbolt.org/z/MxTW9d6Kr)

```zig
export fn maskForUnderscores(src: [*]const u8) u16 {
    return @bitCast(src[0..@sizeOf(Chunk)].* == @as(@Vector(@sizeOf(Chunk), u8), @splat('_')));
}
```

Gives:

maskForUnderscores:
        stwu 1, -48(1)
        lxv 34, 0(3)
        xxspltib 35, 95
        vcmpequb 2, 2, 3
        stxv 34, 16(1)
        lbz 3, 31(1)
        lbz 4, 30(1)
        clrlwi  3, 3, 31
        rlwimi 3, 4, 1, 30, 30
        lbz 4, 29(1)
        rlwimi 3, 4, 2, 29, 29
        lbz 4, 28(1)
        rlwimi 3, 4, 3, 28, 28
        lbz 4, 27(1)
        rlwimi 3, 4, 4, 27, 27
        lbz 4, 26(1)
        rlwimi 3, 4, 5, 26, 26
        lbz 4, 25(1)
        rlwimi 3, 4, 6, 25, 25
        lbz 4, 24(1)
        rlwimi 3, 4, 7, 24, 24
        lbz 4, 23(1)
        rlwimi 3, 4, 8, 23, 23
        lbz 4, 22(1)
        rlwimi 3, 4, 9, 22, 22
        lbz 4, 21(1)
        rlwimi 3, 4, 10, 21, 21
        lbz 4, 20(1)
        rlwimi 3, 4, 11, 20, 20
        lbz 4, 19(1)
        rlwimi 3, 4, 12, 19, 19
        lbz 4, 18(1)
        rlwimi 3, 4, 13, 18, 18
        lbz 4, 17(1)
        rlwimi 3, 4, 14, 17, 17
        lbz 4, 16(1)
        rlwimi 3, 4, 15, 16, 16
        addi 1, 1, 48
        blr

Here is one way of compiling a movemask on PowerPC: https://godbolt.org/z/shhcqE7G9

bar(float __vector(4)):
.LCF0:
0:      addis 2,12,.TOC.-.LCF0@ha
        addi 2,2,.TOC.-.LCF0@l
        addis 9,2,.LC0@toc@ha
        addi 9,9,.LC0@toc@l
        lvx 0,0,9
        vbpermq 2,2,0
        mfvsrd 3,34
        extsw 3,3
        blr
        .long 0
        .byte 0,9,0,0,0,0,0,0
.LC0:
        .byte   120
        .byte   112
        .byte   104
        .byte   96
        .byte   88
        .byte   80
        .byte   72
        .byte   64
        .byte   56
        .byte   48
        .byte   40
        .byte   32
        .byte   24
        .byte   16
        .byte   8
        .byte   0

@chenzheng1030
Copy link
Collaborator

Hi @Validark, could you please post the LLVM IR for the Zig source? Thanks.

@Validark
Copy link
Author

Validark commented May 1, 2024

@chenzheng1030 Here you go: https://godbolt.org/z/vG1d93K37

define dso_local zeroext i16 @maskForUnderscores(ptr nocapture nonnull readonly align 1 %0) local_unnamed_addr {
Entry:
  %1 = load <16 x i8>, ptr %0, align 1
  %2 = icmp eq <16 x i8> %1, <i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95>
  %3 = bitcast <16 x i1> %2 to i16
  ret i16 %3
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants