Skip to content

Scalar matrix multiply code can't recover from <float x 2> ABI restriction #60441

@RKSimon

Description

@RKSimon

https://godbolt.org/z/19564zseG

Starting from generic C++ based vector4/matrix4 style types:

struct Vector4 {
  float x, y, z, w;
};
struct Matrix4 {
    Vector4 mCol0, mCol1, mCol2, mCol3;
};

Passing Vector4 args by value results in them being converted to { <2 x float>, <2 x float> } types for the x64 ABI, and we never manage to recover from this, even if all calls are eventually inlined away. The matrix multiply code in the compiler explorer link above results in the following IR:

define void @mulme(ptr nocapture noundef nonnull readonly align 4 dereferenceable(64) %a, ptr nocapture noundef nonnull readonly align 4 dereferenceable(64) %b, ptr nocapture noundef nonnull writeonly align 4 dereferenceable(64) %c) {
entry:
  %aa.sroa.5.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 8
  %aa.sroa.7.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 16
  %aa.sroa.9.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 24
  %aa.sroa.11.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 32
  %aa.sroa.13.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 40
  %aa.sroa.15.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 48
  %aa.sroa.17.0.a.sroa_idx = getelementptr inbounds i8, ptr %a, i64 56
  %bb.sroa.0.0.copyload = load <2 x float>, ptr %b, align 4
  %bb.sroa.4.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 8
  %bb.sroa.4.0.copyload = load <2 x float>, ptr %bb.sroa.4.0.b.sroa_idx, align 4
  %bb.sroa.5.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 16
  %bb.sroa.5.0.copyload = load <2 x float>, ptr %bb.sroa.5.0.b.sroa_idx, align 4
  %bb.sroa.6.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 24
  %bb.sroa.6.0.copyload = load <2 x float>, ptr %bb.sroa.6.0.b.sroa_idx, align 4
  %bb.sroa.7.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 32
  %bb.sroa.7.0.copyload = load <2 x float>, ptr %bb.sroa.7.0.b.sroa_idx, align 4
  %bb.sroa.8.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 40
  %bb.sroa.8.0.copyload = load <2 x float>, ptr %bb.sroa.8.0.b.sroa_idx, align 4
  %bb.sroa.9.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 48
  %bb.sroa.9.0.copyload = load <2 x float>, ptr %bb.sroa.9.0.b.sroa_idx, align 4
  %bb.sroa.10.0.b.sroa_idx = getelementptr inbounds i8, ptr %b, i64 56
  %bb.sroa.10.0.copyload = load <2 x float>, ptr %bb.sroa.10.0.b.sroa_idx, align 4
  %0 = load <2 x float>, ptr %a, align 4
  %1 = load <2 x float>, ptr %aa.sroa.7.0.a.sroa_idx, align 4
  %2 = load <2 x float>, ptr %aa.sroa.11.0.a.sroa_idx, align 4
  %3 = load <2 x float>, ptr %aa.sroa.15.0.a.sroa_idx, align 4
  %4 = shufflevector <2 x float> %bb.sroa.0.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %5 = fmul fast <2 x float> %4, %0
  %6 = shufflevector <2 x float> %bb.sroa.0.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %7 = fmul fast <2 x float> %6, %1
  %8 = fadd fast <2 x float> %7, %5
  %9 = shufflevector <2 x float> %bb.sroa.4.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %10 = fmul fast <2 x float> %9, %2
  %11 = fadd fast <2 x float> %8, %10
  %12 = shufflevector <2 x float> %bb.sroa.4.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %13 = fmul fast <2 x float> %12, %3
  %14 = fadd fast <2 x float> %11, %13
  %15 = shufflevector <2 x float> %bb.sroa.5.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %16 = fmul fast <2 x float> %15, %0
  %17 = shufflevector <2 x float> %bb.sroa.5.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %18 = fmul fast <2 x float> %17, %1
  %19 = fadd fast <2 x float> %18, %16
  %20 = shufflevector <2 x float> %bb.sroa.6.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %21 = fmul fast <2 x float> %20, %2
  %22 = fadd fast <2 x float> %19, %21
  %23 = shufflevector <2 x float> %bb.sroa.6.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %24 = fmul fast <2 x float> %23, %3
  %25 = fadd fast <2 x float> %22, %24
  %26 = shufflevector <2 x float> %bb.sroa.7.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %27 = fmul fast <2 x float> %26, %0
  %28 = shufflevector <2 x float> %bb.sroa.7.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %29 = fmul fast <2 x float> %28, %1
  %30 = fadd fast <2 x float> %29, %27
  %31 = shufflevector <2 x float> %bb.sroa.8.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %32 = fmul fast <2 x float> %31, %2
  %33 = fadd fast <2 x float> %30, %32
  %34 = shufflevector <2 x float> %bb.sroa.8.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %35 = fmul fast <2 x float> %34, %3
  %36 = fadd fast <2 x float> %33, %35
  %37 = shufflevector <2 x float> %bb.sroa.9.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %38 = fmul fast <2 x float> %37, %0
  %39 = shufflevector <2 x float> %bb.sroa.9.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %40 = fmul fast <2 x float> %39, %1
  %41 = fadd fast <2 x float> %40, %38
  %42 = shufflevector <2 x float> %bb.sroa.10.0.copyload, <2 x float> poison, <2 x i32> zeroinitializer
  %43 = fmul fast <2 x float> %42, %2
  %44 = fadd fast <2 x float> %41, %43
  %45 = shufflevector <2 x float> %bb.sroa.10.0.copyload, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %46 = fmul fast <2 x float> %45, %3
  %47 = fadd fast <2 x float> %44, %46
  %48 = load <2 x float>, ptr %aa.sroa.5.0.a.sroa_idx, align 4
  %49 = load <2 x float>, ptr %aa.sroa.9.0.a.sroa_idx, align 4
  %50 = load <2 x float>, ptr %aa.sroa.13.0.a.sroa_idx, align 4
  %51 = load <2 x float>, ptr %aa.sroa.17.0.a.sroa_idx, align 4
  %52 = fmul fast <2 x float> %4, %48
  %53 = fmul fast <2 x float> %6, %49
  %54 = fadd fast <2 x float> %53, %52
  %55 = fmul fast <2 x float> %9, %50
  %56 = fadd fast <2 x float> %54, %55
  %57 = fmul fast <2 x float> %12, %51
  %58 = fadd fast <2 x float> %56, %57
  %59 = fmul fast <2 x float> %15, %48
  %60 = fmul fast <2 x float> %17, %49
  %61 = fadd fast <2 x float> %60, %59
  %62 = fmul fast <2 x float> %20, %50
  %63 = fadd fast <2 x float> %61, %62
  %64 = fmul fast <2 x float> %23, %51
  %65 = fadd fast <2 x float> %63, %64
  %66 = fmul fast <2 x float> %26, %48
  %67 = fmul fast <2 x float> %28, %49
  %68 = fadd fast <2 x float> %67, %66
  %69 = fmul fast <2 x float> %31, %50
  %70 = fadd fast <2 x float> %68, %69
  %71 = fmul fast <2 x float> %34, %51
  %72 = fadd fast <2 x float> %70, %71
  %73 = fmul fast <2 x float> %37, %48
  %74 = fmul fast <2 x float> %39, %49
  %75 = fadd fast <2 x float> %74, %73
  %76 = fmul fast <2 x float> %42, %50
  %77 = fadd fast <2 x float> %75, %76
  %78 = fmul fast <2 x float> %45, %51
  %79 = fadd fast <2 x float> %77, %78
  store <2 x float> %14, ptr %c, align 4
  %ref.tmp.sroa.4.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 8
  store <2 x float> %58, ptr %ref.tmp.sroa.4.0.c.sroa_idx, align 4
  %ref.tmp.sroa.5.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 16
  store <2 x float> %25, ptr %ref.tmp.sroa.5.0.c.sroa_idx, align 4
  %ref.tmp.sroa.6.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 24
  store <2 x float> %65, ptr %ref.tmp.sroa.6.0.c.sroa_idx, align 4
  %ref.tmp.sroa.7.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 32
  store <2 x float> %36, ptr %ref.tmp.sroa.7.0.c.sroa_idx, align 4
  %ref.tmp.sroa.8.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 40
  store <2 x float> %72, ptr %ref.tmp.sroa.8.0.c.sroa_idx, align 4
  %ref.tmp.sroa.9.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 48
  store <2 x float> %47, ptr %ref.tmp.sroa.9.0.c.sroa_idx, align 4
  %ref.tmp.sroa.10.0.c.sroa_idx = getelementptr inbounds i8, ptr %c, i64 56
  store <2 x float> %79, ptr %ref.tmp.sroa.10.0.c.sroa_idx, align 4
  ret void
}

While GCC manages to load/store and process the vectors and matrices in whole xmm/ymm/zmm, LLVM ends up with float2 sub-vectors and relies on the DAG to load/store combine them back together as best it can.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions