137 changes: 64 additions & 73 deletions flang/test/HLFIR/maxloc-lowering.fir

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions flang/test/HLFIR/maxval-lowering.fir
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func.func @_QPmaxval2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>>
// CHECK: %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
// CHECK: %[[ARG2:.*]]: !fir.ref<index>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]]
// CHECK-DAG: %[[DIM_VAR:.*]]:2 = hlfir.declare %[[ARG2]]
Expand All @@ -63,7 +64,6 @@ func.func @_QPmaxval2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"
// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1
// TODO: fix alias analysis in hlfir.assign bufferization
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down Expand Up @@ -190,6 +190,7 @@ func.func @_QPmaxval6(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_n
// CHECK-LABEL: func.func @_QPmaxval6(
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>>
// CHECK: %[[ARG1:.*]]: !fir.boxchar<1>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[UNBOXED:.*]]:2 = fir.unboxchar %[[ARG1]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[UNBOXED]]#0 typeparams %[[UNBOXED]]#1
Expand All @@ -210,7 +211,6 @@ func.func @_QPmaxval6(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_n
// CHECK: %[[BOX_ELESIZE:.*]] = fir.box_elesize %[[RET]]
// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]]
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]] typeparams %[[BOX_ELESIZE]] {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.boxchar<1>, i1) -> !hlfir.expr<!fir.char<1,?>>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down
137 changes: 64 additions & 73 deletions flang/test/HLFIR/minloc-lowering.fir

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions flang/test/HLFIR/minval-lowering.fir
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func.func @_QPminval2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>>
// CHECK: %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
// CHECK: %[[ARG2:.*]]: !fir.ref<index>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]]
// CHECK-DAG: %[[DIM_VAR:.*]]:2 = hlfir.declare %[[ARG2]]
Expand All @@ -63,7 +64,6 @@ func.func @_QPminval2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"
// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1
// TODO: fix alias analysis in hlfir.assign bufferization
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down Expand Up @@ -151,6 +151,7 @@ func.func @_QPminval5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"})
}
// CHECK-LABEL: func.func @_QPminval5(
// CHECK: %[[ARG0:.*]]: !fir.ref<!fir.array<2xi32>>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
// CHECK-DAG: %[[RET_ADDR:.*]] = fir.zero_bits !fir.heap<!fir.array<?xi32>>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
Expand All @@ -161,7 +162,6 @@ func.func @_QPminval5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"})
// CHECK-DAG: %[[RES_VAR:.*]] = hlfir.declare %[[ARG0]](%[[RES_SHAPE:.*]])

// CHECK-DAG: %[[MASK_ALLOC:.*]] = fir.alloca !fir.logical<4>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[MASK_VAL:.*]] = fir.convert %[[TRUE]] : (i1) -> !fir.logical<4>
// CHECK-DAG: fir.store %[[MASK_VAL]] to %[[MASK_ALLOC]] : !fir.ref<!fir.logical<4>>
// CHECK-DAG: %[[MASK_BOX:.*]] = fir.embox %[[MASK_ALLOC]]
Expand Down Expand Up @@ -190,6 +190,7 @@ func.func @_QPminval6(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_n
// CHECK-LABEL: func.func @_QPminval6(
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>>
// CHECK: %[[ARG1:.*]]: !fir.boxchar<1>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[UNBOXED:.*]]:2 = fir.unboxchar %[[ARG1]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[UNBOXED]]#0 typeparams %[[UNBOXED]]#1
Expand All @@ -210,7 +211,6 @@ func.func @_QPminval6(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_n
// CHECK: %[[BOX_ELESIZE:.*]] = fir.box_elesize %[[RET]]
// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]]
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]] typeparams %[[BOX_ELESIZE]] {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.boxchar<1>, i1) -> !hlfir.expr<!fir.char<1,?>>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down
27 changes: 11 additions & 16 deletions flang/test/HLFIR/mul_transpose.f90
Original file line number Diff line number Diff line change
Expand Up @@ -35,24 +35,22 @@ subroutine mul_transpose(a, b, res)
! CHECK-LOWERING: %[[TRANSPOSE_RES_LD:.*]] = fir.load %[[TRANSPOSE_RES_BOX:.*]]
! CHECK-LOWERING: %[[TRANSPOSE_RES_ADDR:.*]] = fir.box_addr %[[TRANSPOSE_RES_LD]]
! CHECK-LOWERING: %[[TRANSPOSE_RES_VAR:.*]]:2 = hlfir.declare %[[TRANSPOSE_RES_ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"}
! CHECK-LOWERING: %[[TRUE:.*]] = arith.constant true
! CHECK-LOWERING: %[[TRANSPOSE_EXPR:.*]] = hlfir.as_expr %[[TRANSPOSE_RES_VAR]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>
! CHECK-LOWERING: %[[TRANSPOSE_EXPR:.*]] = hlfir.as_expr %[[TRANSPOSE_RES_VAR]]#0 move {{.*}} : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>
! CHECK-LOWERING: %[[TRANSPOSE_ASSOC:.*]]:3 = hlfir.associate %[[TRANSPOSE_EXPR]]({{.*}}) {adapt.valuebyref}
! CHECK-LOWERING: (!hlfir.expr<?x?xf32>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>, i1)
! CHECK-LOWERING: (!hlfir.expr<?x?xf32>, !fir.shape<2>) -> (!fir.ref<!fir.array<1x2xf32>>, !fir.ref<!fir.array<1x2xf32>>, i1)

! CHECK-LOWERING: %[[LHS_BOX:.*]] = fir.embox %[[TRANSPOSE_ASSOC]]#1
! CHECK-LOWERING: %[[B_BOX:.*]] = fir.embox %[[B_DECL]]#1(%{{.*}})
! CHECK-LOWERING: %[[MUL_CONV_RES:.*]] = fir.convert %[[MUL_RES_BOX:.*]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
! CHECK-LOWERING: %[[LHS_CONV:.*]] = fir.convert %[[LHS_BOX]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
! CHECK-LOWERING: %[[LHS_CONV:.*]] = fir.convert %[[LHS_BOX]] : (!fir.box<!fir.array<1x2xf32>>) -> !fir.box<none>
! CHECK-LOWERING: %[[B_BOX_CONV:.*]] = fir.convert %[[B_BOX]] : (!fir.box<!fir.array<2x2xf32>>) -> !fir.box<none>
! CHECK-LOWERING: fir.call @_FortranAMatmul(%[[MUL_CONV_RES]], %[[LHS_CONV]], %[[B_BOX_CONV]], %[[LOC_STR2:.*]], %[[LOC_N2:.*]])
! CHECK-LOWERING: %[[MUL_RES_LD:.*]] = fir.load %[[MUL_RES_BOX:.*]]
! CHECK-LOWERING: %[[MUL_RES_ADDR:.*]] = fir.box_addr %[[MUL_RES_LD]]
! CHECK-LOWERING: %[[MUL_RES_VAR:.*]]:2 = hlfir.declare %[[MUL_RES_ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"}
! CHECK-LOWERING: %[[TRUE2:.*]] = arith.constant true
! CHECK-LOWERING: %[[MUL_EXPR:.*]] = hlfir.as_expr %[[MUL_RES_VAR]]#0 move %[[TRUE2]] : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>
! CHECK-LOWERING: %[[MUL_EXPR:.*]] = hlfir.as_expr %[[MUL_RES_VAR]]#0 move {{.*}} : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>

! CHECK-LOWERING: hlfir.end_associate %[[TRANSPOSE_ASSOC]]#1, %[[TRANSPOSE_ASSOC]]#2 : !fir.ref<!fir.array<?x?xf32>>, i1
! CHECK-LOWERING: hlfir.end_associate %[[TRANSPOSE_ASSOC]]#1, %[[TRANSPOSE_ASSOC]]#2 : !fir.ref<!fir.array<1x2xf32>>, i1
! CHECK-LOWERING-NEXT: hlfir.assign %[[MUL_EXPR]] to %[[RES_DECL]]#0 : !hlfir.expr<?x?xf32>, !fir.ref<!fir.array<1x2xf32>>
! CHECK-LOWERING-NEXT: hlfir.destroy %[[MUL_EXPR]]
! CHECK-LOWERING-NEXT: hlfir.destroy %[[TRANSPOSE_EXPR]]
Expand All @@ -66,8 +64,7 @@ subroutine mul_transpose(a, b, res)
! CHECK-LOWERING-OPT: %[[MUL_RES_LD:.*]] = fir.load %[[MUL_RES_BOX:.*]]
! CHECK-LOWERING-OPT: %[[MUL_RES_ADDR:.*]] = fir.box_addr %[[MUL_RES_LD]]
! CHECK-LOWERING-OPT: %[[MUL_RES_VAR:.*]]:2 = hlfir.declare %[[MUL_RES_ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"}
! CHECK-LOWERING-OPT: %[[TRUE2:.*]] = arith.constant true
! CHECK-LOWERING-OPT: %[[MUL_EXPR:.*]] = hlfir.as_expr %[[MUL_RES_VAR]]#0 move %[[TRUE2]] : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>
! CHECK-LOWERING-OPT: %[[MUL_EXPR:.*]] = hlfir.as_expr %[[MUL_RES_VAR]]#0 move {{.*}} : (!fir.box<!fir.array<?x?xf32>>, i1) -> !hlfir.expr<?x?xf32>
! CHECK-LOWERING-OPT: hlfir.assign %[[MUL_EXPR]] to %[[RES_DECL]]#0 : !hlfir.expr<?x?xf32>, !fir.ref<!fir.array<1x2xf32>>
! CHECK-LOWERING-OPT: hlfir.destroy %[[MUL_EXPR]]

Expand All @@ -76,25 +73,23 @@ subroutine mul_transpose(a, b, res)
! CHECK-BUFFERING: %[[TRANSPOSE_RES_LD:.*]] = fir.load %[[TRANSPOSE_RES_BOX:.*]]
! CHECK-BUFFERING: %[[TRANSPOSE_RES_ADDR:.*]] = fir.box_addr %[[TRANSPOSE_RES_LD]]
! CHECK-BUFFERING: %[[TRANSPOSE_RES_VAR:.*]]:2 = hlfir.declare %[[TRANSPOSE_RES_ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"}
! CHECK-BUFFERING: %[[TRUE:.*]] = arith.constant true
! CHECK-BUFFERING: %[[TUPLE0:.*]] = fir.undefined tuple<!fir.box<!fir.array<?x?xf32>>, i1>
! CHECK-BUFFERING: %[[TUPLE1:.*]] = fir.insert_value %[[TUPLE0]], %[[TRUE]], [1 : index]
! CHECK-BUFFERING: %[[TUPLE1:.*]] = fir.insert_value %[[TUPLE0]], {{.*}}, [1 : index]
! CHECK-BUFFERING: %[[TUPLE2:.*]] = fir.insert_value %[[TUPLE1]], %[[TRANSPOSE_RES_VAR]]#0, [0 : index]

! CHECK-BUFFERING: %[[TRANSPOSE_RES_REF:.*]] = fir.convert %[[TRANSPOSE_RES_VAR]]#1 : (!fir.heap<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
! CHECK-BUFFERING: %[[TRANSPOSE_RES_REF:.*]] = fir.convert %[[TRANSPOSE_RES_VAR]]#1 : (!fir.heap<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<1x2xf32>>
! CHECK-BUFFERING: %[[TRANSPOSE_RES_BOX:.*]] = fir.embox %[[TRANSPOSE_RES_REF]]({{.*}})
! CHECK-BUFFERING: %[[LHS_CONV:.*]] = fir.convert %[[TRANSPOSE_RES_BOX]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
! CHECK-BUFFERING: %[[LHS_CONV:.*]] = fir.convert %[[TRANSPOSE_RES_BOX]] : (!fir.box<!fir.array<1x2xf32>>) -> !fir.box<none>
! [argument handling unchanged]
! CHECK-BUFFERING: fir.call @_FortranAMatmul(
! CHECK-BUFFERING: %[[MUL_RES_LD:.*]] = fir.load %[[MUL_RES_BOX:.*]]
! CHECK-BUFFERING: %[[MUL_RES_ADDR:.*]] = fir.box_addr %[[MUL_RES_LD]]
! CHECK-BUFFERING: %[[MUL_RES_VAR:.*]]:2 = hlfir.declare %[[MUL_RES_ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"}
! CHECK-BUFFERING: %[[TRUE2:.*]] = arith.constant true
! CHECK-BUFFERING: %[[TUPLE3:.*]] = fir.undefined tuple<!fir.box<!fir.array<?x?xf32>>, i1>
! CHECK-BUFFERING: %[[TUPLE4:.*]] = fir.insert_value %[[TUPLE3]], %[[TRUE2]], [1 : index]
! CHECK-BUFFERING: %[[TUPLE4:.*]] = fir.insert_value %[[TUPLE3]], {{.*}}, [1 : index]
! CHECK-BUFFERING: %[[TUPLE5:.*]] = fir.insert_value %[[TUPLE4]], %[[MUL_RES_VAR]]#0, [0 : index]

! CHECK-BUFFERING: %[[TRANSPOSE_RES_HEAP:.*]] = fir.convert %[[TRANSPOSE_RES_REF]] : (!fir.ref<!fir.array<?x?xf32>>) -> !fir.heap<!fir.array<?x?xf32>>
! CHECK-BUFFERING: %[[TRANSPOSE_RES_HEAP:.*]] = fir.convert %[[TRANSPOSE_RES_REF]] : (!fir.ref<!fir.array<1x2xf32>>) -> !fir.heap<!fir.array<1x2xf32>>
! CHECK-BUFFERING-NEXT: fir.freemem %[[TRANSPOSE_RES_HEAP]]
! CHECK-BUFFERING-NEXT: hlfir.assign %[[MUL_RES_VAR]]#0 to %[[RES_DECL]]#0 : !fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<1x2xf32>>
! CHECK-BUFFERING-NEXT: %[[MUL_RES_HEAP:.*]] = fir.box_addr %[[MUL_RES_VAR]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.heap<!fir.array<?x?xf32>>
Expand Down
4 changes: 2 additions & 2 deletions flang/test/HLFIR/product-lowering.fir
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func.func @_QPproduct2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>>
// CHECK: %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
// CHECK: %[[ARG2:.*]]: !fir.ref<index>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[DIM_VAR:.*]]:2 = hlfir.declare %[[ARG2]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]]
Expand All @@ -64,7 +65,6 @@ func.func @_QPproduct2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a
// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]]
// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
// CHECK: hlfir.assign %[[EXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[EXPR]]
Expand Down Expand Up @@ -141,6 +141,7 @@ func.func @_QPproduct5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"}

// CHECK-LABEL: func.func @_QPproduct5(
// CHECK: %[[ARG0:.*]]: !fir.ref<!fir.array<2xi32>>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
// CHECK-DAG: %[[RET_ADDR:.*]] = fir.zero_bits !fir.heap<!fir.array<?xi32>>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
Expand All @@ -151,7 +152,6 @@ func.func @_QPproduct5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"}
// CHECK-DAG: %[[RES_VAR:.*]] = hlfir.declare %[[ARG0]](%[[RES_SHAPE:.*]])

// CHECK-DAG: %[[MASK_ALLOC:.*]] = fir.alloca !fir.logical<4>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[MASK_VAL:.*]] = fir.convert %[[TRUE]] : (i1) -> !fir.logical<4>
// CHECK-DAG: fir.store %[[MASK_VAL]] to %[[MASK_ALLOC]] : !fir.ref<!fir.logical<4>>
// CHECK-DAG: %[[MASK_BOX:.*]] = fir.embox %[[MASK_ALLOC]]
Expand Down
4 changes: 2 additions & 2 deletions flang/test/HLFIR/sum-lowering.fir
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func.func @_QPsum2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"},
// CHECK: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>>
// CHECK: %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
// CHECK: %[[ARG2:.*]]: !fir.ref<index>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]]
// CHECK-DAG: %[[DIM_VAR:.*]]:2 = hlfir.declare %[[ARG2]]
Expand All @@ -63,7 +64,6 @@ func.func @_QPsum2(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "a"},
// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1
// TODO: fix alias analysis in hlfir.assign bufferization
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down Expand Up @@ -151,6 +151,7 @@ func.func @_QPsum5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"}) {
}
// CHECK-LABEL: func.func @_QPsum5(
// CHECK: %[[ARG0:.*]]: !fir.ref<!fir.array<2xi32>>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
// CHECK-DAG: %[[RET_ADDR:.*]] = fir.zero_bits !fir.heap<!fir.array<?xi32>>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
Expand All @@ -161,7 +162,6 @@ func.func @_QPsum5(%arg0: !fir.ref<!fir.array<2xi32>> {fir.bindc_name = "s"}) {
// CHECK-DAG: %[[RES_VAR:.*]] = hlfir.declare %[[ARG0]](%[[RES_SHAPE:.*]])

// CHECK-DAG: %[[MASK_ALLOC:.*]] = fir.alloca !fir.logical<4>
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[MASK_VAL:.*]] = fir.convert %[[TRUE]] : (i1) -> !fir.logical<4>
// CHECK-DAG: fir.store %[[MASK_VAL]] to %[[MASK_ALLOC]] : !fir.ref<!fir.logical<4>>
// CHECK-DAG: %[[MASK_BOX:.*]] = fir.embox %[[MASK_ALLOC]]
Expand Down
2 changes: 1 addition & 1 deletion flang/test/HLFIR/transpose-lowering.fir
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func.func @_QPtranspose1(%arg0: !fir.ref<!fir.array<1x2xi32>> {fir.bindc_name =
// CHECK-LABEL: func.func @_QPtranspose1(
// CHECK: %[[ARG0:.*]]: !fir.ref<!fir.array<1x2xi32>> {fir.bindc_name = "m"}
// CHECK: %[[ARG1:.*]]: !fir.ref<!fir.array<2x1xi32>> {fir.bindc_name = "res"}
// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
// CHECK-DAG: %[[M_VAR:.*]]:2 = hlfir.declare %[[ARG0]]
// CHECK-DAG: %[[RES_VAR:.*]]:2 = hlfir.declare %[[ARG1]]

Expand All @@ -40,7 +41,6 @@ func.func @_QPtranspose1(%arg0: !fir.ref<!fir.array<1x2xi32>> {fir.bindc_name =
// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1
// TODO: fix alias analysis in hlfir.assign bufferization
// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"}
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[ASEXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box<!fir.array<?x?xi32>>, i1) -> !hlfir.expr<?x?xi32>
// CHECK: hlfir.assign %[[ASEXPR]] to %[[RES_VAR]]#0
// CHECK: hlfir.destroy %[[ASEXPR]]
Expand Down
2 changes: 1 addition & 1 deletion flang/test/Lower/convert.f90
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ program test
! ALL: fir.has_value %[[VAL_0]] : !fir.char<1,[[OPT_STR_LEN]]>

! ALL: fir.global linkonce @_QQEnvironmentDefaults.list constant : tuple<i[[int_size:.*]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>> {
! ALL: %[[VAL_0:.*]] = fir.undefined tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
! ALL: %[[VAL_1:.*]] = arith.constant 1 : i[[int_size]]
! ALL: %[[VAL_0:.*]] = fir.undefined tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
! ALL: %[[VAL_2:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_1]], [0 : index] : (tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>, i[[int_size]]) -> tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
! ALL: %[[VAL_3:.*]] = fir.address_of(@_QQEnvironmentDefaults.items) : !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>
! ALL: %[[VAL_4:.*]] = fir.insert_value %[[VAL_2]], %[[VAL_3]], [1 : index] : (tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>, !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>) -> tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
Expand Down
57 changes: 57 additions & 0 deletions flang/test/Parser/assume-aligned.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
! RUN: %flang_fc1 -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s

! Exercises the ASSUME_ALIGNED directive on an array dummy argument in
! several spellings: bare name, a space before the parenthesized subscript,
! and constant or variable subscripts. Each directive line is immediately
! followed by the unparsed form the test expects to see.
SUBROUTINE aa(a, nn)
IMPLICIT NONE
INTEGER, INTENT(IN) :: nn
COMPLEX(8), INTENT(INOUT), DIMENSION(1:nn) :: a
INTEGER :: i
!DIR$ assume_aligned a:16
!CHECK: !DIR$ ASSUME_ALIGNED a:16
! A space between the designator and the subscript list is accepted and
! dropped when the directive is unparsed.
!DIR$ assume_aligned a (1):16
!CHECK: !DIR$ ASSUME_ALIGNED a(1):16
!DIR$ assume_aligned a(1):16
!CHECK: !DIR$ ASSUME_ALIGNED a(1):16
!DIR$ assume_aligned a(nn):16
!CHECK: !DIR$ ASSUME_ALIGNED a(nn):16
!DIR$ assume_aligned a(44):16
!CHECK: !DIR$ ASSUME_ALIGNED a(44):16
! The loop only gives the subroutine a body; it is not part of what the
! parser test verifies.
DO i=1,nn
a(i)=a(i)+1.5
END DO
END SUBROUTINE aa

! Exercises ASSUME_ALIGNED on a rank-3 array whose subscripts are themselves
! expressions (an array element and plain variables); the space before the
! alignment value is dropped in the unparsed output.
SUBROUTINE bb(v, s, e)
IMPLICIT NONE
INTEGER, INTENT(IN) :: s(3), e(3)
INTEGER :: y,z
REAL(8), INTENT(IN) :: v(s(1):e(1),s(2):e(2),s(3):e(3))
!DIR$ assume_aligned v(s(1),y,z) :64
!CHECK: !DIR$ ASSUME_ALIGNED v(s(1),y,z):64
END SUBROUTINE bb

! Exercises ASSUME_ALIGNED where the operand is a derived-type component
! reference (nodes%a(1,1)); the spaces around the ':' are dropped in the
! unparsed output.
SUBROUTINE f(n)
IMPLICIT NONE
TYPE node
REAL(KIND=8), POINTER :: a(:,:)
END TYPE NODE

TYPE(NODE), POINTER :: nodes
INTEGER :: i
INTEGER, INTENT(IN) :: n

ALLOCATE(nodes)
ALLOCATE(nodes%a(1000,1000))

!DIR$ ASSUME_ALIGNED nodes%a(1,1) : 16
!CHECK: !DIR$ ASSUME_ALIGNED nodes%a(1,1):16
! The loop only gives the subroutine a body; it is not part of what the
! parser test verifies.
DO i=1,n
nodes%a(1,i) = nodes%a(1,i)+1
END DO
END SUBROUTINE f

! Exercises a single ASSUME_ALIGNED directive carrying several
! name:alignment pairs separated by commas.
SUBROUTINE g(a, b)
IMPLICIT NONE
INTEGER, INTENT(in) :: a(128), b(128)
!DIR$ ASSUME_ALIGNED a:32, b:64
!CHECK: !DIR$ ASSUME_ALIGNED a:32, b:64
END SUBROUTINE g
9 changes: 5 additions & 4 deletions libc/cmake/modules/LLVMLibCArchitectures.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,14 @@ if(LIBC_TARGET_OS STREQUAL "baremetal")
set(LIBC_TARGET_OS_IS_BAREMETAL TRUE)
elseif(LIBC_TARGET_OS STREQUAL "linux")
set(LIBC_TARGET_OS_IS_LINUX TRUE)
elseif(LIBC_TARGET_OS STREQUAL "poky" OR LIBC_TARGET_OS STREQUAL "suse")
# poky are ustom Linux-base systems created by yocto. Since these are Linux
elseif(LIBC_TARGET_OS STREQUAL "poky" OR LIBC_TARGET_OS STREQUAL "suse" OR
LIBC_TARGET_OS STREQUAL "redhat")
# Poky images are custom Linux-based systems created by Yocto. Since these
# are Linux images, we change the LIBC_TARGET_OS to linux. This define is
# used to include the right directories during compilation.
#
# openSUSE uses different triple format which causes LIBC_TARGET_OS to be
# computed as "suse" instead of "linux".
# openSUSE and Red Hat use a different triple format, which causes
# LIBC_TARGET_OS to be computed as "suse" or "redhat" instead of "linux".
set(LIBC_TARGET_OS_IS_LINUX TRUE)
set(LIBC_TARGET_OS "linux")
elseif(LIBC_TARGET_OS STREQUAL "darwin")
Expand Down
1 change: 1 addition & 0 deletions libc/include/llvm-libc-macros/float-macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wgnu-include-next"
#pragma clang diagnostic ignored "-Winclude-next-absolute-path"
#else // gcc
#pragma GCC system_header
#endif //__clang__
Expand Down
3 changes: 3 additions & 0 deletions lldb/include/lldb/Core/Progress.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ class Progress {
///
/// @param [in] title The title of this progress activity.
///
/// @param [in] details Specific information about what the progress report
/// is currently working on.
///
/// @param [in] total The total units of work to be done if specified, if
/// set to std::nullopt then an indeterminate progress indicator should be
/// displayed.
Expand Down
1 change: 0 additions & 1 deletion lldb/test/API/lang/c/local_variables/TestLocalVariables.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def setUp(self):
self.source = "main.c"
self.line = line_number(self.source, "// Set break point at this line.")

@skipIfWindows
def test_c_local_variables(self):
"""Test local variable value."""
self.build()
Expand Down
12 changes: 8 additions & 4 deletions lldb/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ endif()

if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS)
message(STATUS "Enforcing strict test requirements for LLDB")
set(useful_python_modules
psutil # Lit uses psutil to do per-test timeouts.
pexpect # We no longer vendor pexpect.
)
# Lit uses psutil to do per-test timeouts.
set(useful_python_modules psutil)

if(NOT WIN32)
  # We no longer vendor pexpect, so a system copy must be available; it is
  # not used (and therefore not required) on Windows.
  # NOTE: the original line read `list(APPEND pexpect)`, which names no
  # destination list and appends nothing, so the pexpect requirement was
  # silently never enforced.
  list(APPEND useful_python_modules pexpect)
endif()

foreach(module ${useful_python_modules})
lldb_find_python_module(${module})
if (NOT PY_${module}_FOUND)
Expand Down
319 changes: 159 additions & 160 deletions lldb/tools/debugserver/source/RNBRemote.cpp

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions lldb/tools/debugserver/source/RNBRemote.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum class compression_types { zlib_deflate, lz4, lzma, lzfse, none };

class RNBRemote {
public:
// clang-format off
enum PacketEnum {
invalid_packet = 0,
ack, // '+'
Expand Down Expand Up @@ -137,8 +138,10 @@ class RNBRemote {
set_detach_on_error, // 'QSetDetachOnError:'
query_transfer, // 'qXfer:'
json_query_dyld_process_state, // 'jGetDyldProcessState'
enable_error_strings, // 'QEnableErrorStrings'
unknown_type
};
// clang-format on

typedef rnb_err_t (RNBRemote::*HandlePacketCallback)(const char *p);

Expand Down Expand Up @@ -196,6 +199,7 @@ class RNBRemote {
rnb_err_t HandlePacket_qGDBServerVersion(const char *p);
rnb_err_t HandlePacket_qProcessInfo(const char *p);
rnb_err_t HandlePacket_qSymbol(const char *p);
rnb_err_t HandlePacket_QEnableErrorStrings(const char *p);
rnb_err_t HandlePacket_QStartNoAckMode(const char *p);
rnb_err_t HandlePacket_QThreadSuffixSupported(const char *p);
rnb_err_t HandlePacket_QSetLogging(const char *p);
Expand Down Expand Up @@ -356,6 +360,8 @@ class RNBRemote {
rnb_err_t GetPacket(std::string &packet_data, RNBRemote::Packet &packet_info,
bool wait);
rnb_err_t SendPacket(const std::string &);
rnb_err_t SendErrorPacket(std::string errcode,
const std::string &errmsg = "");
std::string CompressString(const std::string &);

void CreatePacketTable();
Expand Down Expand Up @@ -405,6 +411,9 @@ class RNBRemote {
bool m_enable_compression_next_send_packet;

compression_types m_compression_mode;

bool m_enable_error_strings; // Whether we can append asciihex error strings
// after Exx error replies
};

/* We translate the /usr/include/mach/exception_types.h exception types
Expand Down
14 changes: 10 additions & 4 deletions llvm/docs/CommandGuide/llvm-readobj.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ file formats.
Dump decompressed section content when used with ``-x`` or ``-p``.
If the section(s) are not compressed, they are displayed as is.

.. option:: --demangle, -C

Display demangled symbol names in the output. This option is only for ELF and
XCOFF file formats.

.. option:: --expand-relocs

When used with :option:`--relocs`, display each relocation in an expanded
Expand Down Expand Up @@ -94,6 +99,11 @@ file formats.

Display the needed libraries.

.. option:: --no-demangle

Do not demangle symbol names in the output. This option is only for ELF and
XCOFF file formats. The option is enabled by default.

.. option:: --relocations, --relocs, -r

Display the relocation entries in the file.
Expand Down Expand Up @@ -175,10 +185,6 @@ The following options are implemented only for the ELF file format.

Requires :option:`--bb-addr-map` to have an effect.

.. option:: --demangle, -C

Display demangled symbol names in the output.

.. option:: --dependent-libraries

Display the dependent libraries section.
Expand Down
12 changes: 0 additions & 12 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -908,18 +908,6 @@ class LegalizeRuleSet {
LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
}

/// Widen the scalar or vector element type to the next power of two that is
/// at least MinSize. No effect if the scalar size is a power of two.
LegalizeRuleSet &widenScalarOrEltToNextPow2OrMinSize(unsigned TypeIdx,
unsigned MinSize = 0) {
using namespace LegalityPredicates;
return actionIf(
LegalizeAction::WidenScalar,
any(scalarOrEltNarrowerThan(TypeIdx, MinSize),
scalarOrEltSizeNotPow2(typeIdx(TypeIdx))),
LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
}

LegalizeRuleSet &narrowScalar(unsigned TypeIdx, LegalizeMutation Mutation) {
using namespace LegalityPredicates;
return actionIf(LegalizeAction::NarrowScalar, isScalar(typeIdx(TypeIdx)),
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/IR/Intrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ def llvm_v1i32_ty : LLVMType<v1i32>; // 1 x i32
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v3i32_ty : LLVMType<v3i32>; // 3 x i32
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v6i32_ty : LLVMType<v6i32>; // 6 x i32
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
Expand Down
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3196,4 +3196,11 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
[IntrNoMem, IntrSpeculatable]
>;

/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
[llvm_anyptr_ty], [llvm_anyptr_ty],
[IntrNoMem, IntrSpeculatable]
>;
}
23 changes: 8 additions & 15 deletions llvm/include/llvm/IR/Module.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,30 +385,23 @@ class LLVM_EXTERNAL_VISIBILITY Module {
/// @name Function Accessors
/// @{

/// Look up the specified function in the module symbol table. Four
/// possibilities:
/// 1. If it does not exist, add a prototype for the function and return it.
/// 2. Otherwise, if the existing function has the correct prototype, return
/// the existing function.
/// 3. Finally, the function exists but has the wrong prototype: return the
/// function with a constantexpr cast to the right prototype.
/// Look up the specified function in the module symbol table. If it does not
/// exist, add a prototype for the function and return it. Otherwise, return
/// the existing function.
///
/// In all cases, the returned value is a FunctionCallee wrapper around the
/// 'FunctionType *T' passed in, as well as a 'Value*' either of the Function or
/// the bitcast to the function.
/// 'FunctionType *T' passed in, as well as the 'Value*' of the Function. The
/// function type of the function may differ from the function type stored in
/// FunctionCallee if it was previously created with a different type.
///
/// Note: For library calls getOrInsertLibFunc() should be used instead.
FunctionCallee getOrInsertFunction(StringRef Name, FunctionType *T,
AttributeList AttributeList);

FunctionCallee getOrInsertFunction(StringRef Name, FunctionType *T);

/// Look up the specified function in the module symbol table. If it does not
/// exist, add a prototype for the function and return it. This function
/// guarantees to return a constant of pointer to the specified function type
/// or a ConstantExpr BitCast of that type if the named function has a
/// different type. This version of the method takes a list of
/// function arguments, which makes it easier for clients to use.
/// Same as above, but takes a list of function arguments, which makes it
/// easier for clients to use.
template <typename... ArgsTy>
FunctionCallee getOrInsertFunction(StringRef Name,
AttributeList AttributeList, Type *RetTy,
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/MC/MCExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ class MCSymbolRefExpr : public MCExpr {
VK_PPC_AIX_TLSGDM, // symbol@m
VK_PPC_AIX_TLSIE, // symbol@ie
VK_PPC_AIX_TLSLE, // symbol@le
VK_PPC_AIX_TLSLD, // symbol@ld
VK_PPC_AIX_TLSML, // symbol@ml
VK_PPC_GOT_TLSLD, // symbol@got@tlsld
VK_PPC_GOT_TLSLD_LO, // symbol@got@tlsld@l
VK_PPC_GOT_TLSLD_HI, // symbol@got@tlsld@h
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/TargetParser/RISCVTargetParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ namespace RISCV {
// We use 64 bits as the known part in the scalable vector types.
static constexpr unsigned RVVBitsPerBlock = 64;

void getFeaturesForCPU(StringRef CPU,
SmallVectorImpl<std::string> &EnabledFeatures,
bool NeedPlus = false);
bool parseCPU(StringRef CPU, bool IsRV64);
bool parseTuneCPU(StringRef CPU, bool IsRV64);
StringRef getMArchFromMcpu(StringRef CPU);
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/TextAPI/Record.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ class ObjCCategoryRecord : public ObjCContainerRecord {
: ObjCContainerRecord(Name, RecordLinkage::Unknown),
ClassToExtend(ClassToExtend) {}

StringRef getSuperClassName() const { return ClassToExtend; }

private:
StringRef ClassToExtend;
};
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2495,7 +2495,6 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
case TargetOpcode::G_SUB:
case TargetOpcode::G_SHUFFLE_VECTOR:
    // Perform operation at larger width (any extension is fine here, high bits
// don't affect the result) and then truncate the result back to the
// original type.
Expand Down
23 changes: 18 additions & 5 deletions llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2418,6 +2418,15 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForExternalReference(
SmallString<128> Name;
getNameWithPrefix(Name, GO, TM);

// AIX TLS local-dynamic does not need the external reference for the
// "_$TLSML" symbol.
if (GO->getThreadLocalMode() == GlobalVariable::LocalDynamicTLSModel &&
GO->hasName() && GO->getName() == "_$TLSML") {
return getContext().getXCOFFSection(
Name, SectionKind::getData(),
XCOFF::CsectProperties(XCOFF::XMC_TC, XCOFF::XTY_SD));
}

XCOFF::StorageMappingClass SMC =
isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA;
if (GO->isThreadLocal())
Expand Down Expand Up @@ -2675,13 +2684,17 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
// the chance of needing -bbigtoc is decreased. Also, the toc-entry for
// EH info is never referenced directly using instructions so it can be
// allocated with TE storage-mapping class.
// The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC, otherwise
// the AIX assembler will complain.
return getContext().getXCOFFSection(
cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), SectionKind::getData(),
XCOFF::CsectProperties((TM.getCodeModel() == CodeModel::Large ||
cast<MCSymbolXCOFF>(Sym)->isEHInfo())
? XCOFF::XMC_TE
: XCOFF::XMC_TC,
XCOFF::XTY_SD));
XCOFF::CsectProperties(
((TM.getCodeModel() == CodeModel::Large &&
cast<MCSymbolXCOFF>(Sym)->getSymbolTableName() != "_$TLSML") ||
cast<MCSymbolXCOFF>(Sym)->isEHInfo())
? XCOFF::XMC_TE
: XCOFF::XMC_TC,
XCOFF::XTY_SD));
}

MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/MC/MCExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,10 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
return "ie";
case VK_PPC_AIX_TLSLE:
return "le";
case VK_PPC_AIX_TLSLD:
return "ld";
case VK_PPC_AIX_TLSML:
return "ml";
case VK_PPC_GOT_TLSLD: return "got@tlsld";
case VK_PPC_GOT_TLSLD_LO: return "got@tlsld@l";
case VK_PPC_GOT_TLSLD_HI: return "got@tlsld@h";
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/MC/XCOFFObjectWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,8 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
if (Type == XCOFF::RelocationType::R_POS ||
Type == XCOFF::RelocationType::R_TLS ||
Type == XCOFF::RelocationType::R_TLS_LE ||
Type == XCOFF::RelocationType::R_TLS_IE)
Type == XCOFF::RelocationType::R_TLS_IE ||
Type == XCOFF::RelocationType::R_TLS_LD)
// The FixedValue should be symbol's virtual address in this object file
// plus any constant value that we might get.
FixedValue = getVirtualAddress(SymA, SymASec) + Target.getConstant();
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2547,7 +2547,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FSUB_PRED)
MAKE_CASE(AArch64ISD::RDSVL)
MAKE_CASE(AArch64ISD::BIC)
MAKE_CASE(AArch64ISD::BIT)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
MAKE_CASE(AArch64ISD::TBZ)
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,6 @@ enum NodeType : unsigned {
EORV_PRED,
ANDV_PRED,

// Vector bitwise insertion
BIT,

// Compare-and-branch
CBZ,
CBNZ,
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,6 @@ def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>;
def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>;

def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;

def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
Expand Down Expand Up @@ -5333,7 +5332,7 @@ defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS),
(and (vnot node:$LHS), node:$RHS))>>;
defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">;
defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit">;
defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">;

def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
Expand Down Expand Up @@ -8216,8 +8215,10 @@ defm ST4 : SIMDLdSt4SingleAliases<"st4">;
//----------------------------------------------------------------------------

let Predicates = [HasAES] in {
let isCommutable = 1 in {
def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
}
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
}
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -956,9 +956,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
},
changeTo(1, 0))
.moreElementsToNextPow2(0)
.widenScalarOrEltToNextPow2OrMinSize(0, 8)
.clampNumElements(0, v8s8, v16s8)
.clampNumElements(0, v4s16, v8s16)
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsIf(
Expand Down Expand Up @@ -1009,6 +1006,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
ABSActions
.legalFor({s32, s64});
ABSActions.legalFor(PackedVectorAllTypeList)
.widenScalarIf(
[=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
[=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
.widenScalarIf(
[=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
[=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
.clampNumElements(0, v8s8, v16s8)
.clampNumElements(0, v4s16, v8s16)
.clampNumElements(0, v2s32, v4s32)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ def RetCC_SI_Gfx : CallingConv<[

def CC_SI_SHADER : CallingConv<[

CCIfType<[i1], CCPromoteToType<i32>>,

CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
Expand Down
73 changes: 73 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
const GCNSubtarget *ST = nullptr;
const AMDGPUTargetMachine *TM = nullptr;
const TargetLibraryInfo *TLInfo = nullptr;
AssumptionCache *AC = nullptr;
DominatorTree *DT = nullptr;
Expand Down Expand Up @@ -310,6 +311,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
Expand Down Expand Up @@ -2013,6 +2015,75 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
return true;
}

/// \param V Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // Pointer cannot be null if it's a block address, GV or alloca.
  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
  // it as the symbol could be null in such cases.
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
    return true;

  // Check nonnull arguments.
  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
    return true;

  // TODO: Calls that return nonnull?

  // For all other things, use KnownBits.
  // We either use 0 or all bits set to indicate null, so check whether the
  // value can be zero or all ones.
  //
  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
  // address spaces have non-zero null values.
  // Truncate to the pointer width of \p AS, since the bits may have been
  // computed at a wider integer width.
  auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS));
  const auto NullVal = TM.getNullPointerValue(AS);
  // Only the all-zero and all-one null encodings are handled (see assert);
  // pick the matching "cannot be that bit pattern" query.
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}

/// Rewrite an addrspacecast between flat and local/private address spaces as
/// a call to @llvm.amdgcn.addrspacecast.nonnull when every underlying object
/// of the source pointer is provably non-null, so later lowering can skip the
/// null check.
/// \return true if the cast was replaced (and erased), false otherwise.
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // Intrinsic doesn't support vectors, also it seems that it's often difficult
  // to prove that a vector cannot have any nulls in it so it's unclear if it's
  // worth supporting.
  if (I.getType()->isVectorTy())
    return false;

  // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
  // This is only worthwhile for casts from/to priv/local to flat.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  // The rewrite is only valid if *all* underlying objects of the source are
  // known to never be null in the source address space.
  SmallVector<const Value *, 4> WorkList;
  getUnderlyingObjects(I.getOperand(0), WorkList);
  if (!all_of(WorkList, [&](const Value *V) {
        return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
      }))
    return false;

  // Replace the cast with the intrinsic call and drop the original
  // instruction.
  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::bitreverse:
Expand Down Expand Up @@ -2196,6 +2267,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return false;

const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
Impl.TM = &TM;
Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
Expand All @@ -2214,6 +2286,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
AMDGPUCodeGenPrepareImpl Impl;
Impl.Mod = F.getParent();
Impl.DL = &Impl.Mod->getDataLayout();
Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
Expand Down
20 changes: 16 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2247,10 +2247,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();

// MI can either be a G_ADDRSPACE_CAST or a
// G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
(isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
Intrinsic::amdgcn_addrspacecast_nonnull));

const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();

Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
: MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned DestAS = DstTy.getAddressSpace();
Expand All @@ -2271,7 +2277,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
// Extract low 32-bits of the pointer.
B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();
Expand Down Expand Up @@ -2308,7 +2316,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// avoid the ptrtoint?
auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
B.buildCopy(Dst, BuildPtr);
MI.eraseFromParent();
return true;
Expand Down Expand Up @@ -7020,6 +7030,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,

return false;
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return legalizeAddrSpaceCast(MI, MRI, B);
case Intrinsic::amdgcn_make_buffer_rsrc:
return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2292,7 +2292,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
MachineOperand::CreateReg(Reg, false, false, true);
ArrayRef<MachineOperand> Cond(RegOp);
LLVM_DEBUG(dbgs() << "RegionExitReg: ");
LLVM_DEBUG(Cond[0].print(dbgs(), TRI));
LLVM_DEBUG(RegOp.print(dbgs(), TRI));
LLVM_DEBUG(dbgs() << "\n");
TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
Cond, DebugLoc());
Expand Down
152 changes: 76 additions & 76 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td

Large diffs are not rendered by default.

57 changes: 44 additions & 13 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
}

/// Append target-specific extra operands for an intrinsic call being built in
/// the SelectionDAG. For @llvm.amdgcn.addrspacecast.nonnull this appends the
/// source and destination address spaces as i32 target constants, because the
/// DAG value types do not carry address-space information.
void SITargetLowering::CollectTargetIntrinsicOperands(
    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    // The DAG's ValueType loses the addrspaces.
    // Add them as 2 extra Constant operands "from" and "to".
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
    break;
  }
  default:
    // Other intrinsics need no extra operands.
    break;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {
Expand Down Expand Up @@ -6635,24 +6652,36 @@ static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

SDValue Src = ASC->getOperand(0);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
unsigned SrcAS = ASC->getSrcAddressSpace();

const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

unsigned DestAS, SrcAS;
SDValue Src;
bool IsNonNull = false;
if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
SrcAS = ASC->getSrcAddressSpace();
Src = ASC->getOperand(0);
DestAS = ASC->getDestAddressSpace();
} else {
assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
Op.getConstantOperandVal(0) ==
Intrinsic::amdgcn_addrspacecast_nonnull);
Src = Op->getOperand(1);
SrcAS = Op->getConstantOperandVal(2);
DestAS = Op->getConstantOperandVal(3);
IsNonNull = true;
}

SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

// flat -> local/private
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
unsigned DestAS = ASC->getDestAddressSpace();

if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

if (isKnownNonNull(Src, DAG, TM, SrcAS))
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;

unsigned NullVal = TM.getNullPointerValue(DestAS);
Expand All @@ -6665,16 +6694,16 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}

// local/private -> flat
if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
SDValue CvtPtr =
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

if (isKnownNonNull(Src, DAG, TM, SrcAS))
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;

unsigned NullVal = TM.getNullPointerValue(SrcAS);
Expand All @@ -6697,7 +6726,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Src.getValueType() == MVT::i64)
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

Expand All @@ -6708,7 +6737,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);

return DAG.getUNDEF(ASC->getValueType(0));
return DAG.getUNDEF(Op->getValueType(0));
}

// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
Expand Down Expand Up @@ -8325,6 +8354,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
IndexKeyi32, Op.getOperand(7)});
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineFunction &MF,
unsigned IntrinsicID) const override;

void CollectTargetIntrinsicOperands(const CallInst &I,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG) const override;

bool getAddrModeArguments(IntrinsicInst * /*I*/,
SmallVectorImpl<Value*> &/*Ops*/,
Type *&/*AccessTy*/) const override;
Expand Down
27 changes: 14 additions & 13 deletions llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,9 +506,10 @@ class ARMAsmParser : public MCTargetAsmParser {

bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken);
StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
unsigned &PredicationCode,
unsigned &VPTPredicationCode, bool &CarrySetting,
unsigned &ProcessorIMod, StringRef &ITMask);
ARMCC::CondCodes &PredicationCode,
ARMVCC::VPTCodes &VPTPredicationCode,
bool &CarrySetting, unsigned &ProcessorIMod,
StringRef &ITMask);
void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef ExtraToken,
StringRef FullInst, bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode,
Expand Down Expand Up @@ -6283,10 +6284,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
//
// FIXME: Would be nice to autogen this.
// FIXME: This is a bit of a maze of special cases.
StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
StringRef ExtraToken,
unsigned &PredicationCode,
unsigned &VPTPredicationCode,
StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
ARMCC::CondCodes &PredicationCode,
ARMVCC::VPTCodes &VPTPredicationCode,
bool &CarrySetting,
unsigned &ProcessorIMod,
StringRef &ITMask) {
Expand Down Expand Up @@ -6340,7 +6340,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
unsigned CC = ARMCondCodeFromString(Mnemonic.substr(Mnemonic.size()-2));
if (CC != ~0U) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
PredicationCode = CC;
PredicationCode = static_cast<ARMCC::CondCodes>(CC);
}
}

Expand Down Expand Up @@ -6384,10 +6384,11 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" &&
Mnemonic != "vqmovnt" && Mnemonic != "vmovnt" && Mnemonic != "vqdmullt" &&
Mnemonic != "vpnot" && Mnemonic != "vcvtt" && Mnemonic != "vcvt") {
unsigned CC = ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size()-1));
if (CC != ~0U) {
unsigned VCC =
ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size() - 1));
if (VCC != ~0U) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size()-1);
VPTPredicationCode = CC;
VPTPredicationCode = static_cast<ARMVCC::VPTCodes>(VCC);
}
return Mnemonic;
}
Expand Down Expand Up @@ -6966,8 +6967,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
StringRef ExtraToken = Name.slice(Next, Name.find(' ', Next + 1));

// Split out the predication code and carry setting flag from the mnemonic.
unsigned PredicationCode;
unsigned VPTPredicationCode;
ARMCC::CondCodes PredicationCode;
ARMVCC::VPTCodes VPTPredicationCode;
unsigned ProcessorIMod;
bool CarrySetting;
StringRef ITMask;
Expand Down
13 changes: 10 additions & 3 deletions llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,19 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
MCSymbolXCOFF *TCSym =
cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
// On AIX, we have a region handle (symbol@m) and the variable offset
// (symbol@{gd|ie|le}) for TLS variables, depending on the TLS model.
      // On AIX, we have TLS variable offsets (symbol@{gd|ie|le|ld}) depending
// on the TLS access method (or model). For the general-dynamic access
// method, we also have region handle (symbol@m) for each variable. For
// local-dynamic, there is a module handle (_$TLSML[TC]@ml) for all
// variables. Finally for local-exec and initial-exec, we have a thread
// pointer, in r13 for 64-bit mode and returned by .__get_tpointer for
// 32-bit mode.
if (Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGD ||
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM ||
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSIE ||
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLE)
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLE ||
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLD ||
Kind == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSML)
OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@"
<< MCSymbolRefExpr::getVariantKindName(Kind) << '\n';
else
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
return {XCOFF::RelocationType::R_TLS_IE, SignAndSizeForFKData};
case MCSymbolRefExpr::VK_PPC_AIX_TLSLE:
return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForFKData};
case MCSymbolRefExpr::VK_PPC_AIX_TLSLD:
return {XCOFF::RelocationType::R_TLS_LD, SignAndSizeForFKData};
case MCSymbolRefExpr::VK_PPC_AIX_TLSML:
return {XCOFF::RelocationType::R_TLSML, SignAndSizeForFKData};
case MCSymbolRefExpr::VK_None:
return {XCOFF::RelocationType::R_POS, SignAndSizeForFKData};
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/PowerPC/PPC.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,12 @@ class ModulePass;
/// and Local Exec models.
MO_TPREL_FLAG,

/// MO_TLSLDM_FLAG - on AIX the ML relocation type is only valid for a
/// reference to a TOC symbol from the symbol itself, and right now its only
/// user is the symbol "_$TLSML". The symbol name is used to decide that
/// the R_TLSML relocation is expected.
MO_TLSLDM_FLAG,

/// MO_TLSLD_FLAG - If this bit is set the symbol reference is relative to
/// TLS Local Dynamic model.
MO_TLSLD_FLAG,
Expand Down
58 changes: 45 additions & 13 deletions llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -621,12 +621,23 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
}

/// This helper function creates the TlsGetAddr MCSymbol for AIX. We will
/// create the csect and use the qual-name symbol instead of creating just the
/// external symbol.
/// This helper function creates the TlsGetAddr/TlsGetMod MCSymbol for AIX. We
/// will create the csect and use the qual-name symbol instead of creating just
/// the external symbol.
static MCSymbol *createMCSymbolForTlsGetAddr(MCContext &Ctx, unsigned MIOpc) {
StringRef SymName =
MIOpc == PPC::GETtlsTpointer32AIX ? ".__get_tpointer" : ".__tls_get_addr";
StringRef SymName;
switch (MIOpc) {
default:
SymName = ".__tls_get_addr";
break;
case PPC::GETtlsTpointer32AIX:
SymName = ".__get_tpointer";
break;
case PPC::GETtlsMOD32AIX:
case PPC::GETtlsMOD64AIX:
SymName = ".__tls_get_mod";
break;
}
return Ctx
.getXCOFFSection(SymName, SectionKind::getText(),
XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER))
Expand Down Expand Up @@ -668,14 +679,16 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
"GETtls[ld]ADDR[32] must read GPR3");

if (Subtarget->isAIXABI()) {
// On AIX, the variable offset should already be in R4 and the region handle
// should already be in R3.
// For TLSGD, which currently is the only supported access model, we only
// need to generate an absolute branch to .__tls_get_addr.
// For TLSGD, the variable offset should already be in R4 and the region
// handle should already be in R3. We generate an absolute branch to
// .__tls_get_addr. For TLSLD, the module handle should already be in R3.
// We generate an absolute branch to .__tls_get_mod.
Register VarOffsetReg = Subtarget->isPPC64() ? PPC::X4 : PPC::R4;
(void)VarOffsetReg;
assert(MI->getOperand(2).isReg() &&
MI->getOperand(2).getReg() == VarOffsetReg &&
assert((MI->getOpcode() == PPC::GETtlsMOD32AIX ||
MI->getOpcode() == PPC::GETtlsMOD64AIX ||
(MI->getOperand(2).isReg() &&
MI->getOperand(2).getReg() == VarOffsetReg)) &&
"GETtls[ld]ADDR[32] must read GPR4");
EmitAIXTlsCallHelper(MI);
return;
Expand Down Expand Up @@ -844,6 +857,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM;
if (Flag == PPCII::MO_TLSGD_FLAG || Flag == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGD;
// For local-dynamic TLS access on AIX, we have one TOC entry for the symbol
// (the variable offset) and one shared TOC entry for the module handle.
// They are differentiated by MO_TLSLD_FLAG and MO_TLSLDM_FLAG.
if (Flag == PPCII::MO_TLSLD_FLAG && IsAIX)
return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLD;
if (Flag == PPCII::MO_TLSLDM_FLAG && IsAIX)
return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSML;
return MCSymbolRefExpr::VariantKind::VK_None;
};

Expand Down Expand Up @@ -1354,6 +1374,11 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addExpr(SymGotTlsGD));
return;
}
case PPC::GETtlsMOD32AIX:
case PPC::GETtlsMOD64AIX:
// Transform: %r3 = GETtlsMODNNAIX %r3 (for NN == 32/64).
// Into: BLA .__tls_get_mod()
// Input parameter is a module handle (_$TLSML[TC]@ml) for all variables.
case PPC::GETtlsADDR:
// Transform: %x3 = GETtlsADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
Expand Down Expand Up @@ -2167,6 +2192,11 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
}
}

// Do not emit the _$TLSML symbol.
if (GV->getThreadLocalMode() == GlobalVariable::LocalDynamicTLSModel &&
GV->hasName() && GV->getName() == "_$TLSML")
return;

OutStreamer->emitXCOFFSymbolLinkageWithVisibility(GVSym, LinkageAttr,
VisibilityAttr);
}
Expand Down Expand Up @@ -2981,11 +3011,13 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
MMI->hasDebugInfo());
break;
}
case PPC::GETtlsMOD32AIX:
case PPC::GETtlsMOD64AIX:
case PPC::GETtlsTpointer32AIX:
case PPC::GETtlsADDR64AIX:
case PPC::GETtlsADDR32AIX: {
// A reference to .__tls_get_addr/.__get_tpointer is unknown to the
// assembler so we need to emit an external symbol reference.
// A reference to .__tls_get_mod/.__tls_get_addr/.__get_tpointer is unknown
// to the assembler so we need to emit an external symbol reference.
MCSymbol *TlsGetAddr =
createMCSymbolForTlsGetAddr(OutContext, MI->getOpcode());
ExtSymSDNodeSymbols.insert(TlsGetAddr);
Expand Down
39 changes: 32 additions & 7 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1774,9 +1774,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX";
case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER";
case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX";
case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX";
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
Expand Down Expand Up @@ -3415,13 +3417,36 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
}

// Only Local-Exec, Initial-Exec and General-Dynamic TLS models are currently
// supported models. If Local- or Initial-exec are not possible or specified,
// all GlobalTLSAddress nodes are lowered using the general-dynamic model.
// We need to generate two TOC entries, one for the variable offset, one for
// the region handle. The global address for the TOC entry of the region
// handle is created with the MO_TLSGDM_FLAG flag and the global address
// for the TOC entry of the variable offset is created with MO_TLSGD_FLAG.
if (Model == TLSModel::LocalDynamic) {
// For local-dynamic on AIX, we need to generate one TOC entry for each
// variable offset, and a single module-handle TOC entry for the entire
// file.

SDValue VariableOffsetTGA =
DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSLD_FLAG);
SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);

Module *M = DAG.getMachineFunction().getFunction().getParent();
GlobalVariable *TLSGV =
dyn_cast_or_null<GlobalVariable>(M->getOrInsertGlobal(
StringRef("_$TLSML"), PointerType::getUnqual(*DAG.getContext())));
TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
assert(TLSGV && "Not able to create GV for _$TLSML.");
SDValue ModuleHandleTGA =
DAG.getTargetGlobalAddress(TLSGV, dl, PtrVT, 0, PPCII::MO_TLSLDM_FLAG);
SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, ModuleHandleTGA);
SDValue ModuleHandle =
DAG.getNode(PPCISD::TLSLD_AIX, dl, PtrVT, ModuleHandleTOC);

return DAG.getNode(ISD::ADD, dl, PtrVT, ModuleHandle, VariableOffset);
}

// If Local- or Initial-exec or Local-dynamic is not possible or specified,
// all GlobalTLSAddress nodes are lowered using the general-dynamic model. We
// need to generate two TOC entries, one for the variable offset, one for the
// region handle. The global address for the TOC entry of the region handle is
// created with the MO_TLSGDM_FLAG flag and the global address for the TOC
// entry of the variable offset is created with MO_TLSGD_FLAG.
SDValue VariableOffsetTGA =
DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGD_FLAG);
SDValue RegionHandleTGA =
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,22 @@ namespace llvm {
/// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY
/// Op that combines two register copies of TOC entries
/// (region handle into R3 and variable offset into R4) followed by a
/// GET_TLS_ADDR node which will be expanded to a call to __get_tls_addr.
/// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr.
/// This node is used in 64-bit mode as well (in which case the result is
/// G8RC and inputs are X3/X4).
TLSGD_AIX,

/// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model,
/// produces a call to .__tls_get_mod(_$TLSML\@ml).
GET_TLS_MOD_AIX,

/// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle)
/// Op that requires a single input of the module handle TOC entry in R3,
/// and generates a GET_TLS_MOD_AIX node which will be expanded into a call
/// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes.
/// The only difference is the register class.
TLSLD_AIX,

/// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
/// register to sym\@got\@tlsld\@ha.
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Target/PowerPC/PPCInstr64Bit.td
Original file line number Diff line number Diff line change
Expand Up @@ -1557,12 +1557,19 @@ def GETtlsldADDRPCREL : GETtlsldADDRPseudo <"#GETtlsldADDRPCREL">;
// so we don't need to mark it with a size of 8 bytes. Finally, the assembly
// manual mentions this exact set of registers as the clobbered set, others
// are guaranteed not to be clobbered.
let Defs = [X0,X4,X5,X11,LR8,CR0] in
let Defs = [X0,X4,X5,X11,LR8,CR0] in {
def GETtlsADDR64AIX :
PPCEmitTimePseudo<(outs g8rc:$rD),(ins g8rc:$offset, g8rc:$handle),
"GETtlsADDR64AIX",
[(set i64:$rD,
(PPCgetTlsAddr i64:$offset, i64:$handle))]>, isPPC64;
// On AIX, the call to .__tls_get_mod needs one input in X3 for the module handle.
def GETtlsMOD64AIX :
PPCEmitTimePseudo<(outs g8rc:$rD),(ins g8rc:$handle),
"GETtlsMOD64AIX",
[(set i64:$rD,
(PPCgetTlsMod i64:$handle))]>, isPPC64;
}
}

// Combined op for ADDItlsgdL and GETtlsADDR, late expanded. X3 and LR8
Expand Down Expand Up @@ -1595,6 +1602,9 @@ def TLSGDAIX8 :
"#TLSGDAIX8",
[(set i64:$rD,
(PPCTlsgdAIX i64:$offset, i64:$handle))]>;
// This pseudo is expanded to the call to GETtlsMOD64AIX.
def TLSLDAIX8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$handle),
"#TLSLDAIX8", [(set i64:$rD, (PPCTlsldAIX i64:$handle))]>;
// Combined op for ADDItlsldL and GETtlsADDR, late expanded. X3 and LR8
// are true defines, while the rest of the Defs are clobbers.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2965,6 +2965,7 @@ PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{MO_PCREL_OPT_FLAG, "ppc-opt-pcrel"},
{MO_TLSGD_FLAG, "ppc-tlsgd"},
{MO_TPREL_FLAG, "ppc-tprel"},
{MO_TLSLDM_FLAG, "ppc-tlsldm"},
{MO_TLSLD_FLAG, "ppc-tlsld"},
{MO_TLSGDM_FLAG, "ppc-tlsgdm"},
{MO_GOT_TLSGD_PCREL_FLAG, "ppc-got-tlsgd-pcrel"},
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Target/PowerPC/PPCInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,14 @@ def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
def PPCgetTlsMod : SDNode<"PPCISD::GET_TLS_MOD_AIX", SDTIntUnaryOp>;
def PPCgetTpointer : SDNode<"PPCISD::GET_TPOINTER", SDTIntLeaf, []>;
def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
def PPCTlsgdAIX : SDNode<"PPCISD::TLSGD_AIX", SDTIntBinOp>;
def PPCTlsldAIX : SDNode<"PPCISD::TLSLD_AIX", SDTIntUnaryOp>;
def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
Expand Down Expand Up @@ -3249,11 +3251,16 @@ def GETtlsADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$s
// The rest of the Defs are the exact set of registers that will be clobbered by
// the call.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
Defs = [R0,R4,R5,R11,LR,CR0] in
Defs = [R0,R4,R5,R11,LR,CR0] in {
def GETtlsADDR32AIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$offset, gprc:$handle),
"GETtlsADDR32AIX",
[(set i32:$rD,
(PPCgetTlsAddr i32:$offset, i32:$handle))]>;
def GETtlsMOD32AIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$handle),
"GETtlsMOD32AIX",
[(set i32:$rD,
(PPCgetTlsMod i32:$handle))]>;
}

// For local-exec accesses on 32-bit AIX, a call to .__get_tpointer is
// generated to retrieve the thread pointer. GETtlsTpointer32AIX clobbers both
Expand Down Expand Up @@ -3293,6 +3300,9 @@ def TLSGDAIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$offset, gprc:$handl
"#TLSGDAIX",
[(set i32:$rD,
(PPCTlsgdAIX i32:$offset, i32:$handle))]>;
// This pseudo is expanded to the call to GETtlsMOD32AIX.
def TLSLDAIX : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$handle),
"#TLSLDAIX", [(set i32:$rD, (PPCTlsldAIX i32:$handle))]>;
// LR is a true define, while the rest of the Defs are clobbers. R3 is
// explicitly defined when this op is created, so not mentioned here.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
Expand Down
119 changes: 113 additions & 6 deletions llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,15 @@ namespace {
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
bool NeedFence = true;
bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
bool IsAIX = MBB.getParent()->getSubtarget<PPCSubtarget>().isAIXABI();
const PPCSubtarget &Subtarget =
MBB.getParent()->getSubtarget<PPCSubtarget>();
bool Is64Bit = Subtarget.isPPC64();
bool IsAIX = Subtarget.isAIXABI();
bool IsLargeModel =
Subtarget.getTargetMachine().getCodeModel() == CodeModel::Large;
bool IsPCREL = false;
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();

for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
I != IE;) {
Expand All @@ -59,13 +65,16 @@ namespace {
// There are a number of slight differences in code generation
// when we call .__get_tpointer (32-bit AIX TLS).
bool IsTLSTPRelMI = MI.getOpcode() == PPC::GETtlsTpointer32AIX;
bool IsTLSLDAIXMI = (MI.getOpcode() == PPC::TLSLDAIX8 ||
MI.getOpcode() == PPC::TLSLDAIX);

if (MI.getOpcode() != PPC::ADDItlsgdLADDR &&
MI.getOpcode() != PPC::ADDItlsldLADDR &&
MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
MI.getOpcode() != PPC::ADDItlsldLADDR32 &&
MI.getOpcode() != PPC::TLSGDAIX &&
MI.getOpcode() != PPC::TLSGDAIX8 && !IsTLSTPRelMI && !IsPCREL) {
MI.getOpcode() != PPC::TLSGDAIX8 && !IsTLSTPRelMI && !IsPCREL &&
!IsTLSLDAIXMI) {
// Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP
// as scheduling fences, we skip creating fences if we already
// have existing ADJCALLSTACKDOWN/UP to avoid nesting,
Expand Down Expand Up @@ -109,6 +118,16 @@ namespace {
Opc1 = PPC::ADDItlsldL32;
Opc2 = PPC::GETtlsldADDR32;
break;
case PPC::TLSLDAIX:
// TLSLDAIX is expanded to one copy and GET_TLS_MOD, so we only set
// Opc2 here.
Opc2 = PPC::GETtlsMOD32AIX;
break;
case PPC::TLSLDAIX8:
// TLSLDAIX8 is expanded to one copy and GET_TLS_MOD, so we only set
// Opc2 here.
Opc2 = PPC::GETtlsMOD64AIX;
break;
case PPC::TLSGDAIX8:
// TLSGDAIX8 is expanded to two copies and GET_TLS_ADDR, so we only
// set Opc2 here.
Expand Down Expand Up @@ -145,9 +164,97 @@ namespace {
.addImm(0);

if (IsAIX) {
// The variable offset and region handle are copied in r4 and r3. The
// copies are followed by GETtlsADDR32AIX/GETtlsADDR64AIX.
if (!IsTLSTPRelMI) {
if (IsTLSLDAIXMI) {
// The relative order between the node that loads the variable
// offset from the TOC, and the .__tls_get_mod node is being tuned
// here. It is better to put the variable offset TOC load after the
// call, since this node can use clobbers r4/r5.
// Search for the pattern of the two nodes that load from the TOC
// (either for the variable offset or for the module handle), and
// then move the variable offset TOC load right before the node that
// uses the OutReg of the .__tls_get_mod node.
unsigned LDTocOp =
Is64Bit ? (IsLargeModel ? PPC::LDtocL : PPC::LDtoc)
: (IsLargeModel ? PPC::LWZtocL : PPC::LWZtoc);
if (!RegInfo.use_empty(OutReg)) {
std::set<MachineInstr *> Uses;
// Collect all instructions that use the OutReg.
for (MachineOperand &MO : RegInfo.use_operands(OutReg))
Uses.insert(MO.getParent());
// Find the first user (e.g.: lwax/stfdx) of the OutReg within the
// current BB.
MachineBasicBlock::iterator UseIter = MBB.begin();
for (MachineBasicBlock::iterator IE = MBB.end(); UseIter != IE;
++UseIter)
if (Uses.count(&*UseIter))
break;

// Additional handling is required when UserIter (the first user
// of OutReg) is pointing to a valid node that loads from the TOC.
// Check the pattern and do the movement if the pattern matches.
if (UseIter != MBB.end()) {
// Collect all associated nodes that load from the TOC. Use
// hasOneDef() to guard against unexpected scenarios.
std::set<MachineInstr *> LoadFromTocs;
for (MachineOperand &MO : UseIter->operands())
if (MO.isReg() && MO.isUse()) {
Register MOReg = MO.getReg();
if (RegInfo.hasOneDef(MOReg)) {
MachineInstr *Temp =
RegInfo.getOneDef(MOReg)->getParent();
// For the current TLSLDAIX node, get the corresponding
// node that loads from the TOC for the InReg. Otherwise,
// Temp probably pointed to the variable offset TOC load
// we would like to move.
if (Temp == &MI && RegInfo.hasOneDef(InReg))
Temp = RegInfo.getOneDef(InReg)->getParent();
if (Temp->getOpcode() == LDTocOp)
LoadFromTocs.insert(Temp);
} else {
// FIXME: analyze this scenario if there is one.
LoadFromTocs.clear();
break;
}
}

// Check the two nodes that loaded from the TOC: one should be
// "_$TLSML", and the other will be moved before the node that
// uses the OutReg of the .__tls_get_mod node.
if (LoadFromTocs.size() == 2) {
MachineBasicBlock::iterator TLSMLIter = MBB.end();
MachineBasicBlock::iterator OffsetIter = MBB.end();
// Make sure the two nodes that loaded from the TOC are within
// the current BB, and that one of them is from the "_$TLSML"
// pseudo symbol, while the other is from the variable.
for (MachineBasicBlock::iterator I = MBB.begin(),
IE = MBB.end();
I != IE; ++I)
if (LoadFromTocs.count(&*I)) {
MachineOperand MO = I->getOperand(1);
if (MO.isGlobal() && MO.getGlobal()->hasName() &&
MO.getGlobal()->getName() == "_$TLSML")
TLSMLIter = I;
else
OffsetIter = I;
}
// Perform the movement when the desired scenario has been
// identified, which should be when both of the iterators are
// valid.
if (TLSMLIter != MBB.end() && OffsetIter != MBB.end())
OffsetIter->moveBefore(&*UseIter);
}
}
}
// The module-handle is copied into r3. The copy is followed by
// GETtlsMOD32AIX/GETtlsMOD64AIX.
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), GPR3)
.addReg(InReg);
// The call to .__tls_get_mod.
BuildMI(MBB, I, DL, TII->get(Opc2), GPR3).addReg(GPR3);
} else if (!IsTLSTPRelMI) {
// The variable offset and region handle (for TLSGD) are copied in
// r4 and r3. The copies are followed by
// GETtlsADDR32AIX/GETtlsADDR64AIX.
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), GPR4)
.addReg(MI.getOperand(1).getReg());
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), GPR3)
Expand Down
10 changes: 7 additions & 3 deletions llvm/lib/Target/RISCV/RISCVRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -508,19 +508,23 @@ defvar VM8VTs = [vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
vfloat16m8_t, vbfloat16m8_t,
vfloat32m8_t, vfloat64m8_t];

// We reverse the order of last 8 registers so that we don't needlessly prevent
// allocation of higher lmul register groups while still putting v0 last in the
// allocation order.

def VR : VReg<!listconcat(VM1VTs, VMaskVTs),
(add (sequence "V%u", 8, 31),
(sequence "V%u", 0, 7)), 1>;
(sequence "V%u", 7, 0)), 1>;

def VRNoV0 : VReg<!listconcat(VM1VTs, VMaskVTs), (sub VR, V0), 1>;

def VRM2 : VReg<VM2VTs, (add (sequence "V%uM2", 8, 31, 2),
(sequence "V%uM2", 0, 7, 2)), 2>;
(sequence "V%uM2", 6, 0, 2)), 2>;

def VRM2NoV0 : VReg<VM2VTs, (sub VRM2, V0M2), 2>;

def VRM4 : VReg<VM4VTs, (add V8M4, V12M4, V16M4, V20M4,
V24M4, V28M4, V0M4, V4M4), 4>;
V24M4, V28M4, V4M4, V0M4), 4>;

def VRM4NoV0 : VReg<VM4VTs, (sub VRM4, V0M4), 4>;

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86CompressEVEX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
const MCInstrDesc &Desc = MI.getDesc();
Register Reg0 = MI.getOperand(0).getReg();
const MachineOperand &Op1 = MI.getOperand(1);
if (!Op1.isReg())
if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1)
return false;
Register Reg1 = Op1.getReg();
if (Reg1 == Reg0)
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23422,6 +23422,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getBitcast(VT, Result);
}

// If the i64 elements are sign-extended enough to be representable as i32
// then we can compare the lower i32 bits and splat.
if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
DAG.ComputeNumSignBits(Op1) > 32) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);

SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskLo[] = {0, 0, 2, 2};
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);

return DAG.getBitcast(VT, Result);
}

// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
Expand Down
81 changes: 58 additions & 23 deletions llvm/lib/Target/X86/X86InstrCompiler.td
Original file line number Diff line number Diff line change
Expand Up @@ -1493,27 +1493,71 @@ def : Pat<(xor GR32:$src1, -2147483648),

// Odd encoding trick: -128 fits into an 8-bit immediate field while
// +128 doesn't, so in this special case use a sub instead of an add.
def : Pat<(add GR16:$src1, 128),
(SUB16ri GR16:$src1, -128)>;
let Predicates = [NoNDD] in {
def : Pat<(add GR16:$src1, 128),
(SUB16ri GR16:$src1, -128)>;
def : Pat<(add GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
def : Pat<(add GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;

def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;
}
let Predicates = [HasNDD] in {
def : Pat<(add GR16:$src1, 128),
(SUB16ri_ND GR16:$src1, -128)>;
def : Pat<(add GR32:$src1, 128),
(SUB32ri_ND GR32:$src1, -128)>;
def : Pat<(add GR64:$src1, 128),
(SUB64ri32_ND GR64:$src1, -128)>;

def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri_ND GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri_ND GR32:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32_ND GR64:$src1, -128)>;
}
def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
(SUB16mi addr:$dst, -128)>;

def : Pat<(add GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
(SUB32mi addr:$dst, -128)>;

def : Pat<(add GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
(SUB64mi32 addr:$dst, -128)>;
let Predicates = [HasNDD] in {
def : Pat<(add (loadi16 addr:$src), 128),
(SUB16mi_ND addr:$src, -128)>;
def : Pat<(add (loadi32 addr:$src), 128),
(SUB32mi_ND addr:$src, -128)>;
def : Pat<(add (loadi64 addr:$src), 128),
(SUB64mi32_ND addr:$src, -128)>;
}

def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;
// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
let Predicates = [NoNDD] in {
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
}
let Predicates = [HasNDD] in {
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32_ND GR64:$src1, 0xffffffff80000000)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
(SUB64ri32_ND GR64:$src1, 0xffffffff80000000)>;
}
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
let Predicates = [HasNDD] in {
def : Pat<(add(loadi64 addr:$src), 0x0000000080000000),
(SUB64mi32_ND addr:$src, 0xffffffff80000000)>;
}

// Depositing value to 8/16 bit subreg:
def : Pat<(or (and GR64:$dst, -256),
Expand All @@ -1532,15 +1576,6 @@ def : Pat<(or (and GR32:$dst, -65536),
(i32 (zextloadi16 addr:$src))),
(INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;

// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;

// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
// least 32 bits of leading zeros. If in addition the last 32 bits can be
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/TargetParser/RISCVTargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "llvm/TargetParser/RISCVTargetParser.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/TargetParser/Triple.h"

namespace llvm {
Expand Down Expand Up @@ -95,5 +96,28 @@ void fillValidTuneCPUArchList(SmallVectorImpl<StringRef> &Values, bool IsRV64) {
#include "llvm/TargetParser/RISCVTargetParserDef.inc"
}

// This function is currently used by IREE, so it's not dead code.
void getFeaturesForCPU(StringRef CPU,
SmallVectorImpl<std::string> &EnabledFeatures,
bool NeedPlus) {
StringRef MarchFromCPU = llvm::RISCV::getMArchFromMcpu(CPU);
if (MarchFromCPU == "")
return;

EnabledFeatures.clear();
auto RII = RISCVISAInfo::parseArchString(
MarchFromCPU, /* EnableExperimentalExtension */ true);

if (llvm::errorToBool(RII.takeError()))
return;

std::vector<std::string> FeatStrings =
(*RII)->toFeatures(/* AddAllExtensions */ false);
for (const auto &F : FeatStrings)
if (NeedPlus)
EnabledFeatures.push_back(F);
else
EnabledFeatures.push_back(F.substr(1));
}
} // namespace RISCV
} // namespace llvm
2 changes: 1 addition & 1 deletion llvm/lib/TextAPI/RecordVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ void SymbolConverter::visitObjCInterface(const ObjCInterfaceRecord &ObjCR) {
}

void SymbolConverter::visitObjCCategory(const ObjCCategoryRecord &Cat) {
addIVars(Cat.getObjCIVars(), Cat.getName());
addIVars(Cat.getObjCIVars(), Cat.getSuperClassName());
}
36 changes: 36 additions & 0 deletions llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
Expand Down Expand Up @@ -177,6 +180,18 @@ static cl::opt<bool> ClWithTls(
"platforms that support this"),
cl::Hidden, cl::init(true));

static cl::opt<bool>
CSkipHotCode("hwasan-skip-hot-code",
cl::desc("Do not instument hot functions based on FDO."),
cl::Hidden, cl::init(false));

static cl::opt<int> HotPercentileCutoff("hwasan-percentile-cutoff-hot",
cl::init(0));

STATISTIC(NumTotalFuncs, "Number of total funcs HWASAN");
STATISTIC(NumInstrumentedFuncs, "Number of HWASAN instrumented funcs");
STATISTIC(NumNoProfileSummaryFuncs, "Number of HWASAN funcs without PS");

// Mode for selecting how to insert frame record info into the stack ring
// buffer.
enum RecordStackHistoryMode {
Expand Down Expand Up @@ -1507,6 +1522,27 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
return;

if (F.empty())
return;

NumTotalFuncs++;
if (CSkipHotCode) {
auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
if (PSI && PSI->hasProfileSummary()) {
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
if ((HotPercentileCutoff.getNumOccurrences() && HotPercentileCutoff >= 0)
? PSI->isFunctionHotInCallGraphNthPercentile(HotPercentileCutoff,
&F, BFI)
: PSI->isFunctionHotInCallGraph(&F, BFI))
return;
} else {
++NumNoProfileSummaryFuncs;
}
}
NumInstrumentedFuncs++;

LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");

SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
Expand Down
72 changes: 41 additions & 31 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
Expand Down Expand Up @@ -192,6 +193,10 @@ static cl::opt<bool>
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 8;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
Expand Down Expand Up @@ -940,7 +945,6 @@ static bool isUsedOutsideBlock(Value *V) {
if (!I)
return true;
// Limits the number of uses to save compile time.
constexpr int UsesLimit = 8;
return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
all_of(I->users(), [I](User *U) {
auto *IU = dyn_cast<Instruction>(U);
Expand Down Expand Up @@ -1284,8 +1288,7 @@ class BoUpSLP {
// Retruns true if the users of V1 and V2 won't need to be extracted.
auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
// Bail out if we have too many uses to save compilation time.
static constexpr unsigned Limit = 8;
if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
return false;

auto AllUsersVectorized = [U1, U2, this](Value *V) {
Expand Down Expand Up @@ -4174,14 +4177,14 @@ static LoadsState canVectorizeLoads(const BoUpSLP &R, ArrayRef<Value *> VL,
// increases the cost.
Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
bool ProfitableGatherPointers =
static_cast<unsigned>(count_if(
PointerOps,
[L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
Sz > 2;
L && Sz > 2 && count_if(PointerOps, [L](Value *V) {
return L->isLoopInvariant(V);
}) <= Sz / 2;
if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2);
(GEP && GEP->getNumOperands() == 2 &&
isa<Constant, Instruction>(GEP->getOperand(1)));
})) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
Expand Down Expand Up @@ -4475,24 +4478,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
if (!areTwoInsertFromSameBuildVector(
IE1, IE2,
[](InsertElementInst *II) { return II->getOperand(0); }))
return false;
std::optional<unsigned> Idx1 = getInsertIndex(IE1);
std::optional<unsigned> Idx2 = getInsertIndex(IE2);
if (Idx1 == std::nullopt || Idx2 == std::nullopt)
return false;
return *Idx1 < *Idx2;
return I1 < I2;
return getInsertIndex(IE1) < getInsertIndex(IE2);
}
if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
if (EE1->getOperand(0) != EE2->getOperand(0))
return false;
std::optional<unsigned> Idx1 = getExtractIndex(EE1);
std::optional<unsigned> Idx2 = getExtractIndex(EE2);
if (Idx1 == std::nullopt || Idx2 == std::nullopt)
return false;
return *Idx1 < *Idx2;
return I1 < I2;
return getInsertIndex(EE1) < getInsertIndex(EE2);
}
return false;
return I1 < I2;
};
auto IsIdentityOrder = [](const OrdersType &Order) {
for (unsigned Idx : seq<unsigned>(0, Order.size()))
Expand Down Expand Up @@ -5317,8 +5312,7 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
Value *V = TE->Scalars[Lane];
// To save compilation time we don't visit if we have too many users.
static constexpr unsigned UsersLimit = 4;
if (V->hasNUsesOrMore(UsersLimit))
if (V->hasNUsesOrMore(UsesLimit))
break;

// Collect stores per pointer object.
Expand Down Expand Up @@ -7689,7 +7683,24 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
SameNodesEstimated = false;
Cost += createShuffle(&E1, E2, Mask);
transformMaskAfterShuffle(CommonMask, Mask);
if (!E2 && InVectors.size() == 1) {
unsigned VF = E1.getVectorFactor();
if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
VF = std::max(VF,
cast<FixedVectorType>(V1->getType())->getNumElements());
} else {
const auto *E = InVectors.front().get<const TreeEntry *>();
VF = std::max(VF, E->getVectorFactor());
}
for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
CommonMask[Idx] = Mask[Idx] + VF;
Cost += createShuffle(InVectors.front(), &E1, CommonMask);
transformMaskAfterShuffle(CommonMask, CommonMask);
} else {
Cost += createShuffle(&E1, E2, Mask);
transformMaskAfterShuffle(CommonMask, Mask);
}
}

class ShuffleCostBuilder {
Expand Down Expand Up @@ -13974,12 +13985,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
if (Idx != Set.size() - 1)
continue;
}
if (Operands.size() <= 1) {
auto E = make_scope_exit([&, &DataVar = Data]() {
Operands.clear();
Operands.push_back(Stores[Data.first]);
PrevDist = Data.second;
Operands.push_back(Stores[DataVar.first]);
PrevDist = DataVar.second;
});

if (Operands.size() <= 1)
continue;
}

unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
Expand Down Expand Up @@ -14038,9 +14051,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
if (StartIdx >= Operands.size())
break;
}
Operands.clear();
Operands.push_back(Stores[Data.first]);
PrevDist = Data.second;
}
};

Expand Down
Loading