1,430 changes: 715 additions & 715 deletions clang/include/clang/Basic/arm_neon.td

Large diffs are not rendered by default.

69 changes: 30 additions & 39 deletions clang/include/clang/Basic/arm_neon_incl.td
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,8 @@ def OP_UNAVAILABLE : Operation {
//
// The prototype is a string that defines the return type of the intrinsic
// and the type of each argument. The return type and every argument gets a
// "modifier" that can change in some way the "base type" of the intrinsic.
//
// The modifier 'd' means "default" and does not modify the base type in any
// way. The available modifiers are given below.
// set of "modifiers" that can change in some way the "base type" of the
// intrinsic.
//
// Typespecs
// ---------
Expand All @@ -226,41 +224,34 @@ def OP_UNAVAILABLE : Operation {
// -------------------
// prototype: return (arg, arg, ...)
//
// v: void
// t: best-fit integer (int/poly args)
// x: signed integer (int/float args)
// u: unsigned integer (int/float args)
// f: float (int args)
// F: double (int args)
// H: half (int args)
// 0: half (int args), ignore 'Q' size modifier.
// 1: half (int args), force 'Q' size modifier.
// d: default
// g: default, ignore 'Q' size modifier.
// j: default, force 'Q' size modifier.
// w: double width elements, same num elts
// n: double width elements, half num elts
// h: half width elements, double num elts
// q: half width elements, quad num elts
// e: half width elements, double num elts, unsigned
// m: half width elements, same num elts
// i: constant int
// l: constant uint64
// s: scalar of element type
// z: scalar of half width element type, signed
// r: scalar of double width element type, signed
// b: scalar of unsigned integer/long type (int/float args)
// $: scalar of signed integer/long type (int/float args)
// y: scalar of float
// o: scalar of double
// k: default elt width, double num elts
// 2,3,4: array of default vectors
// B,C,D: array of default elts, force 'Q' size modifier.
// p: pointer type
// c: const pointer type
// 7: vector of 8-bit elements, ignore 'Q' size modifier
// 8: vector of 8-bit elements, same width as default type
// 9: vector of 8-bit elements, force 'Q' size modifier
// Each type modifier is either a single character, or a group surrounded by
// parentheses.
//
// .: default
// v: change to void category.
// S: change to signed integer category.
// U: change to unsigned integer category.
// F: change to floating category.
// P: change to polynomial category.
// p: change polynomial to equivalent integer category. Otherwise nop.
//
// >: double element width (vector size unchanged).
// <: half element width (vector size unchanged).
//
// 1: change to scalar.
// 2: change to struct of two vectors.
// 3: change to struct of three vectors.
// 4: change to struct of four vectors.
//
// *: make a pointer argument.
// c: make a constant argument (for pointers).
//
// Q: force 128-bit width.
// q: force 64-bit width.
//
// I: make 32-bit signed scalar immediate.
// !: make this the key type passed to CGBuiltin.cpp in a polymorphic call.


// Every intrinsic subclasses Inst.
class Inst <string n, string p, string t, Operation o> {
Expand Down
5 changes: 5 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5456,6 +5456,11 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
}
case NEON::BI__builtin_neon_vcvtx_f32_v: {
llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);

}
case NEON::BI__builtin_neon_vext_v:
case NEON::BI__builtin_neon_vextq_v: {
int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
Expand Down
4 changes: 0 additions & 4 deletions clang/test/CodeGen/aarch64-neon-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -17756,8 +17756,6 @@ float32_t test_vminnmv_f32(float32x2_t a) {
}

// CHECK-LABEL: @test_vpaddq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
Expand All @@ -17766,8 +17764,6 @@ int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
}

// CHECK-LABEL: @test_vpaddq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
Expand Down
16 changes: 6 additions & 10 deletions clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
Original file line number Diff line number Diff line change
Expand Up @@ -407,12 +407,10 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
}

// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> <double 0x3FD6304BC43AB5C2>, i32 0
// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> <double 0x3FEE211E215AEEF3>, i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> <double 0x3FD6304BC43AB5C2>, double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64_0() {
float64x1_t arg1;
Expand All @@ -426,13 +424,11 @@ float64x1_t test_vmulx_lane_f64_0() {
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #1 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> <double 0x3FD6304BC43AB5C2>, <1 x double> <double 0x3FEE211E215AEEF3>, <2 x i32> <i32 0, i32 1>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> <double 0x3FD6304BC43AB5C2>, i32 0
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> <double 0x3FD6304BC43AB5C2>, double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_2() {
float64x1_t arg1;
Expand Down
457 changes: 165 additions & 292 deletions clang/utils/TableGen/NeonEmitter.cpp

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions clang/utils/convert_arm_neon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/env python3

# This script was committed on 20/11/2019 and it would probably make sense to remove
# it after the next release branches.

# This script is pipe based and converts an arm_neon.td (or arm_fp16.td) file
# using the old single-char type modifiers to an equivalent new-style form where
# each modifier is orthogonal and they can be composed.
#
# It was used to directly generate the .td files on master, so if you have any
# local additions I would suggest implementing any modifiers here, and running
# it over your entire pre-merge .td files rather than trying to resolve any
# conflicts manually.

import re, sys
# Old single-character modifier -> equivalent new-style modifier string, for
# every old modifier whose translation does not depend on the typespec.
# The trailing comments give the old modifier's meaning as described by the
# pre-conversion legend in arm_neon_incl.td.
MOD_MAP = {
    'v': 'v',      # void
    'x': 'S',      # signed integer (int/float args)
    'u': 'U',      # unsigned integer (int/float args)
    'd': '.',      # default
    'g': 'q',      # default, ignore 'Q' size modifier
    'j': 'Q',      # default, force 'Q' size modifier
    'w': '>Q',     # double width elements, same num elts
    'n': '>',      # double width elements, half num elts
    'h': '<',      # half width elements, double num elts
    'q': '<Q',     # half width elements, quad num elts
    'e': '<U',     # half width elements, double num elts, unsigned
    'm': '<q',     # half width elements, same num elts
    'i': 'I',      # constant int
    'l': 'IU>',    # constant uint64
    's': '1',      # scalar of element type
    'z': '1<',     # scalar of half width element type, signed
    'r': '1>',     # scalar of double width element type, signed
    'b': '1U',     # scalar of unsigned integer/long type (int/float args)
    '$': '1S',     # scalar of signed integer/long type (int/float args)
    'k': 'Q',      # default elt width, double num elts
    '2': '2',      # array of two default vectors
    '3': '3',      # array of three default vectors
    '4': '4',      # array of four default vectors
    'B': '2Q',     # array of two default elts, force 'Q' size modifier
    'C': '3Q',     # array of three default elts, force 'Q' size modifier
    'D': '4Q',     # array of four default elts, force 'Q' size modifier
    'p': '*',      # pointer type
    'c': 'c*',     # const pointer type
    '7': '<<q',    # vector of 8-bit elements, ignore 'Q' size modifier
    '8': '<<',     # vector of 8-bit elements, same width as default type
    '9': '<<Q',    # vector of 8-bit elements, force 'Q' size modifier
    't': 'p',      # best-fit integer (int/poly args)
}


def typespec_elt_size(typespec):
    """Return the element width, in bits, implied by a typespec string.

    The typespec is searched for base-type letters in increasing order of
    width, and the first group with a letter present decides the result, so a
    typespec mentioning several widths reports the smallest one.

    Returns None when the typespec contains none of the recognised letters;
    remap_protocol relies on that to skip non-type characters.
    """
    letters_by_width = (
        ('c', 8),
        ('sh', 16),
        ('if', 32),
        ('ld', 64),
        ('k', 128),
    )
    for letters, width in letters_by_width:
        if any(letter in typespec for letter in letters):
            return width
    return None

def get_resize(cur, desired):
    """Return the '>' / '<' modifiers that turn width `cur` into `desired`.

    One '>' is emitted for every doubling needed to reach at least `desired`,
    followed by one '<' for every halving needed to get back down to at most
    `desired`. Equal widths give the empty string.
    """
    widen = []
    while cur < desired:
        widen.append('>')
        cur = cur * 2

    narrow = []
    while cur > desired:
        narrow.append('<')
        cur = cur / 2

    return ''.join(widen + narrow)


def remap_protocol(proto, typespec, name):
    """Translate an old-style prototype string into the new modifier syntax.

    Every character of `proto` is an old single-character modifier (one for
    the return value, then one per argument). Each is replaced by its
    new-style equivalent, wrapped in parentheses when the replacement is more
    than one character long.

    Args:
        proto: the old-style prototype string.
        typespec: the intrinsic's typespec string; its element width is the
            starting point when translating fixed-width modifiers.
        name: the intrinsic's name; used to choose the key type and in
            diagnostics.

    Returns:
        The converted prototype string.

    Raises:
        Exception: if `proto` contains a character with no known translation.
    """
    # Old modifiers that name a fixed element width regardless of the
    # typespec: old char -> (new-style prefix, element width in bits). The
    # new form is the prefix followed by the resize from the default width.
    fixed_width_mods = {
        'Y': ('1F', 16),  # Y: scalar of half float
        'y': ('1F', 32),  # y: scalar of float
        'o': ('1F', 64),  # o: scalar of double
        'I': ('1S', 32),  # I: scalar of 32-bit signed
        'L': ('1S', 64),  # L: scalar of 64-bit signed
        'U': ('1U', 32),  # U: scalar of 32-bit unsigned
        'O': ('1U', 64),  # O: scalar of 64-bit unsigned
        'f': ('F', 32),   # f: float (int args)
        'F': ('F', 64),   # F: double (int args)
        'H': ('F', 16),   # H: half (int args)
        '0': ('Fq', 16),  # 0: half (int args), ignore 'Q' size modifier.
        '1': ('FQ', 16),  # 1: half (int args), force 'Q' size modifier.
    }

    # Index into proto of the modifier that receives the '!' (key type)
    # marker; 0 means no marker is emitted.
    key_type = 0

    # Conversions like to see the integer type so they know signedness.
    if 'vcvt' in name and '_f' in name and name not in ('vcvt_f32_f64', 'vcvt_f64_f32'):
        key_type = 1

    default_width = typespec_elt_size(typespec)
    # Fixed-width modifiers are resized relative to default_width only, so
    # note when the typespec also contains letters of a different width.
    inconsistent_width = any(
        width and width != default_width
        for width in map(typespec_elt_size, typespec))

    pieces = []
    for i, c in enumerate(proto):
        # void and pointers make for bad discriminators in CGBuiltin.cpp.
        if c in 'vcp':
            key_type += 1

        # Reset for every modifier: otherwise an unknown character would
        # silently reuse the previous modifier's translation (or hit a
        # NameError on the first character) instead of reaching the explicit
        # error below.
        cur_mod = ''

        if c in MOD_MAP:
            cur_mod = MOD_MAP[c]
        elif inconsistent_width:
            # Otherwise it's a fixed output width modifier.
            sys.stderr.write(f'warning: {name} uses fixed output size but has inconsistent input widths: {proto} {typespec}\n')

        if c in fixed_width_mods:
            prefix, width = fixed_width_mods[c]
            resize = get_resize(default_width, width)
            cur_mod = f'{prefix}{resize}'

        if len(cur_mod) == 0:
            raise Exception(f'WTF: {c} in {name}')

        if key_type != 0 and key_type == i:
            cur_mod += '!'

        # Multi-character modifiers must be grouped in parentheses.
        if len(cur_mod) == 1:
            pieces.append(cur_mod)
        else:
            pieces.append('(' + cur_mod + ')')

    return ''.join(pieces)

def replace_insts(m):
    """re.sub callback: rewrite the prototype string inside one Inst<...> match.

    Only the text captured by the 'proto' group is replaced (with its
    converted form); the rest of the matched text is returned unchanged.
    """
    matched_text = m.group()
    match_offset = m.start()
    # span() is relative to the whole input; rebase it onto the matched text.
    proto_begin, proto_end = (pos - match_offset for pos in m.span('proto'))
    converted = remap_protocol(m['proto'], m['kinds'], m['name'])
    return matched_text[:proto_begin] + converted + matched_text[proto_end:]

# Matches the first three quoted arguments of an Inst<...> record, captured as
# the named groups 'name', 'proto' and 'kinds'.
INST = re.compile(r'Inst<"(?P<name>.*?)",\s*"(?P<proto>.*?)",\s*"(?P<kinds>.*?)"')

# Pipe: read the old-style .td text from stdin, convert every Inst prototype,
# and write the result to stdout.
old_td = sys.stdin.read()
new_td = INST.sub(replace_insts, old_td)
sys.stdout.write(new_td)