Skip to content

Commit

Permalink
Work around arm64 gcc error in std::copysign (pytorch#51900)
Browse files Browse the repository at this point in the history
Summary:
Move definition of copysign template and specialization for
bfloat16/half types before first use of copysign in that file

Add comment explaining why this is necessary

Fixes pytorch#51889

Pull Request resolved: pytorch#51900

Reviewed By: walterddr

Differential Revision: D26321741

Pulled By: malfet

fbshipit-source-id: 888858b11d9708fa140fe9c0570cc5a24599205b
  • Loading branch information
malfet committed Feb 10, 2021
1 parent 9e5bcc1 commit 4201e12
Showing 1 changed file with 22 additions and 18 deletions.
40 changes: 22 additions & 18 deletions aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,27 @@ namespace {

using namespace vec256;

// Generic copysign: yields a value with the magnitude of `a` and the sign
// of `b` by deferring to std::copysign.
//
// Note: Half and BFloat16 get explicit specializations of this template.
// Those are needed to work around a g++-7/8 crash on aarch64, and they
// also make copysign faster for the half-precision types.
template <typename T>
T copysign(T a, T b) {
  const T signed_value = std::copysign(a, b);
  return signed_value;
}

// copysign for c10::Half, done with bit operations on the raw 16-bit
// representation. The sign is the most significant bit (true for both the
// half and bfloat16 types), so:
//   - 0x7fff keeps every bit of `a` except its sign bit
//   - 0x8000 extracts only the sign bit of `b`
template <>
c10::Half copysign(c10::Half a, c10::Half b) {
  const auto magnitude_bits = a.x & 0x7fff;
  const auto sign_bit = b.x & 0x8000;
  return c10::Half(magnitude_bits | sign_bit, c10::Half::from_bits());
}

// copysign for c10::BFloat16 via bit operations on the raw 16-bit value:
// the low 15 bits come from `a`, the top (sign) bit comes from `b`.
template <>
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
  const auto magnitude_bits = a.x & 0x7fff;
  const auto sign_bit = b.x & 0x8000;
  return c10::BFloat16(magnitude_bits | sign_bit, c10::BFloat16::from_bits());
}


// Note: Undefined behavior when performing addition is intentionally
// ignored.
void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) {
Expand Down Expand Up @@ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
floordiv = std::copysign(scalar_t(0), a / b);
floordiv = copysign(scalar_t(0), a / b);
}
return floordiv;
});
Expand Down Expand Up @@ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) {
});
}

// Generic copysign: yields a value with the magnitude of `a` and the sign
// of `b` by deferring to std::copysign.
template <typename T>
T copysign(T a, T b) {
  const T signed_value = std::copysign(a, b);
  return signed_value;
}

// copysign for c10::Half, done with bit operations on the raw 16-bit
// representation. The sign is the most significant bit (true for both the
// half and bfloat16 types), so:
//   - 0x7fff keeps every bit of `a` except its sign bit
//   - 0x8000 extracts only the sign bit of `b`
template <>
c10::Half copysign(c10::Half a, c10::Half b) {
  const auto magnitude_bits = a.x & 0x7fff;
  const auto sign_bit = b.x & 0x8000;
  return c10::Half(magnitude_bits | sign_bit, c10::Half::from_bits());
}

// copysign for c10::BFloat16 via bit operations on the raw 16-bit value:
// the low 15 bits come from `a`, the top (sign) bit comes from `b`.
template <>
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
  const auto magnitude_bits = a.x & 0x7fff;
  const auto sign_bit = b.x & 0x8000;
  return c10::BFloat16(magnitude_bits | sign_bit, c10::BFloat16::from_bits());
}

void copysign_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {
Expand Down

0 comments on commit 4201e12

Please sign in to comment.