Skip to content

Commit

Permalink
First attempt for building for Arm64 on MSVC (#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
LoadLibrary committed May 11, 2020
1 parent 8ab5fd4 commit 19c5d8d
Show file tree
Hide file tree
Showing 4 changed files with 238 additions and 0 deletions.
@@ -0,0 +1,44 @@
From 19985a00f53fb5368c74d01f1dae5a9d67c8a4ae Mon Sep 17 00:00:00 2001
From: Augusto Righetto <aurighet@microsoft.com>
Date: Wed, 25 Mar 2020 23:26:53 -0700
Subject: [PATCH] Arm64 is a thing and has intrinsic to mul two 64bit uint

---
crypto/fipsmodule/bn/internal.h | 6 +++++-
include/openssl/base.h | 2 +-
2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index d58a2acce..7b4fba775 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -404,8 +404,12 @@ uint64_t bn_mont_n0(const BIGNUM *n);
int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n,
BN_CTX *ctx);

-#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
+#if defined(_MSC_VER)
+#if defined(OPENSSL_X86_64)
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
+#elif defined(OPENSSL_AARCH64)  // low 64 bits from the plain product, high 64 bits from __umulh
+#define BN_UMULT_LOHI(low, high, a, b) ((low) = (a) * (b), (high) = __umulh((a), (b)))
+#endif
#endif

#if !defined(BN_ULLONG) && !defined(BN_UMULT_LOHI)
diff --git a/include/openssl/base.h b/include/openssl/base.h
index e347c09ae..0265c5103 100644
--- a/include/openssl/base.h
+++ b/include/openssl/base.h
@@ -90,7 +90,7 @@ extern "C" {
#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86)
#define OPENSSL_32_BIT
#define OPENSSL_X86
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(_M_ARM64)
#define OPENSSL_64_BIT
#define OPENSSL_AARCH64
#elif defined(__arm) || defined(__arm__) || defined(_M_ARM)
--
2.17.1.windows.2

@@ -0,0 +1,49 @@
From 39dc26711df8784a015db2501a85fe92726b3cc9 Mon Sep 17 00:00:00 2001
From: Augusto Righetto <aurighet@microsoft.com>
Date: Wed, 25 Mar 2020 23:24:15 -0700
Subject: [PATCH] Shift operator in Arm doesn't work the same as Intel

---
modules/audio_processing/aec3/fft_data.h | 3 +++
modules/audio_processing/legacy_noise_suppression.cc | 3 ++-
2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/audio_processing/aec3/fft_data.h b/modules/audio_processing/aec3/fft_data.h
index 5e5adb62de..7707249704 100644
--- a/modules/audio_processing/aec3/fft_data.h
+++ b/modules/audio_processing/aec3/fft_data.h
@@ -44,6 +44,8 @@ struct FftData {
void Spectrum(Aec3Optimization optimization,
rtc::ArrayView<float> power_spectrum) const {
RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
+#pragma warning(push)
+#pragma warning(disable:4065)
switch (optimization) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
case Aec3Optimization::kSse2: {
@@ -65,6 +67,7 @@ struct FftData {
std::transform(re.begin(), re.end(), im.begin(), power_spectrum.begin(),
[](float a, float b) { return a * a + b * b; });
}
+#pragma warning(pop)
}

// Copy the data from an interleaved array.
diff --git a/modules/audio_processing/legacy_noise_suppression.cc b/modules/audio_processing/legacy_noise_suppression.cc
index b2c88536ca..cc429397f8 100644
--- a/modules/audio_processing/legacy_noise_suppression.cc
+++ b/modules/audio_processing/legacy_noise_suppression.cc
@@ -151,8 +151,9 @@ std::vector<float> NoiseSuppression::NoiseEstimate() {
int q_noise;
const uint32_t* noise =
WebRtcNsx_noise_estimate(suppressor->state(), &q_noise);
+ const int32_t noise_factor = 1i32 << static_cast<int32_t>(q_noise);
const float kNormalizationFactor =
- 1.f / ((1 << q_noise) * suppressors_.size());
+ 1.f / (noise_factor * suppressors_.size());
for (size_t i = 0; i < noise_estimate.size(); ++i) {
noise_estimate[i] += kNormalizationFactor * noise[i];
}
--
2.17.1.windows.2

@@ -0,0 +1,139 @@
From 42d3d0b999c03d4250bb0cbc235c3b9ec5ac4b38 Mon Sep 17 00:00:00 2001
From: Augusto Righetto <aurighet@microsoft.com>
Date: Wed, 25 Mar 2020 23:29:10 -0700
Subject: [PATCH] cl aligns differently and hack for extracting first component
of a SIMD vector

---
simd/arm/common/jidctint-neon.c | 33 +++++++++++++++++++--------------
simd/arm/common/jidctred-neon.c | 25 ++++++++++++++++---------
2 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
index 7fb683b..038bdd2 100644
--- a/simd/arm/common/jidctint-neon.c
+++ b/simd/arm/common/jidctint-neon.c
@@ -75,15 +75,20 @@
#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)

-__attribute__ ((aligned(8))) static int16_t jsimd_idct_islow_neon_consts[] = {
- F_0_899, F_0_541,
- F_2_562, F_0_298_MINUS_0_899,
- F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
- F_0_541_PLUS_0_765, F_1_175,
- F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
- F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
- 0, 0, 0, 0
- };
+#if defined(_MSC_VER)
+__declspec(align(8))
+#else
+__attribute__ ((aligned(8)))
+#endif
+static int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};

/* Forward declaration of regular and sparse IDCT helper functions. */

@@ -214,13 +219,13 @@ void jsimd_idct_islow_neon(void *dct_table,
int16x4_t bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
- int64_t bitmap_rows_4567 = vreinterpret_s64_s16(bitmap);
+ int64_t bitmap_rows_4567 = *((int64_t*)(&(vreinterpret_s64_s16(bitmap))));

if (bitmap_rows_4567 == 0) {
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
- int64_t left_ac_bitmap = vreinterpret_s64_s16(bitmap);
+ int64_t left_ac_bitmap = *((int64_t*)(&(vreinterpret_s64_s16(bitmap))));

if (left_ac_bitmap == 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
@@ -266,18 +271,18 @@ void jsimd_idct_islow_neon(void *dct_table,
bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
- bitmap_rows_4567 = vreinterpret_s64_s16(bitmap);
+ bitmap_rows_4567 = *((int64_t*)(&(vreinterpret_s64_s16(bitmap))));
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
- int64_t right_ac_bitmap = vreinterpret_s64_s16(bitmap);
+ int64_t right_ac_bitmap = *((int64_t*)(&(vreinterpret_s64_s16(bitmap))));

/* Initialise to non-zero value: defaults to regular second pass. */
int64_t right_ac_dc_bitmap = 1;

if (right_ac_bitmap == 0) {
bitmap = vorr_s16(bitmap, row0);
- right_ac_dc_bitmap = vreinterpret_s64_s16(bitmap);
+ right_ac_dc_bitmap = *((int64_t*)(&(vreinterpret_s64_s16(bitmap))));

if (right_ac_dc_bitmap != 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
index aa10799..2de38b8 100644
--- a/simd/arm/common/jidctred-neon.c
+++ b/simd/arm/common/jidctred-neon.c
@@ -72,7 +72,7 @@

void jsimd_idct_2x2_neon(void *dct_table,
JCOEFPTR coef_block,
- JSAMPARRAY restrict output_buf,
+ JSAMPARRAY /* restrict */ output_buf,
JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
@@ -183,15 +183,20 @@ void jsimd_idct_2x2_neon(void *dct_table,
* exact compatibility with jpeg-6b.
*/

-__attribute__ ((aligned(8))) static int16_t jsimd_idct_4x4_neon_consts[] = {
- F_1_847, -F_0_765, -F_0_211, F_1_451,
- -F_2_172, F_1_061, -F_0_509, -F_0_601,
- F_0_899, F_2_562, 0, 0
- };
+#if defined(_MSC_VER)
+__declspec(align(8))
+#else
+__attribute__ ((aligned(8)))
+#endif
+static int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};

void jsimd_idct_4x4_neon(void *dct_table,
JCOEFPTR coef_block,
- JSAMPARRAY restrict output_buf,
+ JSAMPARRAY /* restrict */ output_buf,
JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
@@ -217,8 +222,10 @@ void jsimd_idct_4x4_neon(void *dct_table,
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);

- int64_t left_ac_bitmap = vreinterpret_s64_s16(vget_low_s16(bitmap));
- int64_t right_ac_bitmap = vreinterpret_s64_s16(vget_high_s16(bitmap));
+ int16x4_t low_s16 = vget_low_s16(bitmap);
+ int64_t left_ac_bitmap = *((int64_t*)(&(vreinterpret_s64_s16(low_s16))));
+ int16x4_t high_s16 = vget_high_s16(bitmap);
+ int64_t right_ac_bitmap = *((int64_t*)(&(vreinterpret_s64_s16(high_s16))));

/* Load constants for IDCT computation. */
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
--
2.17.1.windows.2

6 changes: 6 additions & 0 deletions patches_for_WebRTC_org/m80/patchWebRTCM80.cmd
Expand Up @@ -17,6 +17,11 @@ popd

pushd %WEBRTCM80_ROOT%\third_party\boringssl\src
git.exe am "%PATCH_DIR%0003-Replacing-deprecated-and-non-UWP-supported-RtlGenRan.patch"
git.exe am "%PATCH_DIR%6401-Arm64-is-a-thing-and-has-intrinsic-to-mul-two-64bit-.patch"
popd

pushd %WEBRTCM80_ROOT%\third_party\libjpeg_turbo
git.exe am "%PATCH_DIR%6401-cl-aligns-differently-and-hack-for-extracting-first-.patch"
popd

pushd %WEBRTCM80_ROOT%
Expand All @@ -25,4 +30,5 @@ git.exe am "%PATCH_DIR%0005-Fixing-UWP-build-for-time_utils.cc.patch"
git.exe am "%PATCH_DIR%0006-Fixing-UWP-build-for-file_rotating_stream.cc.patch"
git.exe am "%PATCH_DIR%0007-Fixing-UWP-build-for-modules-video_capture.patch"
git.exe am "%PATCH_DIR%0008-Fixing-UWP-build-for-modules-audio_device.patch"
git.exe am "%PATCH_DIR%6401-Shift-operator-in-Arm-doesn-t-work-the-same-as-Intel.patch"
popd

0 comments on commit 19c5d8d

Please sign in to comment.