[X86] Align the other variants with the 512-bit variants by taking void * pointers. (#66310)

FreddyLeaf · web-flow · commit 632d13ce84a8 · 2023-09-20T20:59:25.000+08:00
This applies to the *_stream_* family of intrinsics (non-temporal stores and stream loads).
diff --git a/clang/lib/Headers/ammintrin.h b/clang/lib/Headers/ammintrin.h
@@ -155,9 +155,9 @@ _mm_insert_si64(__m128i __x, __m128i __y)
 /// \param __a
 ///    The 64-bit double-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_sd(double *__p, __m128d __a)
+_mm_stream_sd(void *__p, __m128d __a)
 {
-  __builtin_ia32_movntsd(__p, (__v2df)__a);
+  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
 }
 
 /// Stores a 32-bit single-precision floating-point value in a 32-bit
@@ -173,9 +173,9 @@ _mm_stream_sd(double *__p, __m128d __a)
 /// \param __a
 ///    The 32-bit single-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ss(float *__p, __m128 __a)
+_mm_stream_ss(void *__p, __m128 __a)
 {
-  __builtin_ia32_movntss(__p, (__v4sf)__a);
+  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
 }
 
 #undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
@@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
+_mm256_stream_load_si256(const void *__V)
 {
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
@@ -3563,7 +3563,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 /// \param __b
 ///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
+_mm256_stream_si256(void *__a, __m256i __b)
 {
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
@@ -3583,7 +3583,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
+_mm256_stream_pd(void *__a, __m256d __b)
 {
   typedef __v4df __v4df_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
@@ -3604,7 +3604,7 @@ _mm256_stream_pd(double *__a, __m256d __b)
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
+_mm256_stream_ps(void *__p, __m256 __a)
 {
   typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
@@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
                                                         __m128d __a) {
   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
 }
@@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
                                                            __m128i __a) {
   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
 }
@@ -3983,8 +3983,8 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
 ///    A 32-bit integer containing the value to be stored.
 static __inline__ void
     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si32(int *__p, int __a) {
-  __builtin_ia32_movnti(__p, __a);
+    _mm_stream_si32(void *__p, int __a) {
+  __builtin_ia32_movnti((int *)__p, __a);
 }
 
 #ifdef __x86_64__
@@ -4003,8 +4003,8 @@ static __inline__ void
 ///    A 64-bit integer containing the value to be stored.
 static __inline__ void
     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si64(long long *__p, long long __a) {
-  __builtin_ia32_movnti64(__p, __a);
+    _mm_stream_si64(void *__p, long long __a) {
+  __builtin_ia32_movnti64((long long *)__p, __a);
 }
 #endif
 
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
@@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
 /// \returns A 128-bit integer vector containing the data stored at the
 ///    specified memory location.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(__m128i const *__V) {
+_mm_stream_load_si128(const void *__V) {
   return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
 }
 
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
@@ -2121,9 +2121,9 @@ _mm_storer_ps(float *__p, __m128 __a)
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS_MMX
-_mm_stream_pi(__m64 *__p, __m64 __a)
+_mm_stream_pi(void *__p, __m64 __a)
 {
-  __builtin_ia32_movntq(__p, __a);
+  __builtin_ia32_movntq((__m64 *)__p, __a);
 }
 
 /// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2140,7 +2140,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ps(float *__p, __m128 __a)
+_mm_stream_ps(void *__p, __m128 __a)
 {
   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1940,18 +1940,36 @@ void test_mm256_stream_pd(double* A, __m256d B) {
   _mm256_stream_pd(A, B);
 }
 
+void test_mm256_stream_pd_void(void *A, __m256d B) {
+  // CHECK-LABEL: test_mm256_stream_pd_void
+  // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_pd(A, B);
+}
+
 void test_mm256_stream_ps(float* A, __m256 B) {
   // CHECK-LABEL: test_mm256_stream_ps
   // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_ps(A, B);
 }
 
+void test_mm256_stream_ps_void(void *A, __m256 B) {
+  // CHECK-LABEL: test_mm256_stream_ps_void
+  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_ps(A, B);
+}
+
 void test_mm256_stream_si256(__m256i* A, __m256i B) {
   // CHECK-LABEL: test_mm256_stream_si256
   // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_si256(A, B);
 }
 
+void test_mm256_stream_si256_void(void *A, __m256i B) {
+  // CHECK-LABEL: test_mm256_stream_si256_void
+  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_si256(A, B);
+}
+
 __m256d test_mm256_sub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_sub_pd
   // CHECK: fsub <4 x double>
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1223,6 +1223,12 @@ __m256i test_mm256_stream_load_si256(__m256i const *a) {
   return _mm256_stream_load_si256(a);
 }
 
+__m256i test_mm256_stream_load_si256_void(const void *a) {
+  // CHECK-LABEL: test_mm256_stream_load_si256_void
+  // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
+  return _mm256_stream_load_si256(a);
+}
+
 __m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_sub_epi8
   // CHECK: sub <32 x i8>
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -601,6 +601,12 @@ void test_mm_stream_pi(__m64 *p, __m64 a) {
   _mm_stream_pi(p, a);
 }
 
+void test_mm_stream_pi_void(void *p, __m64 a) {
+  // CHECK-LABEL: test_mm_stream_pi_void
+  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  _mm_stream_pi(p, a);
+}
+
 __m64 test_mm_sub_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi8
   // CHECK: call x86_mmx @llvm.x86.mmx.psub.b
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
@@ -720,6 +720,12 @@ void test_mm_stream_ps(float*A, __m128 B) {
   _mm_stream_ps(A, B);
 }
 
+void test_mm_stream_ps_void(void *A, __m128 B) {
+  // CHECK-LABEL: test_mm_stream_ps_void
+  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_ps(A, B);
+}
+
 __m128 test_mm_sub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_sub_ps
   // CHECK: fsub <4 x float>
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1488,18 +1488,36 @@ void test_mm_stream_pd(double *A, __m128d B) {
   _mm_stream_pd(A, B);
 }
 
+void test_mm_stream_pd_void(void *A, __m128d B) {
+  // CHECK-LABEL: test_mm_stream_pd_void
+  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_pd(A, B);
+}
+
 void test_mm_stream_si32(int *A, int B) {
   // CHECK-LABEL: test_mm_stream_si32
   // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si32(A, B);
 }
 
+void test_mm_stream_si32_void(void *A, int B) {
+  // CHECK-LABEL: test_mm_stream_si32_void
+  // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+  _mm_stream_si32(A, B);
+}
+
 #ifdef __x86_64__
 void test_mm_stream_si64(long long *A, long long B) {
   // X64-LABEL: test_mm_stream_si64
   // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si64(A, B);
 }
+
+void test_mm_stream_si64_void(void *A, long long B) {
+  // X64-LABEL: test_mm_stream_si64_void
+  // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+  _mm_stream_si64(A, B);
+}
 #endif
 
 void test_mm_stream_si128(__m128i *A, __m128i B) {
@@ -1508,6 +1526,12 @@ void test_mm_stream_si128(__m128i *A, __m128i B) {
   _mm_stream_si128(A, B);
 }
 
+void test_mm_stream_si128_void(void *A, __m128i B) {
+  // CHECK-LABEL: test_mm_stream_si128_void
+  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_si128(A, B);
+}
+
 __m128i test_mm_sub_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sub_epi8
   // CHECK: sub <16 x i8>
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -358,6 +358,12 @@ __m128i test_mm_stream_load_si128(__m128i const *a) {
   return _mm_stream_load_si128(a);
 }
 
+__m128i test_mm_stream_load_si128_void(const void *a) {
+  // CHECK-LABEL: test_mm_stream_load_si128_void
+  // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
+  return _mm_stream_load_si128(a);
+}
+
 int test_mm_test_all_ones(__m128i x) {
   // CHECK-LABEL: test_mm_test_all_ones
   // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
diff --git a/clang/test/CodeGen/X86/sse4a-builtins.c b/clang/test/CodeGen/X86/sse4a-builtins.c
@@ -37,9 +37,23 @@ void test_mm_stream_sd(double *p, __m128d a) {
    _mm_stream_sd(p, a);
 }
 
+void test_mm_stream_sd_void(void *p, __m128d a) {
+  // CHECK-LABEL: test_mm_stream_sd_void
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: store double %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+   _mm_stream_sd(p, a);
+}
+
 void test_mm_stream_ss(float *p, __m128 a) {
   // CHECK-LABEL: test_mm_stream_ss
   // CHECK: extractelement <4 x float> %{{.*}}, i64 0
   // CHECK: store float %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_ss(p, a);
 }
+
+void test_mm_stream_ss_void(void *p, __m128 a) {
+  // CHECK-LABEL: test_mm_stream_ss_void
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: store float %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+  _mm_stream_ss(p, a);
+}

Original file line number	Diff line number	Diff line change
`@@ -155,9 +155,9 @@ _mm_insert_si64(__m128i __x, __m128i __y)`
`155`	`155`	`/// \param __a`
`156`	`156`	`/// The 64-bit double-precision floating-point register value to be stored.`
`157`	`157`	`static __inline__ void __DEFAULT_FN_ATTRS`
`158`		`-_mm_stream_sd(double *__p, __m128d __a)`
	`158`	`+_mm_stream_sd(void *__p, __m128d __a)`
`159`	`159`	`{`
`160`		`- __builtin_ia32_movntsd(__p, (__v2df)__a);`
	`160`	`+ __builtin_ia32_movntsd((double *)__p, (__v2df)__a);`
`161`	`161`	`}`
`162`	`162`
`163`	`163`	`/// Stores a 32-bit single-precision floating-point value in a 32-bit`
`@@ -173,9 +173,9 @@ _mm_stream_sd(double *__p, __m128d __a)`
`173`	`173`	`/// \param __a`
`174`	`174`	`/// The 32-bit single-precision floating-point register value to be stored.`
`175`	`175`	`static __inline__ void __DEFAULT_FN_ATTRS`
`176`		`-_mm_stream_ss(float *__p, __m128 __a)`
	`176`	`+_mm_stream_ss(void *__p, __m128 __a)`
`177`	`177`	`{`
`178`		`- __builtin_ia32_movntss(__p, (__v4sf)__a);`
	`178`	`+ __builtin_ia32_movntss((float *)__p, (__v4sf)__a);`
`179`	`179`	`}`
`180`	`180`
`181`	`181`	`#undef __DEFAULT_FN_ATTRS`
Original file line number	Diff line number	Diff line change
`@@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)`
`2979`	`2979`	`/// A pointer to the 32-byte aligned memory containing the vector to load.`
`2980`	`2980`	`/// \returns A 256-bit integer vector loaded from memory.`
`2981`	`2981`	`static __inline__ __m256i __DEFAULT_FN_ATTRS256`
`2982`		`-_mm256_stream_load_si256(__m256i const *__V)`
	`2982`	`+_mm256_stream_load_si256(const void *__V)`
`2983`	`2983`	`{`
`2984`	`2984`	`typedef __v4di __v4di_aligned __attribute__((aligned(32)));`
`2985`	`2985`	`return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);`
Original file line number	Diff line number	Diff line change
`@@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,`
`645`	`645`	`/// \returns A 128-bit integer vector containing the data stored at the`
`646`	`646`	`/// specified memory location.`
`647`	`647`	`static __inline__ __m128i __DEFAULT_FN_ATTRS`
`648`		`-_mm_stream_load_si128(__m128i const *__V) {`
	`648`	`+_mm_stream_load_si128(const void *__V) {`
`649`	`649`	`return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);`
`650`	`650`	`}`
`651`	`651`
Original file line number	Diff line number	Diff line change
`@@ -2121,9 +2121,9 @@ _mm_storer_ps(float *__p, __m128 __a)`
`2121`	`2121`	`/// \param __a`
`2122`	`2122`	`/// A 64-bit integer containing the value to be stored.`
`2123`	`2123`	`static __inline__ void __DEFAULT_FN_ATTRS_MMX`
`2124`		`-_mm_stream_pi(__m64 *__p, __m64 __a)`
	`2124`	`+_mm_stream_pi(void *__p, __m64 __a)`
`2125`	`2125`	`{`
`2126`		`- __builtin_ia32_movntq(__p, __a);`
	`2126`	`+ __builtin_ia32_movntq((__m64 *)__p, __a);`
`2127`	`2127`	`}`
`2128`	`2128`
`2129`	`2129`	`/// Moves packed float values from a 128-bit vector of [4 x float] to a`
`@@ -2140,7 +2140,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)`
`2140`	`2140`	`/// \param __a`
`2141`	`2141`	`/// A 128-bit vector of [4 x float] containing the values to be moved.`
`2142`	`2142`	`static __inline__ void __DEFAULT_FN_ATTRS`
`2143`		`-_mm_stream_ps(float *__p, __m128 __a)`
	`2143`	`+_mm_stream_ps(void *__p, __m128 __a)`
`2144`	`2144`	`{`
`2145`	`2145`	`__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);`
`2146`	`2146`	`}`