From e6d8e4b234ac91493c32f886ae96ddab626a0977 Mon Sep 17 00:00:00 2001
From: WANG Rui <wangrui@loongson.cn>
Date: Wed, 10 Sep 2025 17:11:10 +0800
Subject: [PATCH] [clang][LoongArch] Introduce LASX and LSX conversion
 intrinsics

This patch introduces the LASX and LSX conversion intrinsics:

- __m256 __lasx_cast_128_s (__m128)
- __m256d __lasx_cast_128_d (__m128d)
- __m256i __lasx_cast_128 (__m128i)
- __m256 __lasx_concat_128_s (__m128, __m128)
- __m256d __lasx_concat_128_d (__m128, __m128d)
- __m256i __lasx_concat_128 (__m128, __m128i)
- __m128 __lasx_extract_128_lo_s (__m256)
- __m128d __lasx_extract_128_lo_d (__m256d)
- __m128i __lasx_extract_128_lo (__m256i)
- __m128 __lasx_extract_128_hi_s (__m256)
- __m128d __lasx_extract_128_hi_d (__m256d)
- __m128i __lasx_extract_128_hi (__m256i)
- __m256 __lasx_insert_128_lo_s (__m256, __m128)
- __m256d __lasx_insert_128_lo_d (__m256d, __m128d)
- __m256i __lasx_insert_128_lo (__m256i, __m128i)
- __m256 __lasx_insert_128_hi_s (__m256, __m128)
- __m256d __lasx_insert_128_hi_d (__m256d, __m128d)
- __m256i __lasx_insert_128_hi (__m256i, __m128i)
---
 .../clang/Basic/BuiltinsLoongArchLASX.def     |  19 ++
 clang/lib/Basic/Targets/LoongArch.cpp         |   1 +
 clang/lib/Headers/lasxintrin.h                | 113 +++++++++++
 .../CodeGen/LoongArch/lasx/builtin-alias.c    | 171 +++++++++++++++++
 clang/test/CodeGen/LoongArch/lasx/builtin.c   | 175 ++++++++++++++++++
 clang/test/Preprocessor/init-loongarch.c      |   3 +
 6 files changed, 482 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
index c4ea46a3bc5b5..a5eee613d5c9e 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
@@ -986,3 +986,22 @@ TARGET_BUILTIN(__builtin_lasx_xbnz_b, "iV32Uc", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_h, "iV16Us", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_w, "iV8Ui", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_d, "iV4ULLi", "nc", "lasx")
+
+TARGET_BUILTIN(__builtin_lasx_cast_128_s, "V8fV4f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_cast_128_d, "V4dV2d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_cast_128, "V4LLiV2LLi", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_concat_128_s, "V8fV4fV4f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_concat_128_d, "V4dV2dV2d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_concat_128, "V4LLiV2LLiV2LLi", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo_s, "V4fV8f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo_d, "V2dV4d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo, "V2LLiV4LLi", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi_s, "V4fV8f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi_d, "V2dV4d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi, "V2LLiV4LLi", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo_s, "V8fV8fV4f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo_d, "V4dV4dV2d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo, "V4LLiV4LLiV2LLi", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi_s, "V8fV8fV4f", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi_d, "V4dV4dV2d", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi, "V4LLiV4LLiV2LLi", "nc", "lasx")
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 8e29bb745734b..5863af3f3b920 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -242,6 +242,7 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__loongarch_simd_width", "256");
     Builder.defineMacro("__loongarch_sx", Twine(1));
     Builder.defineMacro("__loongarch_asx", Twine(1));
+    Builder.defineMacro("__loongarch_asx_sx_conv", Twine(1));
   } else if (HasFeatureLSX) {
     Builder.defineMacro("__loongarch_simd_width", "128");
     Builder.defineMacro("__loongarch_sx", Twine(1));
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
index 85020d82829e2..83cc4288a990c 100644
--- a/clang/lib/Headers/lasxintrin.h
+++ b/clang/lib/Headers/lasxintrin.h
@@ -10,6 +10,8 @@
 #ifndef _LOONGSON_ASXINTRIN_H
 #define _LOONGSON_ASXINTRIN_H 1
 
+#include <lsxintrin.h>
+
 #if defined(__loongarch_asx)
 
 typedef signed char v32i8 __attribute__((vector_size(32), aligned(32)));
@@ -3882,5 +3884,116 @@ extern __inline
 
 #define __lasx_xvrepli_w(/*si10*/ _1) ((__m256i)__builtin_lasx_xvrepli_w((_1)))
 
+#if defined(__loongarch_asx_sx_conv)
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__,
+                   __artificial__)) __m256 __lasx_cast_128_s(__m128 _1) {
+  return (__m256)__builtin_lasx_cast_128_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_cast_128_d(__m128d _1) {
+  return (__m256d)__builtin_lasx_cast_128_d((v2f64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+    __lasx_cast_128(__m128i _1) {
+  return (__m256i)__builtin_lasx_cast_128((v2i64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_concat_128_s(__m128 _1, __m128 _2) {
+  return (__m256)__builtin_lasx_concat_128_s((v4f32)_1, (v4f32)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_concat_128_d(__m128d _1, __m128d _2) {
+  return (__m256d)__builtin_lasx_concat_128_d((v2f64)_1, (v2f64)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+    __lasx_concat_128(__m128i _1, __m128i _2) {
+  return (__m256i)__builtin_lasx_concat_128((v2i64)_1, (v2i64)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lasx_extract_128_lo_s(__m256 _1) {
+  return (__m128)__builtin_lasx_extract_128_lo_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lasx_extract_128_lo_d(__m256d _1) {
+  return (__m128d)__builtin_lasx_extract_128_lo_d((v4f64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
+    __lasx_extract_128_lo(__m256i _1) {
+  return (__m128i)__builtin_lasx_extract_128_lo((v4i64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lasx_extract_128_hi_s(__m256 _1) {
+  return (__m128)__builtin_lasx_extract_128_hi_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lasx_extract_128_hi_d(__m256d _1) {
+  return (__m128d)__builtin_lasx_extract_128_hi_d((v4f64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
+    __lasx_extract_128_hi(__m256i _1) {
+  return (__m128i)__builtin_lasx_extract_128_hi((v4i64)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_insert_128_lo_s(__m256 _1, __m128 _2) {
+  return (__m256)__builtin_lasx_insert_128_lo_s((v8f32)_1, (v4f32)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_insert_128_lo_d(__m256d _1, __m128d _2) {
+  return (__m256d)__builtin_lasx_insert_128_lo_d((v4f64)_1, (v2f64)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+    __lasx_insert_128_lo(__m256i _1, __m128i _2) {
+  return (__m256i)__builtin_lasx_insert_128_lo((v4i64)_1, (v2i64)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_insert_128_hi_s(__m256 _1, __m128 _2) {
+  return (__m256)__builtin_lasx_insert_128_hi_s((v8f32)_1, (v4f32)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_insert_128_hi_d(__m256d _1, __m128d _2) {
+  return (__m256d)__builtin_lasx_insert_128_hi_d((v4f64)_1, (v2f64)_2);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+    __lasx_insert_128_hi(__m256i _1, __m128i _2) {
+  return (__m256i)__builtin_lasx_insert_128_hi((v4i64)_1, (v2i64)_2);
+}
+
+#endif /* defined(__loongarch_asx_sx_conv).  */
 #endif /* defined(__loongarch_asx).  */
 #endif /* _LOONGSON_ASXINTRIN_H.  */
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
index 03a746c966cdd..4f289ca84c271 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
@@ -7120,6 +7120,177 @@ v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); }
 // CHECK-NEXT:    ret void
 //
 v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); }
+// CHECK-LABEL: define dso_local void @cast_128_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 cast_128_s(v4f32 _1) { return __lasx_cast_128_s(_1); }
+// CHECK-LABEL: define dso_local void @cast_128_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 cast_128_d(v2f64 _1) { return __lasx_cast_128_d(_1); }
+// CHECK-LABEL: define dso_local void @cast_128(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]])
+// CHECK-NEXT:    store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 cast_128(v2i64 _1) { return __lasx_cast_128(_1); }
+// CHECK-LABEL: define dso_local void @concat_128_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __lasx_concat_128_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @concat_128_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __lasx_concat_128_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @concat_128(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 concat_128(v2i64 _1, v2i64 _2) { return __lasx_concat_128(_1, _2); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo_s(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 extract_128_lo_s(v8f32 _1) { return __lasx_extract_128_lo_s(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo_d(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 extract_128_lo_d(v4f64 _1) { return __lasx_extract_128_lo_d(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2i64 extract_128_lo(v4i64 _1) { return __lasx_extract_128_lo(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi_s(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 extract_128_hi_s(v8f32 _1) { return __lasx_extract_128_hi_s(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi_d(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 extract_128_hi_d(v4f64 _1) { return __lasx_extract_128_hi_d(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2i64 extract_128_hi(v4i64 _1) { return __lasx_extract_128_hi(_1); }
+// CHECK-LABEL: define dso_local void @insert_128_lo_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_lo_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_lo_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_lo_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_lo(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __lasx_insert_128_lo(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_hi_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_hi_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __lasx_insert_128_hi(_1, _2); }
 //.
 // CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c
index 700e845cd662a..373efefc9b264 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c
@@ -1,6 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -O2 -emit-llvm %s -o - | FileCheck %s
 
+typedef long long v2i64 __attribute__ ((vector_size(16), aligned(16)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef double v2f64 __attribute__((vector_size(16), aligned(16)));
+
 typedef signed char v32i8 __attribute__((vector_size(32), aligned(32)));
 typedef signed char v32i8_b __attribute__((vector_size(32), aligned(1)));
 typedef unsigned char v32u8 __attribute__((vector_size(32), aligned(32)));
@@ -7142,6 +7146,177 @@ v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); }
 // CHECK-NEXT:    ret void
 //
 v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); }
+// CHECK-LABEL: define dso_local void @cast_128_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 cast_128_s(v4f32 _1) { return __builtin_lasx_cast_128_s(_1); }
+// CHECK-LABEL: define dso_local void @cast_128_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 cast_128_d(v2f64 _1) { return __builtin_lasx_cast_128_d(_1); }
+// CHECK-LABEL: define dso_local void @cast_128(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]])
+// CHECK-NEXT:    store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 cast_128(v2i64 _1) { return __builtin_lasx_cast_128(_1); }
+// CHECK-LABEL: define dso_local void @concat_128_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __builtin_lasx_concat_128_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @concat_128_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __builtin_lasx_concat_128_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @concat_128(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 concat_128(v2i64 _1, v2i64 _2) { return __builtin_lasx_concat_128(_1, _2); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo_s(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 extract_128_lo_s(v8f32 _1) { return __builtin_lasx_extract_128_lo_s(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo_d(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 extract_128_lo_d(v4f64 _1) { return __builtin_lasx_extract_128_lo_d(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_lo(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2i64 extract_128_lo(v4i64 _1) { return __builtin_lasx_extract_128_lo(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi_s(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 extract_128_hi_s(v8f32 _1) { return __builtin_lasx_extract_128_hi_s(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi_d(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 extract_128_hi_d(v4f64 _1) { return __builtin_lasx_extract_128_hi_d(_1); }
+// CHECK-LABEL: define dso_local i128 @extract_128_hi(
+// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2i64 extract_128_hi(v4i64 _1) { return __builtin_lasx_extract_128_hi(_1); }
+// CHECK-LABEL: define dso_local void @insert_128_lo_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_lo_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_lo_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_lo_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_lo(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_lo(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi_s(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_hi_s(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi_d(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_hi_d(_1, _2); }
+// CHECK-LABEL: define dso_local void @insert_128_hi(
+// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]])
+// CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT:    ret void
+//
+v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_hi(_1, _2); }
 //.
 // CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index fd7ce2073a512..b2c0b51464a80 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -925,6 +925,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
+// MLSX-NOT: #define __loongarch_asx_sx_conv
 // MLSX: #define __loongarch_simd_width 128
 // MLSX: #define __loongarch_sx 1
 
@@ -937,6 +938,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
+// MLASX: #define __loongarch_asx_sx_conv 1
 // MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
 
@@ -953,5 +955,6 @@
 // RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
+// MNO-LSX-NOT: #define __loongarch_asx_sx_conv
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx