diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll index 05a0e39d4a715..72814a117ddc3 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca,vector-combine < %s | FileCheck -check-prefix=OPT %s define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { ; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( ; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; OPT-NEXT: [[ENTRY:.*:]] ; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> ; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 ; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 ; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 @@ -38,262 +38,246 @@ define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x ; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 ; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 ; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> ; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 17 ; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 ; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 ; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 ; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 ; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 ; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 ; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 ; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 ; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 ; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 ; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 ; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 ; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 ; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 -; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> ; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 ; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 ; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 ; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 ; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 ; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 ; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 ; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 ; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 ; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 ; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 ; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 ; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 ; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 46 ; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 -; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> ; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 ; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 ; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 ; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 ; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 ; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 ; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 ; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 ; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 ; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 ; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 ; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 ; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 ; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 ; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 -; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> ; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 ; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 ; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 ; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 ; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 ; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 ; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 ; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 ; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 ; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 ; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 ; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 ; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 ; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 78 ; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 -; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> ; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 ; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 ; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 ; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 84 ; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 ; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 ; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 ; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 ; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 ; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 ; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 ; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 ; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 ; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 ; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 -; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> ; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 ; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 ; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 ; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 ; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 ; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 ; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 ; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 ; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 ; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 ; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 ; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 ; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 ; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 ; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 -; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> ; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 ; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 ; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 ; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 ; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 ; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 118 ; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 ; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 ; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 ; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 ; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 ; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 ; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 ; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 ; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 -; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 -; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 -; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 -; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 -; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 -; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 -; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 -; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 -; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 -; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 -; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 -; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 -; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 -; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 -; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 -; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 -; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 -; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 -; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 -; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 -; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90 -; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 -; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 -; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 -; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 -; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 -; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 -; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 -; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 -; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 -; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 -; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP287:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> ; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] ; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 ; OPT-NEXT: ret void @@ -303,19 +287,19 @@ entry: %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep1, align 16 %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep2, align 16 %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep3, align 16 %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep4, align 16 %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep5, align 16 %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep6, align 16 %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + store <16 x i8> %in, ptr addrspace(5) %gep7, align 16 %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 %sum = add <16 x i8> %load, %add store <16 x i8> %sum, ptr addrspace(3) %out, align 16