-
Notifications
You must be signed in to change notification settings - Fork 10.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[CUDA/NVPTX] Improve handling of memcpy for -Os compilations.
We had some instances when LLVM would not inline fixed-count memcpy and ended up attempting to lower it a a libcall, which would not work on NVPTX as there's no standard library to call. The patch relaxes the threshold used for -Os compilation so we're always allowed to inline memory copy functions. Differential Revision: https://reviews.llvm.org/D158226
- Loading branch information
Showing
2 changed files
with
64 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ | ||
// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s | ||
// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ | ||
// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s | ||
#include "Inputs/cuda.h" | ||
|
||
// PTX-LABEL: .func _Z12copy_genericPvPKv( | ||
void __device__ copy_generic(void *dest, const void *src) { | ||
__builtin_memcpy(dest, src, 32); | ||
// PTX: ld.u8 | ||
// PTX: st.u8 | ||
} | ||
|
||
// PTX-LABEL: .entry _Z11copy_globalPvS_( | ||
void __global__ copy_global(void *dest, void * src) { | ||
__builtin_memcpy(dest, src, 32); | ||
// PTX: ld.global.u8 | ||
// PTX: st.global.u8 | ||
} | ||
|
||
struct S { | ||
int data[8]; | ||
}; | ||
|
||
// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( | ||
void __global__ copy_param_to_global(S *global, S param) { | ||
__builtin_memcpy(global, ¶m, sizeof(S)); | ||
// PTX: ld.param.u32 | ||
// PTX: st.global.u32 | ||
} | ||
|
||
// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( | ||
void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, | ||
S param) { | ||
__builtin_memcpy(local, ¶m, sizeof(S)); | ||
// PTX: ld.param.u32 | ||
// PTX: st.local.u32 | ||
} | ||
|
||
// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( | ||
void __device__ copy_local_to_generic(S *generic, | ||
__attribute__((address_space(5))) S *src) { | ||
__builtin_memcpy(generic, src, sizeof(S)); | ||
// PTX: ld.local.u32 | ||
// PTX: st.u32 | ||
} | ||
|
||
__shared__ S shared; | ||
|
||
// PTX-LABEL: .entry _Z20copy_param_to_shared1S( | ||
void __global__ copy_param_to_shared( S param) { | ||
__builtin_memcpy(&shared, ¶m, sizeof(S)); | ||
// PTX: ld.param.u32 | ||
// PTX: st.shared.u32 | ||
} | ||
|
||
void __device__ copy_shared_to_generic(S *generic) { | ||
__builtin_memcpy(generic, &shared, sizeof(S)); | ||
// PTX: ld.shared.u32 | ||
// PTX: st.u32 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters