-
Notifications
You must be signed in to change notification settings - Fork 68
Address some TODONVDEC #960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
756060c
aa382fc
4723ecd
968ecb8
37c0b0d
633c4b3
85d58fb
a24501c
18c8080
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -216,12 +216,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() { | |
| // unclear. | ||
| flush(); | ||
| unmapPreviousFrame(); | ||
| NVDECCache::getCache(device_.index()) | ||
| .returnDecoder(&videoFormat_, std::move(decoder_)); | ||
| NVDECCache::getCache(device_).returnDecoder( | ||
| &videoFormat_, std::move(decoder_)); | ||
| } | ||
|
|
||
| if (videoParser_) { | ||
| // TODONVDEC P2: consider caching this? Does DALI do that? | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Answer: No, DALI doesn't cache the parser. |
||
| cuvidDestroyVideoParser(videoParser_); | ||
| videoParser_ = nullptr; | ||
| } | ||
|
|
@@ -362,11 +361,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) { | |
| } | ||
|
|
||
| if (!decoder_) { | ||
| decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat); | ||
| decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat); | ||
|
|
||
| if (!decoder_) { | ||
| // TODONVDEC P2: consider re-configuring an existing decoder instead of | ||
| // re-creating one. See docs, see DALI. | ||
| // re-creating one. See docs, see DALI. Re-configuration doesn't seem to | ||
| // be enabled in DALI by default. | ||
| decoder_ = createDecoder(videoFormat); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,14 +5,12 @@ | |
| // LICENSE file in the root directory of this source tree. | ||
|
|
||
| #include "src/torchcodec/_core/CUDACommon.h" | ||
| #include "src/torchcodec/_core/Cache.h" // for PerGpuCache | ||
|
|
||
| namespace facebook::torchcodec { | ||
|
|
||
| namespace { | ||
|
|
||
| // Pytorch can only handle up to 128 GPUs. | ||
| // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44 | ||
| const int MAX_CUDA_GPUS = 128; | ||
| // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching. | ||
| // Set to a positive number to have a cache of that size. | ||
| const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1; | ||
|
|
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB( | |
| } | ||
|
|
||
| UniqueNppContext getNppStreamContext(const torch::Device& device) { | ||
| torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device); | ||
| int deviceIndex = getDeviceIndex(device); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Going from |
||
|
|
||
| UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device); | ||
| if (nppCtx) { | ||
|
|
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) { | |
|
|
||
| nppCtx = std::make_unique<NppStreamContext>(); | ||
| cudaDeviceProp prop{}; | ||
| cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex); | ||
| cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex); | ||
| TORCH_CHECK( | ||
| err == cudaSuccess, | ||
| "cudaGetDeviceProperties failed: ", | ||
| cudaGetErrorString(err)); | ||
|
|
||
| nppCtx->nCudaDeviceId = nonNegativeDeviceIndex; | ||
| nppCtx->nCudaDeviceId = deviceIndex; | ||
| nppCtx->nMultiProcessorCount = prop.multiProcessorCount; | ||
| nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; | ||
| nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock; | ||
|
|
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape( | |
| } | ||
| } | ||
|
|
||
| int getDeviceIndex(const torch::Device& device) { | ||
| // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA | ||
| // libraries use int. So we use int, too. | ||
| int deviceIndex = static_cast<int>(device.index()); | ||
| TORCH_CHECK( | ||
| deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS, | ||
| "Invalid device index = ", | ||
| deviceIndex); | ||
|
|
||
| if (deviceIndex == -1) { | ||
| TORCH_CHECK( | ||
| cudaGetDevice(&deviceIndex) == cudaSuccess, | ||
| "Failed to get current CUDA device."); | ||
| } | ||
| return deviceIndex; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another fix: |
||
| } | ||
|
|
||
| } // namespace facebook::torchcodec | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,30 +95,16 @@ class PerGpuCache { | |
| std::vector<std::unique_ptr<Cache<T, D>>> cache_; | ||
| }; | ||
|
|
||
| // Note: this function is inline for convenience, not performance. Because the | ||
| // rest of this file is template functions, they must all be defined in this | ||
| // header. This function is not a template function, and should, in principle, | ||
| // be defined in a .cpp file to preserve the One Definition Rule. That's | ||
| // annoying for such a small amount of code, so we just inline it. If this file | ||
| // grows, and there are more such functions, we should break them out into a | ||
| // .cpp file. | ||
| inline torch::DeviceIndex getNonNegativeDeviceIndex( | ||
| const torch::Device& device) { | ||
| torch::DeviceIndex deviceIndex = device.index(); | ||
| // For single GPU machines libtorch returns -1 for the device index. So for | ||
| // that case we set the device index to 0. That's used in per-gpu cache | ||
| // implementation and during initialization of CUDA and FFmpeg contexts | ||
| // which require non negative indices. | ||
| deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: this was wrong! -1 should be mapped to the current device, not to 0. |
||
| TORCH_CHECK(deviceIndex >= 0, "Device index out of range"); | ||
| return deviceIndex; | ||
| } | ||
| // Forward declaration of getDeviceIndex which exists in CUDACommon.h | ||
| // This avoids circular dependency between Cache.h and CUDACommon.cpp which also | ||
| // needs to include Cache.h | ||
| int getDeviceIndex(const torch::Device& device); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not thrilled by this, happy to consider alternatives. We could:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Forward declarations of templates are possible, but awkward. But if
I prefer 2, but that might be a bigger refactor than this PR. So I'm good with this for now, and we can file a task for option 2, if you also agree it makes sense.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think 2 is reasonable; we could align both caches to just take an index as input. Let me follow up on that. |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think 2 is reasonable, we could align both caches to just take an index as input. Let me follow-up on that |
||
|
|
||
| template <typename T, typename D> | ||
| bool PerGpuCache<T, D>::addIfCacheHasCapacity( | ||
| const torch::Device& device, | ||
| element_type&& obj) { | ||
| torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device); | ||
| int deviceIndex = getDeviceIndex(device); | ||
| TORCH_CHECK( | ||
| static_cast<size_t>(deviceIndex) < cache_.size(), | ||
| "Device index out of range"); | ||
|
|
@@ -128,7 +114,7 @@ bool PerGpuCache<T, D>::addIfCacheHasCapacity( | |
| template <typename T, typename D> | ||
| typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get( | ||
| const torch::Device& device) { | ||
| torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device); | ||
| int deviceIndex = getDeviceIndex(device); | ||
| TORCH_CHECK( | ||
| static_cast<size_t>(deviceIndex) < cache_.size(), | ||
| "Device index out of range"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,9 +32,6 @@ static bool g_cuda = registerDeviceInterface( | |
| // from | ||
| // the cache. If the cache is empty we create a new cuda context. | ||
|
|
||
| // Pytorch can only handle up to 128 GPUs. | ||
| // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44 | ||
| const int MAX_CUDA_GPUS = 128; | ||
| // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching. | ||
| // Set to a positive number to have a cache of that size. | ||
| const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1; | ||
|
|
@@ -54,7 +51,7 @@ int getFlagsAVHardwareDeviceContextCreate() { | |
| UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) { | ||
| enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda"); | ||
| TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device"); | ||
| torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device); | ||
| int deviceIndex = getDeviceIndex(device); | ||
|
|
||
| UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device); | ||
| if (hardwareDeviceCtx) { | ||
|
|
@@ -68,9 +65,9 @@ UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) { | |
| // So we ensure the deviceIndex is not negative. | ||
| // We set the device because we may be called from a different thread than | ||
| // the one that initialized the cuda context. | ||
| cudaSetDevice(nonNegativeDeviceIndex); | ||
| cudaSetDevice(deviceIndex); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here as well: |
||
| AVBufferRef* hardwareDeviceCtxRaw = nullptr; | ||
| std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex); | ||
| std::string deviceOrdinal = std::to_string(deviceIndex); | ||
|
|
||
| int err = av_hwdevice_ctx_create( | ||
| &hardwareDeviceCtxRaw, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |
| #include <mutex> | ||
|
|
||
| #include <cuda.h> | ||
| #include <torch/types.h> | ||
| #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h" | ||
| #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h" | ||
|
|
||
|
|
@@ -36,7 +37,7 @@ using UniqueCUvideodecoder = | |
| // per GPU device, and it is accessed through the static getCache() method. | ||
| class NVDECCache { | ||
| public: | ||
| static NVDECCache& getCache(int deviceIndex); | ||
| static NVDECCache& getCache(const torch::Device& device); | ||
|
|
||
| // Get decoder from cache - returns nullptr if none available | ||
| UniqueCUvideodecoder getDecoder(CUVIDEOFORMAT* videoFormat); | ||
|
|
@@ -68,11 +69,6 @@ class NVDECCache { | |
| CacheKey(const CacheKey&) = default; | ||
| CacheKey& operator=(const CacheKey&) = default; | ||
|
|
||
| // TODONVDEC P2: we only implement operator< which is enough for std::map, | ||
| // but: | ||
| // - we should consider using std::unordered_map | ||
| // - we should consider a more sophisticated and potentially less strict | ||
| // cache key comparison logic | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I realized that using an On the potentially less strict cache logic: I looked into more details at DALI's implementation and they do the same thing, modulo decoder re-configuration, which is tracked in another TODO. So we can remove this one. |
||
| bool operator<(const CacheKey& other) const { | ||
| return std::tie( | ||
| codecType, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,9 +41,6 @@ def unsplit_device_str(device_str: str) -> str: | |
| # It is used: | ||
| # - before calling `.to(device)` where device can't be "cuda:0:beta" | ||
| # - before calling add_video_stream(device=device, device_variant=device_variant) | ||
| # | ||
| # TODONVDEC P2: Find a less clunky way to test the BETA CUDA interface. It | ||
| # will ultimately depend on how we want to publicly expose it. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was addressed by #959 and there is another TODO to remove the legacy use of |
||
| if device_str == "cuda:0:beta": | ||
| return "cuda", "beta" | ||
| else: | ||
|
|
@@ -750,7 +747,7 @@ def sample_format(self) -> str: | |
|
|
||
|
|
||
| def supports_approximate_mode(asset: TestVideo) -> bool: | ||
| # TODONVDEC P2: open an issue about his. That's actually not related to | ||
| # NVDEC at all, those don't support approximate mode because they don't set | ||
| # a duration. CPU decoder fails too! | ||
| # Those are missing the `duration` field so they fail in approximate mode (on all devices). | ||
| # TODO: we should address this, see | ||
| # https://github.com/meta-pytorch/torchcodec/issues/945 | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This isn't an NVDEC TODO, it's a general TODO. |
||
| return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Above and everywhere else in the PR, anything related to "device index" relates to the creation of
`getDeviceIndex()`, which is now in `CUDACommon.cpp` and replaces the old (buggy!) `getNonNegativeDeviceIndex`.