Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/torchcodec/decoders/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
function(make_torchcodec_library library_name ffmpeg_target)
set(
sources
CUDACommon.h
CUDACommon.cpp
FFMPEGCommon.h
FFMPEGCommon.cpp
VideoDecoder.h
Expand Down
147 changes: 147 additions & 0 deletions src/torchcodec/decoders/_core/CUDACommon.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "src/torchcodec/decoders/_core/CUDACommon.h"

// This source file is organized in the following way:
//
// | <general includes>
// |
// | <CUDA ifdef guard>
// | <CUDA specific includes>
// | <CUDA specific code>
// | </CUDA ifdef guard>
// |
// | <API for general code to access CUDA specific behavior.>
//
// If code needs to access definitions in the CUDA specific includes, then it is
// CUDA specific code, and belongs inside of the guard. If that behavior needs
// to be accessible to general code, then it should be added to the API for
// general code.

#ifdef ENABLE_CUDA

#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <npp.h>

extern "C" {
#include <libavutil/hwcontext_cuda.h>
#include <libavutil/pixdesc.h>
}

namespace facebook::torchcodec {
namespace {

// Creates an FFmpeg CUDA hardware device context.
// Returns a new AVBufferRef owned by the caller, who must release it with
// av_buffer_unref(). Throws (via TORCH_CHECK) if no CUDA hwdevice type is
// available or if context creation fails.
AVBufferRef* getCudaContext() {
  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
  // Initialize to nullptr so a failed av_hwdevice_ctx_create() can never
  // leave us holding an indeterminate pointer.
  AVBufferRef* hw_device_ctx = nullptr;
  int err = av_hwdevice_ctx_create(
      &hw_device_ctx,
      type,
      nullptr,
      nullptr,
      // AV_CUDA_USE_CURRENT_CONTEXT was introduced in 58.26.100:
      // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
      AV_CUDA_USE_CURRENT_CONTEXT
#else
      0
#endif
  );
  // Express the success condition directly instead of TORCH_CHECK(false, ...)
  // inside an if; also add a separator before the FFmpeg error string so the
  // two message parts don't run together.
  TORCH_CHECK(
      err >= 0,
      "Failed to create specified HW device: ",
      getFFMPEGErrorStringFromErrorCode(err));
  return hw_device_ctx;
}

// Allocates an uninitialized tensor of the given shape on the given device,
// with the default strided layout. Element type defaults to uint8, matching
// the RGB output of the NV12 conversion below.
torch::Tensor allocateDeviceTensor(
    at::IntArrayRef shape,
    torch::Device device,
    const torch::Dtype dtype = torch::kUInt8) {
  auto tensorOptions = torch::TensorOptions()
                           .layout(torch::kStrided)
                           .device(device)
                           .dtype(dtype);
  return torch::empty(shape, tensorOptions);
}

} // namespace

} // namespace facebook::torchcodec

#endif // ENABLE_CUDA

// API implementations for general code to access CUDA specific behaviors.

namespace facebook::torchcodec {

// Initializes CUDA for `device` and returns an FFmpeg CUDA hardware device
// context suitable for assignment to AVCodecContext::hw_device_ctx.
// The caller owns the returned reference. Throws std::runtime_error when the
// build has no CUDA support, and TORCH_CHECKs that `device` is a CUDA device.
AVBufferRef* initializeCudaContext(const torch::Device& device) {
#ifdef ENABLE_CUDA
  TORCH_CHECK(device.type() == torch::DeviceType::CUDA, "Invalid device type.");

  // We create a small tensor using pytorch to initialize the cuda context.
  torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device));
  // getCudaContext() already returns a reference owned by the caller
  // (av_hwdevice_ctx_create hands out a new ref). Wrapping it in
  // av_buffer_ref() — as the previous code did — created a second reference
  // whose first ref was dropped on the floor, leaking the device context.
  return getCudaContext();
#else
  throw std::runtime_error(
      "CUDA support is not enabled in this build of TorchCodec.");
#endif
}

// Converts a decoded NV12 frame residing in CUDA memory (`src`) into an RGB
// uint8 tensor on the device selected in `options`, using NPP.
// Output is HWC by default, permuted to a CHW view when
// options.dimensionOrder == "NCHW". Output size defaults to the coded frame
// size unless options.width/height request a different size.
// Throws std::runtime_error when the build has no CUDA support.
torch::Tensor convertFrameToTensorUsingCuda(
    const AVCodecContext* codecContext,
    const VideoDecoder::VideoStreamDecoderOptions& options,
    const AVFrame* src) {
#ifdef ENABLE_CUDA
  NVTX_SCOPED_RANGE("convertFrameUsingCuda");
  TORCH_CHECK(
      src->format == AV_PIX_FMT_CUDA,
      "Expected format to be AV_PIX_FMT_CUDA, got " +
          std::string(
              av_get_pix_fmt_name(static_cast<AVPixelFormat>(src->format))));
  int width = options.width.value_or(codecContext->width);
  int height = options.height.value_or(codecContext->height);
  NppiSize oSizeROI = {width, height};
  // NV12 keeps the Y plane in data[0] and the interleaved UV plane in
  // data[1]; NPP consumes them as a two-pointer array.
  Npp8u* input[2] = {
      reinterpret_cast<Npp8u*>(src->data[0]),
      reinterpret_cast<Npp8u*>(src->data[1])};
  torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device);
  auto start = std::chrono::high_resolution_clock::now();
  // dst.stride(0) is int64_t while NPP takes an int step; the row stride of
  // an h x w x 3 uint8 tensor fits comfortably, so narrow explicitly.
  NppStatus status = nppiNV12ToRGB_8u_P2C3R(
      input,
      src->linesize[0],
      static_cast<Npp8u*>(dst.data_ptr()),
      static_cast<int>(dst.stride(0)),
      oSizeROI);
  TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::micro> duration = end - start;
  // VLOG terminates the line itself; std::endl would add a redundant flush.
  VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
          << " took: " << duration.count() << "us";
  if (options.dimensionOrder == "NCHW") {
    // The docs guarantee this to return a view:
    // https://pytorch.org/docs/stable/generated/torch.permute.html
    dst = dst.permute({2, 0, 1});
  }
  return dst;
#else
  throw std::runtime_error(
      "CUDA support is not enabled in this build of TorchCodec.");
#endif
}

} // namespace facebook::torchcodec
45 changes: 45 additions & 0 deletions src/torchcodec/decoders/_core/CUDACommon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <torch/torch.h>
#include <torch/types.h>
#include "src/torchcodec/decoders/_core/VideoDecoder.h"

#ifdef ENABLE_NVTX
#include <nvtx3/nvtx3.hpp>
#endif

// The API for general code to access CUDA specific behaviors. CUDA specific
// behaviors require CUDA specific definitions which are only available on
// systems with CUDA installed. Hence, CUDA specific behaviors have to be
// guarded with ifdefs.
//
// In order to prevent ifdefs in general code, we create an API with a function
// for each behavior we need. General code can call the API, as the correct
// guards happen internally. General code still needs to check in general code
// if CUDA is being used, as the functions will throw an exception if CUDA is
// not available.

namespace facebook::torchcodec {

// Opens an NVTX profiling range covering the enclosing scope when NVTX
// support is compiled in; expands to a no-op statement otherwise.
// `Annotation` is the string that labels the range in profiler traces.
#ifdef ENABLE_NVTX
#define NVTX_SCOPED_RANGE(Annotation) nvtx3::scoped_range loop{Annotation}
#else
#define NVTX_SCOPED_RANGE(Annotation) \
do { \
} while (0)
#endif

// Initializes CUDA for `device` and returns a caller-owned FFmpeg CUDA
// hardware device context. Throws if this build has no CUDA support.
AVBufferRef* initializeCudaContext(const torch::Device& device);

// Converts a decoded NV12 CUDA frame `src` into an RGB uint8 tensor on the
// device selected in `options`. Throws if this build has no CUDA support.
torch::Tensor convertFrameToTensorUsingCuda(
const AVCodecContext* codecContext,
const VideoDecoder::VideoStreamDecoderOptions& options,
const AVFrame* src);

} // namespace facebook::torchcodec
127 changes: 7 additions & 120 deletions src/torchcodec/decoders/_core/VideoDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,14 @@
#include <stdexcept>
#include <string_view>

#ifdef ENABLE_CUDA
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <npp.h>
#ifdef ENABLE_NVTX
#include <nvtx3/nvtx3.hpp>
#endif
#endif
#include "src/torchcodec/decoders/_core/CUDACommon.h"

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavfilter/buffersink.h>
#include <libavfilter/buffersrc.h>
#include <libavformat/avformat.h>
#include <libavutil/imgutils.h>
#include <libavutil/pixdesc.h>
#ifdef ENABLE_CUDA
#include <libavutil/hwcontext_cuda.h>
#endif
}

namespace facebook::torchcodec {
Expand Down Expand Up @@ -107,87 +96,6 @@ std::vector<std::string> splitStringWithDelimiters(
return result;
}

#ifdef ENABLE_CUDA

// Creates an FFmpeg CUDA hardware device context; the returned reference is
// owned by the caller. (This is the pre-refactor copy removed from
// VideoDecoder.cpp by this diff.)
AVBufferRef* getCudaContext() {
enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
int err = 0;
AVBufferRef* hw_device_ctx;
err = av_hwdevice_ctx_create(
&hw_device_ctx,
type,
nullptr,
nullptr,
// Introduced in 58.26.100:
// https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
AV_CUDA_USE_CURRENT_CONTEXT
#else
0
#endif
);
if (err < 0) {
TORCH_CHECK(
false,
"Failed to create specified HW device",
getFFMPEGErrorStringFromErrorCode(err));
}
return hw_device_ctx;
}

// Allocates an uninitialized strided tensor of the given shape on the given
// device; element type defaults to uint8. (Pre-refactor copy removed from
// VideoDecoder.cpp by this diff.)
torch::Tensor allocateDeviceTensor(
at::IntArrayRef shape,
torch::Device device,
const torch::Dtype dtype = torch::kUInt8) {
return torch::empty(
shape,
torch::TensorOptions()
.dtype(dtype)
.layout(torch::kStrided)
.device(device));
}

// Converts a decoded NV12 CUDA frame into an RGB uint8 tensor on the device
// selected in `options`, via NPP. (Pre-refactor copy removed from
// VideoDecoder.cpp by this diff.)
torch::Tensor convertFrameToTensorUsingCUDA(
const AVCodecContext* codecContext,
const VideoDecoder::VideoStreamDecoderOptions& options,
const AVFrame* src) {
TORCH_CHECK(
src->format == AV_PIX_FMT_CUDA,
"Expected format to be AV_PIX_FMT_CUDA, got " +
std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
// Output defaults to the coded frame size unless an explicit size was set.
int width = options.width.value_or(codecContext->width);
int height = options.height.value_or(codecContext->height);
NppStatus status;
NppiSize oSizeROI;
oSizeROI.width = width;
oSizeROI.height = height;
// NV12: Y plane in data[0], interleaved UV plane in data[1].
Npp8u* input[2];
input[0] = (Npp8u*)src->data[0];
input[1] = (Npp8u*)src->data[1];
torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device);
auto start = std::chrono::high_resolution_clock::now();
status = nppiNV12ToRGB_8u_P2C3R(
input,
src->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> duration = end - start;
VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
<< " took: " << duration.count() << "us" << std::endl;
if (options.dimensionOrder == "NCHW") {
// The docs guarantee this to return a view:
// https://pytorch.org/docs/stable/generated/torch.permute.html
dst = dst.permute({2, 0, 1});
}
return dst;
}

#endif

} // namespace

VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions(
Expand Down Expand Up @@ -490,21 +398,12 @@ void VideoDecoder::addVideoStreamDecoder(
TORCH_CHECK_EQ(retVal, AVSUCCESS);

if (options.device.type() == torch::DeviceType::CUDA) {
#ifdef ENABLE_CUDA
// We create a small tensor using pytorch to initialize the cuda context.
torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
{1},
torch::TensorOptions().dtype(torch::kUInt8).device(options.device));
codecContext->hw_device_ctx = av_buffer_ref(getCudaContext());
codecContext->hw_device_ctx = initializeCudaContext(options.device);

TORCH_INTERNAL_ASSERT(
codecContext->hw_device_ctx,
"Failed to create/reference the CUDA HW device context for index=" +
std::to_string(options.device.index()) + ".");
#else
throw std::runtime_error(
"CUDA support is not enabled in this build of TorchCodec.");
#endif
}

retVal = avcodec_open2(streamInfo.codecContext.get(), codec, nullptr);
Expand Down Expand Up @@ -765,9 +664,8 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {

VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
std::function<bool(int, AVFrame*)> filterFunction) {
#ifdef ENABLE_NVTX
nvtx3::scoped_range loop{"decodeOneFrame"};
#endif
NVTX_SCOPED_RANGE("decodeOneFrame");

if (activeStreamIndices_.size() == 0) {
throw std::runtime_error("No active streams configured.");
}
Expand Down Expand Up @@ -856,9 +754,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
continue;
}
{
#ifdef ENABLE_NVTX
nvtx3::scoped_range loop{"avcodec_send_packet"};
#endif
NVTX_SCOPED_RANGE("avcodec_send_packet");
ffmpegStatus = avcodec_send_packet(
streams_[packet->stream_index].codecContext.get(), packet.get());
}
Expand Down Expand Up @@ -912,17 +808,8 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
output.frame =
convertFrameToTensorUsingFilterGraph(streamIndex, frame.get());
} else if (streamInfo.options.device.is_cuda()) {
#ifdef ENABLE_CUDA
{
#ifdef ENABLE_NVTX
nvtx3::scoped_range loop{"convertFrameUsingCuda"};
#endif
output.frame = convertFrameToTensorUsingCUDA(
streamInfo.codecContext.get(), streamInfo.options, frame.get());
}
#else
throw std::runtime_error("CUDA is not enabled in this build.");
#endif // ENABLE_CUDA
output.frame = convertFrameToTensorUsingCuda(
streamInfo.codecContext.get(), streamInfo.options, frame.get());
}
} else if (output.streamType == AVMEDIA_TYPE_AUDIO) {
// TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement
Expand Down
Loading