Skip to content

Commit

Permalink
hw_context_vulkan: Add support for transfers to cuda
Browse files Browse the repository at this point in the history
Although transfers from cuda to vulkan are currently implemented
as a map rather than a transfer, this approach doesn't work in
the other direction; it relies on being able to manually initialise
the destination buffer pool which cannot cleanly be done from
hwcontext_vulkan for a cuda destination.

Rather than trying to work-around that, this change implements
the transfer from vulkan to cuda as an actual transfer, using
vf_hwupload[_cuda]. The change has three main parts.

1) hwcontext_vulkan: We implement a special case for transfer_to
for a cuda destination. This is another GPU memcpy, and we refactor
all the interop logic for reuse with map_from_cuda. We don't use
the semaphores here, because cuda will naturally serialise subsequent
operations after the memcpy. A semaphore would be needed if the
vulkan frame might not have been completely written before the
memcpy is issued, but I think this is never the case in the current
design.

2) hwupload_cuda: Here we need to do two things: Declare that
we can accept AV_PIX_FMT_VULKAN as an input format, and extract
the sw_format if the input is a hardware frame.

3) hwupload: Here we make the same change to handle hardware input
formats, but the declaration of support for vulkan happens in
hwcontext_cuda due to get_constraints being used.

This change is not final for one main reason. Because the
filter format matching logic isn't properly hw format aware, we
have to declare vulkan as a supported sw format, which means
that we can't separately check if the sw format is supported unless
we do a second check within the filter. This really should be
pushed up into the common logic. Basically, filters should be
able to separately declare their supported input hw and sw formats.

We might also need some additional logic to handle when the vulkan
and cuda devices aren't the same hardware. I haven't looked at the
failure paths yet.

This basic approach could be used instead of hwmap for cuda to
vulkan transfers, but there are complications: hwcontext will attempt
a src->transfer_to() if available in preference to a dst->transfer_from()
with no fallback if the transfer_to() fails. Without changes, that
implies the cuda hwcontext must implement a transfer_to.

The second complication is that you'd have two hwuploads but pointed
to different hw devices, and I don't think you can specify a different
device for each hwupload - I guess you'd need to add logic to hwupload
to be able to init itself. (A workaround here would be to use
hwupload for the cuda->vulkan and then hwupload_cuda for the vulkan->cuda
but we really want to get rid of hwupload_cuda in the long term).

Example command lines:

$ ffmpeg -hwaccel nvdec -hwaccel_output_format cuda \
  -init_hw_device cuda=cuda:0 -i sample.mp4 -filter_hw_device cuda \
  -vf hwmap=derive_device=vulkan,scale_vulkan=w=640:h=480,hwupload \
  -c:v h264_nvenc -f null /dev/null

$ ffmpeg -hwaccel nvdec -hwaccel_output_format cuda \
  -i sample.mp4 \
  -vf hwmap=derive_device=vulkan,scale_vulkan=w=640:h=480,hwupload_cuda \
  -c:v h264_nvenc -f null /dev/null

As you can see, hwupload_cuda is significantly more concise as it can
directly init a hw device by itself.
  • Loading branch information
philipl committed Sep 23, 2019
1 parent 728fe50 commit 9be8e74
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 31 deletions.
6 changes: 6 additions & 0 deletions libavfilter/vf_hwupload.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ static int hwupload_config_output(AVFilterLink *outlink)

ctx->hwframes->format = outlink->format;
ctx->hwframes->sw_format = inlink->format;
if (inlink->hw_frames_ctx) {
AVHWFramesContext *in_hwframe_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
ctx->hwframes->sw_format = in_hwframe_ctx->sw_format;
} else {
ctx->hwframes->sw_format = inlink->format;
}
ctx->hwframes->width = inlink->w;
ctx->hwframes->height = inlink->h;

Expand Down
9 changes: 7 additions & 2 deletions libavfilter/vf_hwupload_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ static int cudaupload_query_formats(AVFilterContext *ctx)
static const enum AVPixelFormat input_pix_fmts[] = {
AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16,
AV_PIX_FMT_0RGB32, AV_PIX_FMT_0BGR32,
AV_PIX_FMT_0RGB32, AV_PIX_FMT_0BGR32, AV_PIX_FMT_VULKAN,
AV_PIX_FMT_NONE,
};
static const enum AVPixelFormat output_pix_fmts[] = {
Expand Down Expand Up @@ -97,7 +97,12 @@ static int cudaupload_config_output(AVFilterLink *outlink)

hwframe_ctx = (AVHWFramesContext*)s->hwframe->data;
hwframe_ctx->format = AV_PIX_FMT_CUDA;
hwframe_ctx->sw_format = inlink->format;
if (inlink->hw_frames_ctx) {
AVHWFramesContext *in_hwframe_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
hwframe_ctx->sw_format = in_hwframe_ctx->sw_format;
} else {
hwframe_ctx->sw_format = inlink->format;
}
hwframe_ctx->width = inlink->w;
hwframe_ctx->height = inlink->h;

Expand Down
1 change: 1 addition & 0 deletions libavutil/hwcontext_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_YUV444P16,
AV_PIX_FMT_0RGB32,
AV_PIX_FMT_0BGR32,
AV_PIX_FMT_VULKAN,
};

#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
Expand Down
168 changes: 139 additions & 29 deletions libavutil/hwcontext_vulkan.c
Original file line number Diff line number Diff line change
Expand Up @@ -1863,13 +1863,12 @@ static int vulkan_map_from_vaapi(AVHWFramesContext *dst_fc,
#endif

#if CONFIG_CUDA
static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
AVFrame *dst, const AVFrame *src,
int flags)
static int vulkan_export_to_cuda(AVHWFramesContext *hwfc,
AVBufferRef *cuda_hwfc,
const AVFrame *frame)
{
int err;
VkResult ret;
CUcontext dummy;
AVVkFrame *dst_f;
AVVkFrameInternal *dst_int;
AVHWDeviceContext *ctx = hwfc->device_ctx;
Expand All @@ -1879,37 +1878,15 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
VK_LOAD_PFN(hwctx->inst, vkGetSemaphoreFdKHR);

AVHWFramesContext *cuda_fc = (AVHWFramesContext*)src->hw_frames_ctx->data;
AVHWFramesContext *cuda_fc = (AVHWFramesContext*)cuda_hwfc->data;
AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
CudaFunctions *cu = cu_internal->cuda_dl;
CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS s_w_par[AV_NUM_DATA_POINTERS] = { 0 };
CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS s_s_par[AV_NUM_DATA_POINTERS] = { 0 };
CUarray_format cufmt = desc->comp[0].depth > 8 ? CU_AD_FORMAT_UNSIGNED_INT16 :
CU_AD_FORMAT_UNSIGNED_INT8;

if (!hwfc->internal->pool_internal) {
AVVulkanFramesContext *vkfc = hwfc->hwctx;
vkfc->tiling = VK_IMAGE_TILING_OPTIMAL;
vulkan_frames_init(hwfc);
}

dst->buf[0] = av_buffer_pool_get(hwfc->internal->pool_internal);
if (!dst->buf[0])
return AVERROR(ENOMEM);

dst->data[0] = dst->buf[0]->data;
dst->width = src->width;
dst->height = src->height;

dst_f = (AVVkFrame *)dst->data[0];

ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
if (ret < 0) {
err = AVERROR_EXTERNAL;
goto fail;
}
dst_f = (AVVkFrame *)frame->data[0];

dst_int = dst_f->internal;
if (!dst_int || !dst_int->cuda_fc_ref) {
Expand All @@ -1921,7 +1898,7 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
goto fail;
}

dst_int->cuda_fc_ref = av_buffer_ref(src->hw_frames_ctx);
dst_int->cuda_fc_ref = av_buffer_ref(cuda_hwfc);
if (!dst_int->cuda_fc_ref) {
err = AVERROR(ENOMEM);
goto fail;
Expand Down Expand Up @@ -2005,6 +1982,59 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
}
}
}
return 0;

fail:
return -err;
}

static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
AVFrame *dst, const AVFrame *src,
int flags)
{
int err;
VkResult ret;
CUcontext dummy;
AVVkFrame *dst_f;
AVVkFrameInternal *dst_int;
const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);

AVHWFramesContext *cuda_fc = (AVHWFramesContext*)src->hw_frames_ctx->data;
AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
CudaFunctions *cu = cu_internal->cuda_dl;
CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS s_w_par[AV_NUM_DATA_POINTERS] = { 0 };
CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS s_s_par[AV_NUM_DATA_POINTERS] = { 0 };

if (!hwfc->internal->pool_internal) {
AVVulkanFramesContext *vkfc = hwfc->hwctx;
vkfc->tiling = VK_IMAGE_TILING_OPTIMAL;
vulkan_frames_init(hwfc);
}

dst->buf[0] = av_buffer_pool_get(hwfc->internal->pool_internal);
if (!dst->buf[0])
return AVERROR(ENOMEM);

dst->data[0] = dst->buf[0]->data;
dst->width = src->width;
dst->height = src->height;

dst_f = (AVVkFrame *)dst->data[0];

ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
if (ret < 0) {
err = AVERROR_EXTERNAL;
goto fail;
}

ret = vulkan_export_to_cuda(hwfc, src->hw_frames_ctx, dst);
if (ret < 0) {
goto fail;
}
dst_int = dst_f->internal;

ret = CHECK_CU(cu->cuWaitExternalSemaphoresAsync(dst_int->cu_sem, s_w_par,
planes, cuda_dev->stream));
Expand Down Expand Up @@ -2579,6 +2609,76 @@ static int vulkan_transfer_data_to(AVHWFramesContext *hwfc, AVFrame *dst,
return err;
}

#if CONFIG_CUDA
/**
 * Transfer (GPU memcpy) a Vulkan frame to a CUDA destination frame.
 *
 * Exports the Vulkan images backing @p src into CUDA arrays (via
 * vulkan_export_to_cuda) and issues one cuMemcpy2DAsync per plane on the
 * CUDA device's stream. No semaphore wait is needed here: CUDA serialises
 * subsequent operations on the stream after the memcpy, and the Vulkan
 * frame is assumed to be fully written before this is called.
 *
 * @param hwfc Vulkan frames context describing @p src.
 * @param dst  Caller-allocated CUDA destination frame (AV_PIX_FMT_CUDA).
 * @param src  Source Vulkan frame (data[0] is an AVVkFrame).
 * @return 0 on success, a negative AVERROR code on failure.
 */
static int vulkan_transfer_data_to_cuda(AVHWFramesContext *hwfc, AVFrame *dst,
                                        const AVFrame *src)
{
    int err;
    VkResult ret;
    CUcontext dummy;
    /* NULL until the export succeeds, so the fail path knows whether any
     * interop state exists to tear down. (Previously these were read
     * uninitialized if cuCtxPushCurrent failed.) */
    AVVkFrame *dst_f = NULL;
    AVVkFrameInternal *dst_int = NULL;
    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);

    AVHWFramesContext *cuda_fc = (AVHWFramesContext*)dst->hw_frames_ctx->data;
    AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
    AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
    AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
    CudaFunctions *cu = cu_internal->cuda_dl;

    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
    if (ret < 0)
        return AVERROR_EXTERNAL; /* nothing acquired yet: no cleanup, no pop */

    dst_f = (AVVkFrame *)src->data[0];

    err = vulkan_export_to_cuda(hwfc, dst->hw_frames_ctx, src);
    if (err < 0)
        goto fail;

    dst_int = dst_f->internal;

    for (int i = 0; i < planes; i++) {
        CUDA_MEMCPY2D cpy = {
            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
            .dstDevice     = (CUdeviceptr)dst->data[i],
            .dstPitch      = dst->linesize[i],
            .dstY          = 0,

            .srcMemoryType = CU_MEMORYTYPE_ARRAY,
            .srcArray      = dst_int->cu_array[i],
            /* Chroma planes are subsampled; comp[i].step accounts for the
             * per-pixel byte width (e.g. 2 for 16-bit formats). */
            .WidthInBytes  = (i > 0 ? AV_CEIL_RSHIFT(hwfc->width, desc->log2_chroma_w)
                                    : hwfc->width) * desc->comp[i].step,
            .Height        = i > 0 ? AV_CEIL_RSHIFT(hwfc->height, desc->log2_chroma_h)
                                   : hwfc->height,
        };

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, cuda_dev->stream));
        if (ret < 0) {
            err = AVERROR_EXTERNAL;
            goto fail;
        }
    }

    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    av_log(hwfc, AV_LOG_VERBOSE, "Transferred Vulkan image to CUDA!\n");

    return 0;

fail:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    /* Only tear down interop state this function actually created. Do NOT
     * unref dst->buf[0]: dst is a caller-allocated destination frame whose
     * buffer this function never owned. */
    if (dst_int) {
        vulkan_free_internal(dst_int);
        dst_f->internal = NULL;
    }
    return err;
}
#endif

static int vulkan_transfer_data_from(AVHWFramesContext *hwfc, AVFrame *dst,
const AVFrame *src)
{
Expand All @@ -2593,6 +2693,16 @@ static int vulkan_transfer_data_from(AVHWFramesContext *hwfc, AVFrame *dst,
if (dst->width > hwfc->width || dst->height > hwfc->height)
return AVERROR(EINVAL);

switch (dst->format) {
#if CONFIG_CUDA
case AV_PIX_FMT_CUDA:
return vulkan_transfer_data_to_cuda(hwfc, dst, src);
#endif
default:
/* Carry on. */
break;
}

/* For linear, host visible images */
if (f->tiling == VK_IMAGE_TILING_LINEAR &&
f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
Expand Down

0 comments on commit 9be8e74

Please sign in to comment.