Skip to content

Commit

Permalink
hw_context_vulkan: Add support for transfers to cuda
Browse files Browse the repository at this point in the history
Although transfers from cuda to vulkan are currently implemented
as a map rather than a transfer, this approach doesn't work in
the other direction; it relies on being able to manually initialise
the destination buffer pool which cannot cleanly be done from
hwcontext_vulkan for a cuda destination.

Rather than trying to work-around that, this change implements
the transfer from vulkan to cuda as an actual transfer, using
vf_hwupload[_cuda]. The change has three main parts.

1) hwcontext_vulkan: We implement a special case for transfer_to
for a cuda destination. This is another GPU memcpy, and we refactor
all the interop logic for reuse with map_from_cuda. We don't use
the semaphores here, because cuda will naturally serialise subsequent
operations after the memcpy. A semaphore would be needed if the
vulkan frame might not have been completely written before the
memcpy is issued, but I think this is never the case in the current
design.

2) hwupload_cuda: Here we need to do two things: Declare that
we can accept AV_PIX_FMT_VULKAN as an input format, and extract
the sw_format if the input is a hardware frame.

3) hwupload: Here we make the same change to handle hardware input
formats, but the declaration of support for vulkan happens in
hwcontext_cuda due to get_constraints being used.

This change is not final for one main reason. Because the
filter format matching logic isn't properly hw format aware, we
have to declare vulkan as a supported sw format, which means
that we can't separately check if the sw format is supported unless
we do a second check within the filter. This really should be
pushed up into the common logic. Basically, filters should be
able to separately declare their supported input hw and sw formats.

We might also need some additional logic to handle when the vulkan
and cuda devices aren't the same hardware. I haven't looked at the
failure paths yet.

This basic approach could be used instead of hwmap for cuda to
vulkan transfers, but there are complications: hwcontext will attempt
a src->transfer_to() if available in preference to a dst->transfer_from()
with no fallback if the transfer_to() fails. Without changes, that
implies the cuda hwcontext must implement a transfer_to.

The second complication is that you'd have two hwuploads but pointed
to different hw devices, and I don't think you can specify a different
device for each hwupload - I guess you'd need to add logic to hwupload
to be able to init itself. (A workaround here would be to use
hwupload for the cuda->vulkan and then hwupload_cuda for the vulkan->cuda
but we really want to get rid of hwupload_cuda in the long term).

Example command lines:

$ ffmpeg -hwaccel nvdec -hwaccel_output_format cuda \
  -init_hw_device cuda=cuda:0 -i sample.mp4 -filter_hw_device cuda \
  -vf hwmap=derive_device=vulkan,scale_vulkan=w=640:h=480,hwupload \
  -c:v h264_nvenc -f null /dev/null

$ ffmpeg -hwaccel nvdec -hwaccel_output_format cuda \
  -i sample.mp4 \
  -vf hwmap=derive_device=vulkan,scale_vulkan=w=640:h=480,hwupload_cuda \
  -c:v h264_nvenc -f null /dev/null

As you can see, hwupload_cuda is significantly more concise as it can
directly init a hw device by itself.
  • Loading branch information
philipl committed Sep 23, 2019
1 parent 728fe50 commit 9be8e74
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 31 deletions.
6 changes: 6 additions & 0 deletions libavfilter/vf_hwupload.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ static int hwupload_config_output(AVFilterLink *outlink)

ctx->hwframes->format = outlink->format;
ctx->hwframes->sw_format = inlink->format;
if (inlink->hw_frames_ctx) {
AVHWFramesContext *in_hwframe_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
ctx->hwframes->sw_format = in_hwframe_ctx->sw_format;
} else {
ctx->hwframes->sw_format = inlink->format;
}
ctx->hwframes->width = inlink->w;
ctx->hwframes->height = inlink->h;

Expand Down
9 changes: 7 additions & 2 deletions libavfilter/vf_hwupload_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ static int cudaupload_query_formats(AVFilterContext *ctx)
static const enum AVPixelFormat input_pix_fmts[] = {
AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P,
AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16,
AV_PIX_FMT_0RGB32, AV_PIX_FMT_0BGR32,
AV_PIX_FMT_0RGB32, AV_PIX_FMT_0BGR32, AV_PIX_FMT_VULKAN,
AV_PIX_FMT_NONE,
};
static const enum AVPixelFormat output_pix_fmts[] = {
Expand Down Expand Up @@ -97,7 +97,12 @@ static int cudaupload_config_output(AVFilterLink *outlink)

hwframe_ctx = (AVHWFramesContext*)s->hwframe->data;
hwframe_ctx->format = AV_PIX_FMT_CUDA;
hwframe_ctx->sw_format = inlink->format;
if (inlink->hw_frames_ctx) {
AVHWFramesContext *in_hwframe_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
hwframe_ctx->sw_format = in_hwframe_ctx->sw_format;
} else {
hwframe_ctx->sw_format = inlink->format;
}
hwframe_ctx->width = inlink->w;
hwframe_ctx->height = inlink->h;

Expand Down
1 change: 1 addition & 0 deletions libavutil/hwcontext_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_YUV444P16,
AV_PIX_FMT_0RGB32,
AV_PIX_FMT_0BGR32,
AV_PIX_FMT_VULKAN,
};

#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
Expand Down
168 changes: 139 additions & 29 deletions libavutil/hwcontext_vulkan.c
Original file line number Diff line number Diff line change
Expand Up @@ -1863,13 +1863,12 @@ static int vulkan_map_from_vaapi(AVHWFramesContext *dst_fc,
#endif

#if CONFIG_CUDA
static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
AVFrame *dst, const AVFrame *src,
int flags)
static int vulkan_export_to_cuda(AVHWFramesContext *hwfc,
AVBufferRef *cuda_hwfc,
const AVFrame *frame)
{
int err;
VkResult ret;
CUcontext dummy;
AVVkFrame *dst_f;
AVVkFrameInternal *dst_int;
AVHWDeviceContext *ctx = hwfc->device_ctx;
Expand All @@ -1879,37 +1878,15 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
VK_LOAD_PFN(hwctx->inst, vkGetSemaphoreFdKHR);

AVHWFramesContext *cuda_fc = (AVHWFramesContext*)src->hw_frames_ctx->data;
AVHWFramesContext *cuda_fc = (AVHWFramesContext*)cuda_hwfc->data;
AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
CudaFunctions *cu = cu_internal->cuda_dl;
CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS s_w_par[AV_NUM_DATA_POINTERS] = { 0 };
CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS s_s_par[AV_NUM_DATA_POINTERS] = { 0 };
CUarray_format cufmt = desc->comp[0].depth > 8 ? CU_AD_FORMAT_UNSIGNED_INT16 :
CU_AD_FORMAT_UNSIGNED_INT8;

if (!hwfc->internal->pool_internal) {
AVVulkanFramesContext *vkfc = hwfc->hwctx;
vkfc->tiling = VK_IMAGE_TILING_OPTIMAL;
vulkan_frames_init(hwfc);
}

dst->buf[0] = av_buffer_pool_get(hwfc->internal->pool_internal);
if (!dst->buf[0])
return AVERROR(ENOMEM);

dst->data[0] = dst->buf[0]->data;
dst->width = src->width;
dst->height = src->height;

dst_f = (AVVkFrame *)dst->data[0];

ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
if (ret < 0) {
err = AVERROR_EXTERNAL;
goto fail;
}
dst_f = (AVVkFrame *)frame->data[0];

dst_int = dst_f->internal;
if (!dst_int || !dst_int->cuda_fc_ref) {
Expand All @@ -1921,7 +1898,7 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
goto fail;
}

dst_int->cuda_fc_ref = av_buffer_ref(src->hw_frames_ctx);
dst_int->cuda_fc_ref = av_buffer_ref(cuda_hwfc);
if (!dst_int->cuda_fc_ref) {
err = AVERROR(ENOMEM);
goto fail;
Expand Down Expand Up @@ -2005,6 +1982,59 @@ static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
}
}
}
return 0;

fail:
return -err;
}

static int vulkan_map_from_cuda(AVHWFramesContext *hwfc,
AVFrame *dst, const AVFrame *src,
int flags)
{
int err;
VkResult ret;
CUcontext dummy;
AVVkFrame *dst_f;
AVVkFrameInternal *dst_int;
const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);

AVHWFramesContext *cuda_fc = (AVHWFramesContext*)src->hw_frames_ctx->data;
AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
CudaFunctions *cu = cu_internal->cuda_dl;
CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS s_w_par[AV_NUM_DATA_POINTERS] = { 0 };
CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS s_s_par[AV_NUM_DATA_POINTERS] = { 0 };

if (!hwfc->internal->pool_internal) {
AVVulkanFramesContext *vkfc = hwfc->hwctx;
vkfc->tiling = VK_IMAGE_TILING_OPTIMAL;
vulkan_frames_init(hwfc);
}

dst->buf[0] = av_buffer_pool_get(hwfc->internal->pool_internal);
if (!dst->buf[0])
return AVERROR(ENOMEM);

dst->data[0] = dst->buf[0]->data;
dst->width = src->width;
dst->height = src->height;

dst_f = (AVVkFrame *)dst->data[0];

ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
if (ret < 0) {
err = AVERROR_EXTERNAL;
goto fail;
}

ret = vulkan_export_to_cuda(hwfc, src->hw_frames_ctx, dst);
if (ret < 0) {
goto fail;
}
dst_int = dst_f->internal;

ret = CHECK_CU(cu->cuWaitExternalSemaphoresAsync(dst_int->cu_sem, s_w_par,
planes, cuda_dev->stream));
Expand Down Expand Up @@ -2579,6 +2609,76 @@ static int vulkan_transfer_data_to(AVHWFramesContext *hwfc, AVFrame *dst,
return err;
}

#if CONFIG_CUDA
/**
 * Transfer (GPU memcpy) a Vulkan frame to a CUDA destination frame.
 *
 * Exports the Vulkan images backing @p src into CUDA arrays (via
 * vulkan_export_to_cuda) and issues one cuMemcpy2DAsync per plane on the
 * CUDA device's stream. No semaphore wait is needed here: CUDA serialises
 * subsequent operations on the stream after the memcpy, and the Vulkan
 * frame is assumed to be fully written before this is called.
 *
 * @param hwfc Vulkan frames context describing @p src.
 * @param dst  Caller-allocated CUDA destination frame (AV_PIX_FMT_CUDA).
 * @param src  Source Vulkan frame (data[0] is an AVVkFrame).
 * @return 0 on success, a negative AVERROR code on failure.
 */
static int vulkan_transfer_data_to_cuda(AVHWFramesContext *hwfc, AVFrame *dst,
                                        const AVFrame *src)
{
    int err;
    VkResult ret;
    CUcontext dummy;
    /* NULL until the export succeeds, so the fail path knows whether any
     * interop state exists to tear down. (Previously these were read
     * uninitialized if cuCtxPushCurrent failed.) */
    AVVkFrame *dst_f = NULL;
    AVVkFrameInternal *dst_int = NULL;
    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format);

    AVHWFramesContext *cuda_fc = (AVHWFramesContext*)dst->hw_frames_ctx->data;
    AVHWDeviceContext *cuda_cu = cuda_fc->device_ctx;
    AVCUDADeviceContext *cuda_dev = cuda_cu->hwctx;
    AVCUDADeviceContextInternal *cu_internal = cuda_dev->internal;
    CudaFunctions *cu = cu_internal->cuda_dl;

    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_dev->cuda_ctx));
    if (ret < 0)
        return AVERROR_EXTERNAL; /* nothing acquired yet: no cleanup, no pop */

    dst_f = (AVVkFrame *)src->data[0];

    err = vulkan_export_to_cuda(hwfc, dst->hw_frames_ctx, src);
    if (err < 0)
        goto fail;

    dst_int = dst_f->internal;

    for (int i = 0; i < planes; i++) {
        CUDA_MEMCPY2D cpy = {
            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
            .dstDevice     = (CUdeviceptr)dst->data[i],
            .dstPitch      = dst->linesize[i],
            .dstY          = 0,

            .srcMemoryType = CU_MEMORYTYPE_ARRAY,
            .srcArray      = dst_int->cu_array[i],
            /* Chroma planes are subsampled; comp[i].step accounts for the
             * per-pixel byte width (e.g. 2 for 16-bit formats). */
            .WidthInBytes  = (i > 0 ? AV_CEIL_RSHIFT(hwfc->width, desc->log2_chroma_w)
                                    : hwfc->width) * desc->comp[i].step,
            .Height        = i > 0 ? AV_CEIL_RSHIFT(hwfc->height, desc->log2_chroma_h)
                                   : hwfc->height,
        };

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, cuda_dev->stream));
        if (ret < 0) {
            err = AVERROR_EXTERNAL;
            goto fail;
        }
    }

    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    av_log(hwfc, AV_LOG_VERBOSE, "Transferred Vulkan image to CUDA!\n");

    return 0;

fail:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    /* Only tear down interop state this function actually created. Do NOT
     * unref dst->buf[0]: dst is a caller-allocated destination frame whose
     * buffer this function never owned. */
    if (dst_int) {
        vulkan_free_internal(dst_int);
        dst_f->internal = NULL;
    }
    return err;
}
#endif

static int vulkan_transfer_data_from(AVHWFramesContext *hwfc, AVFrame *dst,
const AVFrame *src)
{
Expand All @@ -2593,6 +2693,16 @@ static int vulkan_transfer_data_from(AVHWFramesContext *hwfc, AVFrame *dst,
if (dst->width > hwfc->width || dst->height > hwfc->height)
return AVERROR(EINVAL);

switch (dst->format) {
#if CONFIG_CUDA
case AV_PIX_FMT_CUDA:
return vulkan_transfer_data_to_cuda(hwfc, dst, src);
#endif
default:
/* Carry on. */
break;
}

/* For linear, host visible images */
if (f->tiling == VK_IMAGE_TILING_LINEAR &&
f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
Expand Down

0 comments on commit 9be8e74

Please sign in to comment.