Vulkan revision 2 #4933

Merged
merged 22 commits on Dec 24, 2017
Commits
26f4d2c
vo_gpu: vulkan: refactor vk_cmdpool
haasn Sep 27, 2017
aa2a992
vo_gpu: vulkan: reorganize vk_cmd slightly
haasn Sep 28, 2017
24bde8f
vo_gpu: vulkan: refactor command submission
haasn Sep 28, 2017
3782c30
vo_gpu: vulkan: add a vk_signal abstraction
haasn Sep 28, 2017
85ee3d9
vo_gpu: vulkan: properly track image dependencies
haasn Sep 29, 2017
52bdb3f
vo_gpu: allow invalidating FBO in renderpass_run
haasn Aug 18, 2017
3bb783c
vo_gpu: invalidate fbotex before drawing
haasn Aug 18, 2017
7dbb3e8
vo_gpu: vulkan: support split command pools
haasn Sep 24, 2017
f44c21e
vo_gpu: aggressively prefer async compute
haasn Sep 24, 2017
bc20f0f
vo_gpu: vulkan: make the swapchain more robust
haasn Sep 29, 2017
4df7c51
vo_gpu: vulkan: use correct access flag for present
haasn Sep 29, 2017
967ad8e
vo_gpu: vulkan: properly depend on the swapchain acquire semaphore
haasn Sep 29, 2017
9789bac
vo_gpu: attempt re-using the FBO format for p->output_tex
haasn Sep 29, 2017
baddd7b
vo_gpu: vulkan: prefer vkCmdCopyImage over vkCmdBlitImage
haasn Sep 29, 2017
a4ca400
vo_gpu: vulkan: refine queue family selection algorithm
haasn Sep 30, 2017
42cfd3a
vo_gpu: vulkan: allow disabling async tf/comp
haasn Oct 7, 2017
37ceb38
vo_gpu: vulkan: fix the rgb565a1 names -> rgb5a1
haasn Oct 10, 2017
235619a
vo_gpu: vulkan: fix dummyPass creation
haasn Oct 12, 2017
dd01d9f
vo_gpu: vulkan: fix sharing mode on malloc'd buffers
haasn Oct 16, 2017
ac1ccf7
vo_gpu: vulkan: omit needless #define
haasn Oct 17, 2017
2ed563f
vo_gpu: vulkan: fix some image barrier oddities
haasn Oct 25, 2017
a7ee7d9
vo_gpu: vulkan: fix segfault due to index mismatch
haasn Oct 28, 2017
24 changes: 19 additions & 5 deletions DOCS/man/options.rst
@@ -4267,11 +4267,25 @@ The following video options are currently all specific to ``--vo=gpu`` and
     Controls the number of VkQueues used for rendering (limited by how many
     your device supports). In theory, using more queues could enable some
     parallelism between frames (when using a ``--swapchain-depth`` higher than
-    1). (Default: 1)
-
-    NOTE: Setting this to a value higher than 1 may cause graphical corruption,
-    as mpv's vulkan implementation currently does not try and protect textures
-    against concurrent access.
+    1), but it can also slow things down on hardware where there's no true
+    parallelism between queues. (Default: 1)
+
+``--vulkan-async-transfer``
+    Enables the use of async transfer queues on supported vulkan devices. Using
+    them allows transfer operations like texture uploads and blits to happen
+    concurrently with the actual rendering, thus improving overall throughput
+    and power consumption. Enabled by default, and should be relatively safe.
+
+``--vulkan-async-compute``
+    Enables the use of async compute queues on supported vulkan devices. Using
+    this, in theory, allows out-of-order scheduling of compute shaders with
+    graphics shaders, thus enabling the hardware to do more effective work while
+    waiting for pipeline bubbles and memory operations. Not beneficial on all
+    GPUs. It's worth noting that if async compute is enabled, and the device
+    supports more compute queues than graphics queues (bound by the restrictions
+    set by ``--vulkan-queue-count``), mpv will internally try and prefer the
+    use of compute shaders over fragment shaders wherever possible. Not enabled
+    by default, since it seems to cause issues with some drivers.
 
 ``--d3d11-warp=<yes|no|auto>``
     Use WARP (Windows Advanced Rasterization Platform) with the D3D11 GPU
7 changes: 7 additions & 0 deletions ta/ta_talloc.h
@@ -124,6 +124,13 @@ char *ta_talloc_asprintf_append_buffer(char *s, const char *fmt, ...) TA_PRF(2,
         (idxvar)--;                         \
     } while (0)
 
+// Returns whether or not there was any element to pop.
+#define MP_TARRAY_POP(p, idxvar, out)       \
+    ((idxvar) > 0                           \
+     ? (*(out) = (p)[--(idxvar)], true)     \
+     : false                                \
+    )
+
 #define talloc_struct(ctx, type, ...) \
     talloc_memdup(ctx, &(type) TA_EXPAND_ARGS(__VA_ARGS__), sizeof(type))
 
2 changes: 1 addition & 1 deletion video/out/gpu/osd.c
@@ -314,7 +314,7 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
     const int *factors = &blend_factors[part->format][0];
     gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]);
 
-    gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
+    gl_sc_dispatch_draw(sc, fbo.tex, false, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
                         sizeof(struct vertex), part->vertices, part->num_vertices);
 }

6 changes: 6 additions & 0 deletions video/out/gpu/ra.h
@@ -53,6 +53,7 @@ enum {
     RA_CAP_GLOBAL_UNIFORM = 1 << 8, // supports using "naked" uniforms (not UBO)
     RA_CAP_GATHER = 1 << 9, // supports textureGather in GLSL
     RA_CAP_FRAGCOORD = 1 << 10, // supports reading from gl_FragCoord
+    RA_CAP_PARALLEL_COMPUTE = 1 << 11, // supports parallel compute shaders
 };
 
 enum ra_ctype {
@@ -84,6 +85,8 @@ struct ra_format {
                         // only applies to 2-component textures
     bool linear_filter;  // linear filtering available from shader
     bool renderable;     // can be used for render targets
+    bool dummy_format;   // is not a real ra_format but a fake one (e.g. FBO).
+                         // dummy formats cannot be used to create textures
 
     // If not 0, the format represents some sort of packed fringe format, whose
     // shader representation is given by the special_imgfmt_desc pointer.
@@ -285,6 +288,9 @@ struct ra_renderpass_params {
     enum ra_blend blend_src_alpha;
     enum ra_blend blend_dst_alpha;
 
+    // If true, the contents of `target` not written to will become undefined
+    bool invalidate_target;
+
     // --- type==RA_RENDERPASS_TYPE_COMPUTE only
 
     // Shader text, like vertex_shader/frag_shader.
8 changes: 2 additions & 6 deletions video/out/gpu/shader_cache.c
@@ -786,11 +786,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
         ADD(header, "#define texture texture2D\n");
     }
 
-    if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) {
-        ADD(header, "#define gl_GlobalInvocationIndex "
-                    "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n");
-    }
-
     // Additional helpers.
     ADD(header, "#define LUT_POS(x, lut_size)"
                 " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n");
@@ -974,13 +969,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
 }
 
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
-                                        struct ra_tex *target,
+                                        struct ra_tex *target, bool discard,
                                         const struct ra_renderpass_input *vao,
                                         int vao_len, size_t vertex_stride,
                                         void *vertices, size_t num_vertices)
 {
     struct timer_pool *timer = NULL;
 
+    sc->params.invalidate_target = discard;
     gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format,
                    vao, vao_len, vertex_stride);
     if (!sc->current_shader)
2 changes: 1 addition & 1 deletion video/out/gpu/shader_cache.h
@@ -50,7 +50,7 @@ void gl_sc_blend(struct gl_shader_cache *sc,
                  enum ra_blend blend_dst_alpha);
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
-                                        struct ra_tex *target,
+                                        struct ra_tex *target, bool discard,
                                         const struct ra_renderpass_input *vao,
                                         int vao_len, size_t vertex_stride,
                                         void *ptr, size_t num);
27 changes: 19 additions & 8 deletions video/out/gpu/video.c
@@ -1134,7 +1134,7 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
 }
 
 static struct mp_pass_perf render_pass_quad(struct gl_video *p,
-                                            struct ra_fbo fbo,
+                                            struct ra_fbo fbo, bool discard,
                                             const struct mp_rect *dst)
 {
     // The first element is reserved for `vec2 position`
@@ -1192,15 +1192,15 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
                        &p->tmp_vertex[num_vertex_attribs * 1],
                        vertex_stride);
 
-    return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, num_vertex_attribs,
+    return gl_sc_dispatch_draw(p->sc, fbo.tex, discard, p->vao, num_vertex_attribs,
                                vertex_stride, p->tmp_vertex, num_vertices);
 }
 
 static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo,
-                            const struct mp_rect *dst)
+                            bool discard, const struct mp_rect *dst)
 {
     pass_prepare_src_tex(p);
-    pass_record(p, render_pass_quad(p, fbo, dst));
+    pass_record(p, render_pass_quad(p, fbo, discard, dst));
     debug_check_gl(p, "after rendering");
     cleanup_binds(p);
 }
@@ -1218,6 +1218,11 @@ static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex,
         return;
     }
 
+    // If RA_CAP_PARALLEL_COMPUTE is set, try to prefer compute shaders
+    // over fragment shaders wherever possible.
+    if (!p->pass_compute.active && (p->ra->caps & RA_CAP_PARALLEL_COMPUTE))
+        pass_is_compute(p, 16, 16);
+
     if (p->pass_compute.active) {
         gl_sc_uniform_image2D_wo(p->sc, "out_image", *dst_tex);
         if (!p->pass_compute.directly_writes)
@@ -1229,7 +1234,7 @@
         debug_check_gl(p, "after dispatching compute shader");
     } else {
         struct ra_fbo fbo = { .tex = *dst_tex, };
-        finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h});
+        finish_pass_fbo(p, fbo, true, &(struct mp_rect){0, 0, w, h});
     }
 }

@@ -2788,7 +2793,7 @@ static void pass_draw_to_screen(struct gl_video *p, struct ra_fbo fbo)
 
     pass_dither(p);
     pass_describe(p, "output to screen");
-    finish_pass_fbo(p, fbo, &p->dst_rect);
+    finish_pass_fbo(p, fbo, false, &p->dst_rect);
 }
 
 static bool update_surface(struct gl_video *p, struct mp_image *mpi,
@@ -3053,9 +3058,15 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
     if (frame->num_vsyncs > 1 && frame->display_synced &&
         !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT))
     {
+        // Attempt to use the same format as the destination FBO
+        // if possible. Some RAs use a wrapped dummy format here,
+        // so fall back to the fbo_format in that case.
+        const struct ra_format *fmt = fbo.tex->params.format;
+        if (fmt->dummy_format)
+            fmt = p->fbo_format;
         bool r = ra_tex_resize(p->ra, p->log, &p->output_tex,
                                fbo.tex->params.w, fbo.tex->params.h,
-                               p->fbo_format);
+                               fmt);
         if (r) {
             dest_fbo = (struct ra_fbo) { p->output_tex };
             p->output_tex_valid = true;
@@ -3194,7 +3205,7 @@ static void reinterleave_vdpau(struct gl_video *p,
     const struct ra_format *fmt = ra_find_unorm_format(p->ra, 1, comps);
     ra_tex_resize(p->ra, p->log, tex, w, h * 2, fmt);
     struct ra_fbo fbo = { *tex };
-    finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h * 2});
+    finish_pass_fbo(p, fbo, true, &(struct mp_rect){0, 0, w, h * 2});
 
     output[n] = *tex;
 }
7 changes: 7 additions & 0 deletions video/out/opengl/ra_gl.c
@@ -283,6 +283,8 @@ static struct ra_tex *gl_tex_create(struct ra *ra,
                                     const struct ra_tex_params *params)
 {
     GL *gl = ra_gl_get(ra);
+    assert(!params->format->dummy_format);
+
     struct ra_tex *tex = gl_tex_create_blank(ra, params);
     if (!tex)
         return NULL;
@@ -382,6 +384,7 @@ static const struct ra_format fbo_dummy_format = {
         .flags = F_CR,
     },
     .renderable = true,
+    .dummy_format = true,
 };
 
 // Create a ra_tex that merely wraps an existing framebuffer. gl_fbo can be 0
@@ -996,6 +999,10 @@ static void gl_renderpass_run(struct ra *ra,
     assert(params->target->params.render_dst);
     assert(params->target->params.format == pass->params.target_format);
     gl->BindFramebuffer(GL_FRAMEBUFFER, target_gl->fbo);
+    if (pass->params.invalidate_target && gl->InvalidateFramebuffer) {
+        GLenum fb = target_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR;
+        gl->InvalidateFramebuffer(GL_FRAMEBUFFER, 1, &fb);
+    }
     gl->Viewport(params->viewport.x0, params->viewport.y0,
                  mp_rect_w(params->viewport),
                  mp_rect_h(params->viewport));
23 changes: 20 additions & 3 deletions video/out/vulkan/common.h
@@ -48,10 +48,27 @@ struct mpvk_ctx {
     VkSurfaceKHR surf;
     VkSurfaceFormatKHR surf_format; // picked at surface initialization time
 
-    struct vk_malloc *alloc;      // memory allocator for this device
-    struct vk_cmdpool *pool;      // primary command pool for this device
-    struct vk_cmd *last_cmd;      // most recently submitted command
+    struct vk_malloc *alloc;        // memory allocator for this device
+    struct spirv_compiler *spirv;   // GLSL -> SPIR-V compiler
+    struct vk_cmdpool **pools;      // command pools (one per queue family)
+    int num_pools;
+    struct vk_cmd *last_cmd;        // most recently submitted command
+
+    // Queued/pending commands. These are shared for the entire mpvk_ctx to
+    // ensure submission and callbacks are FIFO
+    struct vk_cmd **cmds_queued;    // recorded but not yet submitted
+    struct vk_cmd **cmds_pending;   // submitted but not completed
+    int num_cmds_queued;
+    int num_cmds_pending;
+
+    // Pointers into *pools
+    struct vk_cmdpool *pool_graphics; // required
+    struct vk_cmdpool *pool_compute;  // optional
+    struct vk_cmdpool *pool_transfer; // optional
+
+    // Common pool of signals, to avoid having to re-create these objects often
+    struct vk_signal **signals;
+    int num_signals;
 
     // Cached capabilities
     VkPhysicalDeviceLimits limits;