vo_opengl: refactor RA texture and buffer updates
- tex_upload's args are moved to a struct (see the sketch after this list)
- the ability to directly upload texture data without going through a
  buffer is made explicit
- the concept of buffer updates and buffer polling is made more explicit
  and generalized to buf_update as well (not just mapped buffers)
- the ability to call tex_upload/buf_update on a tex/buf is made
  explicit during tex/buf creation (via host_mutable)
- uploading from buffers now uses an explicit offset instead of
  implicitly comparing *src against buf->data, because not all buffers
  may actually be persistently mapped
- the initial_data = immutable requirement is dropped. (May be re-added
  later for D3D11 if that ever becomes a thing)
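
To make the new shape of the API concrete, here is a minimal sketch of a
direct (non-buffer) texture upload using the structs introduced below in
ra.h; ra, fmt and the image_* variables are hypothetical:

    // Hypothetical caller of the new struct-based API; ra, fmt and the
    // image_* variables are assumed. Direct upload (src != NULL) requires
    // RA_CAP_DIRECT_UPLOAD.
    struct ra_tex_params tparams = {
        .dimensions = 2,
        .w = image_w, .h = image_h, .d = 1,
        .format = fmt,
        .render_src = true,
        .host_mutable = true, // we intend to call tex_upload on it later
    };
    struct ra_tex *tex = ra_tex_create(ra, &tparams);

    struct ra_tex_upload_params up = {
        .tex = tex,
        .src = image_data,  // upload directly, without an ra_buf
        .invalidate = true, // the whole region gets overwritten anyway
        .rc = &(struct mp_rect){0, 0, image_w, image_h},
        .stride = image_stride, // size of a line in bytes, not texels
    };
    ra->fns->tex_upload(ra, &up);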

This change helps the vulkan abstraction immensely and also helps move
common code (like the PBO pooling) out of ra_gl and into
opengl/utils.c.

This also has the side-benefit / side-constraint of using PBOs for OSD
texture uploads as well, which actually seems to help performance on
machines where --opengl-pbo is faster than the naive code path. Because
of this, I decided to hook up the OSD code to the --opengl-pbo option
too.

One drawback of this refactor is that the GL_STREAM_COPY hack for
texture uploads "got lost", but I'm happy with that going away, since
DR almost fully deprecates it - and it was never the "right thing"
anyway, just an nvidia-only hack to make this stuff work somewhat
better on NUMA systems with discrete GPUs.

Another change is that, due to the way fencing works with ra_buf (we
get one fence per ra_buf per upload), we have to use multiple ra_bufs
instead of offsets into a shared buffer. For OpenGL this is probably
better anyway. It's possible that in the future we could support
independent "buffer slices" (each with its own fence/sync object), but
that would be an optimization more than anything. The underlying
problem (memory locality) could also be addressed differently, by
making the ra_vk memory allocator smart enough to chunk allocations
together under the hood.
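
As a rough illustration of the multiple-ra_buf scheme, here is a sketch
of a round-robin pool in the spirit of the new helpers; the pool struct,
NUM_BUFS and the RA_BUF_TYPE_TEX_UPLOAD hint are illustrative
assumptions, not necessarily the exact code in opengl/utils.c:

    // Illustrative round-robin pool: one ra_buf (and thus one fence) per
    // in-flight upload, instead of offsets into one shared buffer.
    #define NUM_BUFS 3

    struct upload_pool {
        struct ra_buf *bufs[NUM_BUFS];
        int index;
    };

    static bool pool_upload(struct ra *ra, struct upload_pool *pool,
                            struct ra_tex *tex, const void *data, size_t size,
                            struct mp_rect *rc, ptrdiff_t stride)
    {
        struct ra_buf **buf = &pool->bufs[pool->index];
        pool->index = (pool->index + 1) % NUM_BUFS;

        if (!*buf) {
            struct ra_buf_params bparams = {
                .type = RA_BUF_TYPE_TEX_UPLOAD, // assumed buffer type hint
                .size = size,
                .host_mutable = true, // we will call buf_update() on it
            };
            *buf = ra->fns->buf_create(ra, &bparams);
            if (!*buf)
                return false;
        }

        // buf_poll is optional; if it's NULL, buffers are always usable.
        if (ra->fns->buf_poll && !ra->fns->buf_poll(ra, *buf))
            return false; // previous upload still in flight; caller can retry

        ra->fns->buf_update(ra, *buf, 0, data, size);

        struct ra_tex_upload_params params = {
            .tex = tex,
            .buf = *buf,
            .buf_offset = 0,
            .rc = rc,
            .stride = stride,
        };
        ra->fns->tex_upload(ra, &params);
        return true;
    }

When the caller doesn't want PBOs, the tex_upload helper in this commit
takes a want_pbo flag (see osd.c below) and presumably falls back to a
direct upload, mirroring the old use_pbo behavior.
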
haasn committed Aug 17, 2017
1 parent 9ca5a2a commit 46d86da
Showing 11 changed files with 221 additions and 189 deletions.
7 changes: 7 additions & 0 deletions video/out/opengl/common.c
@@ -309,6 +309,13 @@ static const struct gl_functions gl_functions[] = {
{0}
},
},
{
.ver_core = 430,
.functions = (const struct gl_function[]) {
DEF_FN(InvalidateTexImage),
{0}
},
},
{
.ver_core = 430,
.ver_es_core = 300,
1 change: 1 addition & 0 deletions video/out/opengl/common.h
@@ -194,6 +194,7 @@ struct GL {
void (GLAPIENTRY *UniformMatrix3fv)(GLint, GLsizei, GLboolean,
const GLfloat *);

void (GLAPIENTRY *InvalidateTexImage)(GLuint, GLint);
void (GLAPIENTRY *InvalidateFramebuffer)(GLenum, GLsizei, const GLenum *);

GLsync (GLAPIENTRY *FenceSync)(GLenum, GLbitfield);
66 changes: 0 additions & 66 deletions video/out/opengl/gl_utils.c
@@ -269,72 +269,6 @@ void gl_set_debug_logger(GL *gl, struct mp_log *log)
gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log);
}

// Upload a texture, going through a PBO. PBO supposedly can facilitate
// asynchronous copy from CPU to GPU, so this is an optimization. Note that
// changing format/type/tex_w/tex_h or reusing the PBO in the same frame can
// ruin performance.
// This call is like gl_upload_tex(), plus PBO management/use.
// target, format, type, dataptr, stride, x, y, w, h: texture upload params
// (see gl_upload_tex())
// tex_w, tex_h: maximum size of the used texture
// use_pbo: for convenience, if false redirects the call to gl_upload_tex
void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
GLenum target, GLenum format, GLenum type,
int tex_w, int tex_h, const void *dataptr, int stride,
int x, int y, int w, int h)
{
assert(x >= 0 && y >= 0 && w >= 0 && h >= 0);
assert(x + w <= tex_w && y + h <= tex_h);

if (!use_pbo) {
gl_upload_tex(gl, target, format, type, dataptr, stride, x, y, w, h);
return;
}

// We align the buffer size to 4096 to avoid possible subregion
// dependencies. This is not a strict requirement (the spec requires no
// alignment), but a good precaution for performance reasons
size_t needed_size = stride * h;
size_t buffer_size = MP_ALIGN_UP(needed_size, 4096);

if (buffer_size != pbo->buffer_size)
gl_pbo_upload_uninit(pbo);

if (!pbo->buffer) {
pbo->gl = gl;
pbo->buffer_size = buffer_size;
gl->GenBuffers(1, &pbo->buffer);
gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffer);
// Magic time: Because we memcpy once from RAM to the buffer, and then
// the GPU needs to read from this anyway, we actually *don't* want
// this buffer to be allocated in RAM. If we allocate it in VRAM
// instead, we can reduce this to a single copy: from RAM into VRAM.
// Unfortunately, drivers e.g. nvidia will think GL_STREAM_DRAW is best
// allocated on host memory instead of device memory, so we lie about
// the usage to fool the driver into giving us a buffer in VRAM instead
// of RAM, which can be significantly faster for our use case.
// Seriously, fuck OpenGL.
gl->BufferData(GL_PIXEL_UNPACK_BUFFER, NUM_PBO_BUFFERS * buffer_size,
NULL, GL_STREAM_COPY);
}

uintptr_t offset = buffer_size * pbo->index;
pbo->index = (pbo->index + 1) % NUM_PBO_BUFFERS;

gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffer);
gl->BufferSubData(GL_PIXEL_UNPACK_BUFFER, offset, needed_size, dataptr);
gl_upload_tex(gl, target, format, type, (void *)offset, stride, x, y, w, h);
gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
}

void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo)
{
if (pbo->gl)
pbo->gl->DeleteBuffers(1, &pbo->buffer);

*pbo = (struct gl_pbo_upload){0};
}

int gl_get_fb_depth(GL *gl, int fbo)
{
if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB))
15 changes: 0 additions & 15 deletions video/out/opengl/gl_utils.h
@@ -51,21 +51,6 @@ void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num);

void gl_set_debug_logger(GL *gl, struct mp_log *log);

#define NUM_PBO_BUFFERS 3

struct gl_pbo_upload {
GL *gl;
int index;
GLuint buffer;
size_t buffer_size;
};

void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo,
GLenum target, GLenum format, GLenum type,
int tex_w, int tex_h, const void *dataptr, int stride,
int x, int y, int w, int h);
void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo);

int gl_get_fb_depth(GL *gl, int fbo);

#endif
20 changes: 14 additions & 6 deletions video/out/opengl/osd.c
@@ -54,6 +54,7 @@ struct mpgl_osd_part {
enum sub_bitmap_format format;
int change_id;
struct ra_tex *texture;
struct tex_upload pbo;
int w, h;
int num_subparts;
int prev_num_subparts;
@@ -70,14 +71,15 @@ struct mpgl_osd {
const struct ra_format *fmt_table[SUBBITMAP_COUNT];
bool formats[SUBBITMAP_COUNT];
bool change_flag; // for reporting to API user only
bool want_pbo;
// temporary
int stereo_mode;
struct mp_osd_res osd_res;
void *scratch;
};

struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log,
struct osd_state *osd)
struct osd_state *osd, bool want_pbo)
{
struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx);
*ctx = (struct mpgl_osd) {
@@ -86,6 +88,7 @@ struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log,
.ra = ra,
.change_flag = true,
.scratch = talloc_zero_size(ctx, 1),
.want_pbo = want_pbo,
};

ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1);
@@ -108,6 +111,7 @@ void mpgl_osd_destroy(struct mpgl_osd *ctx)
for (int n = 0; n < MAX_OSD_PARTS; n++) {
struct mpgl_osd_part *p = ctx->parts[n];
ra_tex_free(ctx->ra, &p->texture);
tex_upload_uninit(ctx->ra, &p->pbo);
}
talloc_free(ctx);
}
@@ -161,18 +165,22 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
.format = fmt,
.render_src = true,
.src_linear = true,
.host_mutable = true,
};
osd->texture = ra_tex_create(ra, &params);
if (!osd->texture)
goto done;
}

struct mp_rect rc = {0, 0, imgs->packed_w, imgs->packed_h};
ra->fns->tex_upload(ra, osd->texture, imgs->packed->planes[0],
imgs->packed->stride[0], &rc, RA_TEX_UPLOAD_DISCARD,
NULL);
struct ra_tex_upload_params params = {
.tex = osd->texture,
.src = imgs->packed->planes[0],
.invalidate = true,
.rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h},
.stride = imgs->packed->stride[0],
};

ok = true;
ok = tex_upload(ra, &osd->pbo, ctx->want_pbo, &params);

done:
return ok;
2 changes: 1 addition & 1 deletion video/out/opengl/osd.h
@@ -9,7 +9,7 @@
#include "sub/osd.h"

struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log,
struct osd_state *osd);
struct osd_state *osd, bool want_pbo);
void mpgl_osd_destroy(struct mpgl_osd *ctx);

void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts,
82 changes: 39 additions & 43 deletions video/out/opengl/ra.h
@@ -30,19 +30,14 @@ struct ra {
// formats should have a lower index. (E.g. GLES3 should put rg8 before la.)
struct ra_format **formats;
int num_formats;

// GL-specific: if set, accelerate texture upload by using an additional
// buffer (i.e. uses more memory). Does not affect uploads done by
// ra_tex_create (if initial_data is set). Set by the RA user.
bool use_pbo;
};

enum {
RA_CAP_TEX_1D = 1 << 0, // supports 1D textures (as shader inputs)
RA_CAP_TEX_3D = 1 << 1, // supports 3D textures (as shader inputs)
RA_CAP_BLIT = 1 << 2, // supports ra_fns.blit
RA_CAP_COMPUTE = 1 << 3, // supports compute shaders
RA_CAP_PBO = 1 << 4, // supports ra.use_pbo
RA_CAP_DIRECT_UPLOAD = 1 << 4, // supports tex_upload without ra_buf
RA_CAP_BUF_RW = 1 << 5, // supports RA_VARTYPE_BUF_RW
RA_CAP_NESTED_ARRAY = 1 << 6, // supports nested arrays
};
@@ -92,6 +87,7 @@ struct ra_tex_params {
bool render_dst; // must be usable as target texture in a shader
bool blit_src; // must be usable as a blit source
bool blit_dst; // must be usable as a blit destination
bool host_mutable; // texture may be updated with tex_upload
// When used as render source texture.
bool src_linear; // if false, use nearest sampling (whether this can
// be true depends on ra_format.linear_filter)
@@ -100,8 +96,9 @@
bool non_normalized; // hack for GL_TEXTURE_RECTANGLE OSX idiocy
// always set to false, except in OSX code
bool external_oes; // hack for GL_TEXTURE_EXTERNAL_OES idiocy
// If non-NULL, the texture will be created with these contents, and is
// considered immutable afterwards (no upload, mapping, or rendering to it).
// If non-NULL, the texture will be created with these contents. Using
// this does *not* require setting host_mutable. Otherwise, the initial
// data is undefined.
void *initial_data;
};

@@ -118,6 +115,19 @@ struct ra_tex {
void *priv;
};

struct ra_tex_upload_params {
struct ra_tex *tex; // Texture to upload to
bool invalidate; // Discard pre-existing data not in the region uploaded
// Uploading from buffer:
struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`)
size_t buf_offset; // Start of data within buffer (bytes)
// Uploading directly: (requires RA_CAP_DIRECT_UPLOAD)
const void *src; // Address of data
// For 2D textures only:
struct mp_rect *rc; // Region to upload. NULL means entire image
ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!)
};

// Buffer type hint. Setting this may result in more or less efficient
// operation, although it shouldn't technically prohibit anything
enum ra_buf_type {
@@ -129,8 +139,8 @@ enum ra_buf_type {
struct ra_buf_params {
enum ra_buf_type type;
size_t size;
// Creates a read-writable persistent mapping (ra_buf.data)
bool host_mapped;
bool host_mapped; // create a read-writable persistent mapping (ra_buf.data)
bool host_mutable; // contents may be updated via buf_update()
// If non-NULL, the buffer will be created with these contents. Otherwise,
// the initial data is undefined.
void *initial_data;
@@ -288,11 +298,6 @@ struct ra_renderpass_run_params {
int compute_groups[3];
};

enum {
// Flags for the texture_upload flags parameter.
RA_TEX_UPLOAD_DISCARD = 1 << 0, // discard pre-existing data not in the region
};

// This is an opaque type provided by the implementation, but we want to at
// least give it a saner name than void* for code readability purposes.
typedef void ra_timer;
@@ -311,27 +316,13 @@ struct ra_fns {

void (*tex_destroy)(struct ra *ra, struct ra_tex *tex);

// Copy from CPU RAM to the texture. This is an extremely common operation.
// Unlike with OpenGL, the src data has to have exactly the same format as
// the texture, and no conversion is supported.
// region can be NULL - if it's not NULL, then the provided pointer only
// contains data for the given region. Only part of the texture data is
// updated, and ptr points to the first pixel in the region. If
// RA_TEX_UPLOAD_DISCARD is set, data outside of the region can return to
// an uninitialized state. The region is always strictly within the texture
// and has a size >0 in both dimensions. 2D textures only.
// For 1D textures, stride is ignored, and region must be NULL.
// For 3D textures, stride is not supported. All data is fully packed with
// no padding, and stride is ignored, and region must be NULL.
// If buf is not NULL, then src must be within the provided buffer. The
// operation is implied to have dramatically better performance, but
// requires correct flushing and fencing operations by the caller to deal
// with asynchronous host/GPU behavior. If any of these conditions are not
// met, undefined behavior will result.
void (*tex_upload)(struct ra *ra, struct ra_tex *tex,
const void *src, ptrdiff_t stride,
struct mp_rect *region, uint64_t flags,
struct ra_buf *buf);
// Copy the contents of a buffer to a texture. This is an extremely common
// operation. The contents of the buffer must exactly match the format of
// the image - conversions between bit depth etc. are not supported.
// The buffer *may* be marked as "in use" while this operation is going on,
// and the contents must not be touched again by the API user until
// buf_poll returns true.
void (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params);

// Create a buffer. This can be used as a persistently mapped buffer,
// a uniform buffer, a shader storage buffer or possibly others.
@@ -341,13 +332,18 @@ struct ra_fns {

void (*buf_destroy)(struct ra *ra, struct ra_buf *buf);

// Essentially a fence: once the GPU uses the mapping for read-access (e.g.
// by starting a texture upload), the host must not write to the mapped
// data until an internal object has been signalled. This call returns
// whether it was signalled yet. If true, write accesses are allowed again.
// Optional, may be NULL if unavailable. This is only usable for buffers
// which have been persistently mapped.
bool (*poll_mapped_buffer)(struct ra *ra, struct ra_buf *buf);
// Update the contents of a buffer, starting at a given offset and up to a
// given size, with the contents of *data. This is an extremely common
// operation. Calling this while the buffer is considered "in use" is an
// error. (See: buf_poll)
void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
const void *data, size_t size);

// Returns if a buffer is currently "in use" or not. Updating the contents
// of a buffer (via buf_update or writing to buf->data) while it is still
// in use is an error and may result in graphical corruption. Optional, if
// NULL then all buffers are always usable.
bool (*buf_poll)(struct ra *ra, struct ra_buf *buf);

// Clear the dst with the given color (rgba) and within the given scissor.
// dst must have dst->params.render_dst==true. Content outside of the
