Commit 1c0d893

feat: sync whisper.cpp

jhen0409 committed Nov 29, 2023
1 parent f1b290b commit 1c0d893
Showing 13 changed files with 894 additions and 765 deletions.
23 changes: 12 additions & 11 deletions cpp/ggml-alloc.c
@@ -446,12 +446,14 @@ static wsp_ggml_tallocr_t node_tallocr(wsp_ggml_gallocr_t galloc, struct wsp_ggm
return galloc->hash_allocs[wsp_ggml_hash_find_or_insert(galloc->hash_set, node)];
}

-static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view) {
+static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view, bool update_backend) {
wsp_ggml_tallocr_t alloc = node_tallocr(galloc, view);

//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
WSP_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-view->backend = view->view_src->backend;
+if (update_backend) {
+view->backend = view->view_src->backend;
+}
view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;
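
For context on the new update_backend flag (a reading of the diff above, not additional upstream documentation): true is passed when the node is a genuine view and should inherit its source's backend, while false is passed when allocate_node() merely reuses a parent's buffer in place and the node's own backend assignment must be preserved. A minimal, self-contained sketch of that behaviour, using mock types rather than the real wsp_ggml structures:

    /* Mock sketch only: stand-in types, not the real wsp_ggml structures. */
    #include <stdio.h>

    struct mock_tensor {
        int backend;          /* which backend the tensor is assigned to */
        const char * data;    /* shared storage                          */
    };

    /* Mirrors the flag logic added to init_view() in the hunk above. */
    static void mock_init_view(struct mock_tensor * view, const struct mock_tensor * src,
                               int update_backend) {
        if (update_backend) {
            view->backend = src->backend;   /* genuine view: inherit the source backend */
        }
        view->data = src->data;             /* data is always shared with the source */
    }

    int main(void) {
        struct mock_tensor src  = { 1, "weights" };
        struct mock_tensor node = { 2, NULL };

        mock_init_view(&node, &src, 0);     /* inplace reuse: backend stays 2 */
        printf("inplace reuse: backend=%d data=%s\n", node.backend, node.data);

        mock_init_view(&node, &src, 1);     /* genuine view: backend becomes 1 */
        printf("genuine view:  backend=%d data=%s\n", node.backend, node.data);
        return 0;
    }
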

@@ -469,7 +471,7 @@ static void allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * no

if (node->data == NULL) {
if (wsp_ggml_is_view(node)) {
-init_view(galloc, node);
+init_view(galloc, node, true);
} else {
// see if we can reuse a parent's buffer (inplace)
if (wsp_ggml_op_can_inplace(node->op)) {
@@ -499,15 +501,14 @@ static void allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * no
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->view_src = view_src;
view_src_hn->n_views += 1;
-init_view(galloc, node);
+init_view(galloc, node, false);
return;
}
-}
-else {
+} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->view_src = parent;
p_hn->n_views += 1;
-init_view(galloc, node);
+init_view(galloc, node, false);
return;
}
}
@@ -537,7 +538,7 @@ static void wsp_ggml_tallocr_alloc_graph_impl(wsp_ggml_gallocr_t galloc, struct
hash_get(galloc, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet
-init_view(galloc, node);
+init_view(galloc, node, true);
}
}

@@ -548,7 +549,7 @@ static void wsp_ggml_tallocr_alloc_graph_impl(wsp_ggml_gallocr_t galloc, struct
}
hash_get(galloc, parent)->n_children += 1;
if (wsp_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-init_view(galloc, parent);
+init_view(galloc, parent, true);
}
}
}
@@ -663,7 +664,7 @@ size_t wsp_ggml_gallocr_alloc_graph(wsp_ggml_gallocr_t galloc, wsp_ggml_tallocr_
return max_size;
}

-void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, struct wsp_ggml_hash_set hash_set, wsp_ggml_tallocr_t * hash_node_alloct) {
+void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, struct wsp_ggml_hash_set hash_set, wsp_ggml_tallocr_t * hash_node_talloc) {
const size_t hash_size = hash_set.size;

WSP_GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@@ -686,7 +687,7 @@ void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_c
// reset hash values
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);

-galloc->hash_allocs = hash_node_alloct;
+galloc->hash_allocs = hash_node_talloc;

wsp_ggml_tallocr_alloc_graph_impl(galloc, graph);

7 changes: 7 additions & 0 deletions cpp/ggml-metal.h
@@ -52,6 +52,11 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx);
void * wsp_ggml_metal_host_malloc(size_t n);
void wsp_ggml_metal_host_free (void * data);

+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+bool wsp_ggml_metal_supports_family(struct wsp_ggml_metal_context * ctx, int family);

// set the number of command buffers to use
void wsp_ggml_metal_set_n_cb(struct wsp_ggml_metal_context * ctx, int n_cb);

@@ -100,6 +105,8 @@ WSP_GGML_API bool wsp_ggml_backend_is_metal(wsp_ggml_backend_t backend);

WSP_GGML_API void wsp_ggml_backend_metal_set_n_cb(wsp_ggml_backend_t backend, int n_cb);

+WSP_GGML_API bool wsp_ggml_backend_metal_supports_family(wsp_ggml_backend_t backend, int family);

#ifdef __cplusplus
}
#endif
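
A hedged usage sketch for the new helper, illustration only: the context initializer wsp_ggml_metal_init(n_cb) is assumed here rather than shown in this diff, and the family-index mapping comes from the implementation added in ggml-metal.m below. Passing family = 7 asks about MTLGPUFamilyApple7, the tier the Metal feature-set tables list for simdgroup matrix multiply.

    /* Usage sketch (assumption: wsp_ggml_metal_init(n_cb) is the context
     * initializer; wsp_ggml_metal_free() appears in the hunk above). */
    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-metal.h"

    int main(void) {
        struct wsp_ggml_metal_context * ctx = wsp_ggml_metal_init(/*n_cb=*/1);
        if (ctx == NULL) {
            fprintf(stderr, "Metal is not available on this device\n");
            return 1;
        }

        /* family 7 -> MTLGPUFamilyApple1 + 7 - 1 == MTLGPUFamilyApple7 (A14/M1 class) */
        bool has_apple7 = wsp_ggml_metal_supports_family(ctx, 7);
        printf("Apple7 GPU family supported: %s\n", has_apple7 ? "yes" : "no");

        wsp_ggml_metal_free(ctx);
        return 0;
    }
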
30 changes: 20 additions & 10 deletions cpp/ggml-metal.m
@@ -346,9 +346,9 @@ static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char * forma
}

WSP_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-WSP_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
if (ctx->device.maxTransferRate != 0) {
-WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
} else {
WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
}
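
Throughout this file the logging divisor changes from 1024.0 / 1024.0 (binary MiB) to 1e6 (decimal MB) while the printed label stays "MB", so the reported numbers come out roughly 5% larger for the same byte count. A quick arithmetic check, illustration only:

    /* Illustration only: the same byte count under the old and new divisors. */
    #include <stdio.h>

    int main(void) {
        const double bytes = 8.0 * 1024 * 1024 * 1024;   /* e.g. an 8 GiB working-set limit */
        printf("old (/1024/1024): %8.2f MB\n", bytes / 1024.0 / 1024.0);  /* 8192.00 */
        printf("new (/1e6):       %8.2f MB\n", bytes / 1e6);              /* 8589.93 */
        return 0;
    }
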
@@ -447,6 +447,10 @@ void wsp_ggml_metal_host_free(void * data) {
free(data);
}

+bool wsp_ggml_metal_supports_family(struct wsp_ggml_metal_context * ctx, int family) {
+return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+}

void wsp_ggml_metal_set_n_cb(struct wsp_ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = MIN(n_cb, WSP_GGML_METAL_MAX_BUFFERS);
}
@@ -529,11 +533,11 @@ bool wsp_ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
-WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
return false;
}

-WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);

++ctx->n_buffers;
} else {
@@ -553,11 +557,11 @@ bool wsp_ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
-WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
return false;
}

-WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
if (i + size_step < size) {
WSP_GGML_METAL_LOG_INFO("\n");
}
@@ -568,16 +572,16 @@

#if TARGET_OS_OSX
WSP_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ctx->device.currentAllocatedSize / 1e6,
+ctx->device.recommendedMaxWorkingSetSize / 1e6);

if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
WSP_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else {
WSP_GGML_METAL_LOG_INFO("\n");
}
#else
-WSP_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
#endif
}

@@ -1060,7 +1064,7 @@ void wsp_ggml_metal_graph_compute(
WSP_GGML_ASSERT(ne00 == ne10);
WSP_GGML_ASSERT(ne03 == ne13);

-const uint gqa = ne12/ne02;
+const unsigned int gqa = ne12/ne02;

// find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel
@@ -1739,3 +1743,9 @@ void wsp_ggml_backend_metal_set_n_cb(wsp_ggml_backend_t backend, int n_cb) {

wsp_ggml_metal_set_n_cb(ctx, n_cb);
}

+bool wsp_ggml_backend_metal_supports_family(wsp_ggml_backend_t backend, int family) {
+struct wsp_ggml_metal_context * ctx = (struct wsp_ggml_metal_context *)backend->context;
+
+return wsp_ggml_metal_supports_family(ctx, family);
+}
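
A sketch of the backend-level wrapper in use. The wsp_ggml_backend_metal_init() and wsp_ggml_backend_free() calls are assumed from the ggml-backend API and are not part of this diff; treat the whole snippet as illustrative.

    /* Illustrative only; the init/free calls are assumptions, not shown in this diff. */
    #include <stdio.h>
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    int main(void) {
        wsp_ggml_backend_t backend = wsp_ggml_backend_metal_init();
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize the Metal backend\n");
            return 1;
        }

        /* probe which Apple GPU families the device reports */
        for (int family = 1; family <= 8; ++family) {
            printf("MTLGPUFamilyApple%d: %s\n", family,
                   wsp_ggml_backend_metal_supports_family(backend, family) ? "supported" : "not supported");
        }

        wsp_ggml_backend_free(backend);
        return 0;
    }
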
5 changes: 5 additions & 0 deletions cpp/ggml-quants.c
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
float max = x[0];
float sum_w = weights[0];
float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+for (volatile int i = 1; i < n; ++i) {
+#else
for (int i = 1; i < n; ++i) {
+#endif
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
float w = weights[i];
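
The workaround above is opt-in: it only takes effect when the translation unit is compiled with HAVE_BUGGY_APPLE_LINKER defined, and how the build sets that is outside this diff. A minimal, self-contained demonstration of the same guard pattern, compiled for example with cc -DHAVE_BUGGY_APPLE_LINKER demo.c:

    /* Stand-alone demonstration of the guard (not part of ggml-quants.c). */
    #include <stdio.h>

    int main(void) {
        const float x[4] = { 3.0f, 1.0f, 4.0f, 1.5f };
        float min = x[0];
        float max = x[0];
    #ifdef HAVE_BUGGY_APPLE_LINKER
        /* 'volatile' keeps the compiler from unrolling the loop, sidestepping
         * the Apple ld64 1015.7 issue mentioned in the diff above */
        for (volatile int i = 1; i < 4; ++i) {
    #else
        for (int i = 1; i < 4; ++i) {
    #endif
            if (x[i] < min) min = x[i];
            if (x[i] > max) max = x[i];
        }
        printf("min=%.2f max=%.2f\n", min, max);
        return 0;
    }
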
