Commit 1c0d893

feat: sync whisper.cpp

jhen0409 committed Nov 29, 2023
1 parent f1b290b commit 1c0d893
Showing 13 changed files with 894 additions and 765 deletions.
23 changes: 12 additions & 11 deletions cpp/ggml-alloc.c
@@ -446,12 +446,14 @@ static wsp_ggml_tallocr_t node_tallocr(wsp_ggml_gallocr_t galloc, struct wsp_ggm
return galloc->hash_allocs[wsp_ggml_hash_find_or_insert(galloc->hash_set, node)];
}

-static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view) {
+static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view, bool update_backend) {
wsp_ggml_tallocr_t alloc = node_tallocr(galloc, view);

//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
WSP_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-view->backend = view->view_src->backend;
+if (update_backend) {
+view->backend = view->view_src->backend;
+}
view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;
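
For context on the new update_backend flag (a reading of the diff above, not additional upstream documentation): true is passed when the node is a genuine view and should inherit its source's backend, while false is passed when allocate_node() merely reuses a parent's buffer in place and the node's own backend assignment must be preserved. A minimal, self-contained sketch of that behaviour, using mock types rather than the real wsp_ggml structures:

    /* Mock sketch only: stand-in types, not the real wsp_ggml structures. */
    #include <stdio.h>

    struct mock_tensor {
        int backend;          /* which backend the tensor is assigned to */
        const char * data;    /* shared storage                          */
    };

    /* Mirrors the flag logic added to init_view() in the hunk above. */
    static void mock_init_view(struct mock_tensor * view, const struct mock_tensor * src,
                               int update_backend) {
        if (update_backend) {
            view->backend = src->backend;   /* genuine view: inherit the source backend */
        }
        view->data = src->data;             /* data is always shared with the source */
    }

    int main(void) {
        struct mock_tensor src  = { 1, "weights" };
        struct mock_tensor node = { 2, NULL };

        mock_init_view(&node, &src, 0);     /* inplace reuse: backend stays 2 */
        printf("inplace reuse: backend=%d data=%s\n", node.backend, node.data);

        mock_init_view(&node, &src, 1);     /* genuine view: backend becomes 1 */
        printf("genuine view:  backend=%d data=%s\n", node.backend, node.data);
        return 0;
    }
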

@@ -469,7 +471,7 @@ static void allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * no

if (node->data == NULL) {
if (wsp_ggml_is_view(node)) {
-init_view(galloc, node);
+init_view(galloc, node, true);
} else {
// see if we can reuse a parent's buffer (inplace)
if (wsp_ggml_op_can_inplace(node->op)) {
@@ -499,15 +501,14 @@ static void allocate_node(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * no
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->view_src = view_src;
view_src_hn->n_views += 1;
-init_view(galloc, node);
+init_view(galloc, node, false);
return;
}
-}
-else {
+} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->view_src = parent;
p_hn->n_views += 1;
-init_view(galloc, node);
+init_view(galloc, node, false);
return;
}
}
@@ -537,7 +538,7 @@ static void wsp_ggml_tallocr_alloc_graph_impl(wsp_ggml_gallocr_t galloc, struct
hash_get(galloc, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet
-init_view(galloc, node);
+init_view(galloc, node, true);
}
}

@@ -548,7 +549,7 @@ static void wsp_ggml_tallocr_alloc_graph_impl(wsp_ggml_gallocr_t galloc, struct
}
hash_get(galloc, parent)->n_children += 1;
if (wsp_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-init_view(galloc, parent);
+init_view(galloc, parent, true);
}
}
}
@@ -663,7 +664,7 @@ size_t wsp_ggml_gallocr_alloc_graph(wsp_ggml_gallocr_t galloc, wsp_ggml_tallocr_
return max_size;
}

-void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, struct wsp_ggml_hash_set hash_set, wsp_ggml_tallocr_t * hash_node_alloct) {
+void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_cgraph * graph, struct wsp_ggml_hash_set hash_set, wsp_ggml_tallocr_t * hash_node_talloc) {
const size_t hash_size = hash_set.size;

WSP_GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@@ -686,7 +687,7 @@ void wsp_ggml_gallocr_alloc_graph_n(wsp_ggml_gallocr_t galloc, struct wsp_ggml_c
// reset hash values
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);

-galloc->hash_allocs = hash_node_alloct;
+galloc->hash_allocs = hash_node_talloc;

wsp_ggml_tallocr_alloc_graph_impl(galloc, graph);

7 changes: 7 additions & 0 deletions cpp/ggml-metal.h
@@ -52,6 +52,11 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx);
void * wsp_ggml_metal_host_malloc(size_t n);
void wsp_ggml_metal_host_free (void * data);

+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+bool wsp_ggml_metal_supports_family(struct wsp_ggml_metal_context * ctx, int family);

// set the number of command buffers to use
void wsp_ggml_metal_set_n_cb(struct wsp_ggml_metal_context * ctx, int n_cb);

@@ -100,6 +105,8 @@ WSP_GGML_API bool wsp_ggml_backend_is_metal(wsp_ggml_backend_t backend);

WSP_GGML_API void wsp_ggml_backend_metal_set_n_cb(wsp_ggml_backend_t backend, int n_cb);

+WSP_GGML_API bool wsp_ggml_backend_metal_supports_family(wsp_ggml_backend_t backend, int family);

#ifdef __cplusplus
}
#endif
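
A hedged usage sketch for the new helper, illustration only: the context initializer wsp_ggml_metal_init(n_cb) is assumed here rather than shown in this diff, and the family-index mapping comes from the implementation added in ggml-metal.m below. Passing family = 7 asks about MTLGPUFamilyApple7, the tier the Metal feature-set tables list for simdgroup matrix multiply.

    /* Usage sketch (assumption: wsp_ggml_metal_init(n_cb) is the context
     * initializer; wsp_ggml_metal_free() appears in the hunk above). */
    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-metal.h"

    int main(void) {
        struct wsp_ggml_metal_context * ctx = wsp_ggml_metal_init(/*n_cb=*/1);
        if (ctx == NULL) {
            fprintf(stderr, "Metal is not available on this device\n");
            return 1;
        }

        /* family 7 -> MTLGPUFamilyApple1 + 7 - 1 == MTLGPUFamilyApple7 (A14/M1 class) */
        bool has_apple7 = wsp_ggml_metal_supports_family(ctx, 7);
        printf("Apple7 GPU family supported: %s\n", has_apple7 ? "yes" : "no");

        wsp_ggml_metal_free(ctx);
        return 0;
    }
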
30 changes: 20 additions & 10 deletions cpp/ggml-metal.m
@@ -346,9 +346,9 @@ static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char * forma
}

WSP_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-WSP_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
if (ctx->device.maxTransferRate != 0) {
-WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
} else {
WSP_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
}
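
Throughout this file the logging divisor changes from 1024.0 / 1024.0 (binary MiB) to 1e6 (decimal MB) while the printed label stays "MB", so the reported numbers come out roughly 5% larger for the same byte count. A quick arithmetic check, illustration only:

    /* Illustration only: the same byte count under the old and new divisors. */
    #include <stdio.h>

    int main(void) {
        const double bytes = 8.0 * 1024 * 1024 * 1024;   /* e.g. an 8 GiB working-set limit */
        printf("old (/1024/1024): %8.2f MB\n", bytes / 1024.0 / 1024.0);  /* 8192.00 */
        printf("new (/1e6):       %8.2f MB\n", bytes / 1e6);              /* 8589.93 */
        return 0;
    }
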
@@ -447,6 +447,10 @@ void wsp_ggml_metal_host_free(void * data) {
free(data);
}

+bool wsp_ggml_metal_supports_family(struct wsp_ggml_metal_context * ctx, int family) {
+return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+}

void wsp_ggml_metal_set_n_cb(struct wsp_ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = MIN(n_cb, WSP_GGML_METAL_MAX_BUFFERS);
}
@@ -529,11 +533,11 @@ bool wsp_ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
-WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
return false;
}

-WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);

++ctx->n_buffers;
} else {
@@ -553,11 +557,11 @@ bool wsp_ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
-WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
return false;
}

-WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+WSP_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
if (i + size_step < size) {
WSP_GGML_METAL_LOG_INFO("\n");
}
@@ -568,16 +572,16 @@

#if TARGET_OS_OSX
WSP_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ctx->device.currentAllocatedSize / 1e6,
+ctx->device.recommendedMaxWorkingSetSize / 1e6);

if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
WSP_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else {
WSP_GGML_METAL_LOG_INFO("\n");
}
#else
-WSP_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+WSP_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
#endif
}

@@ -1060,7 +1064,7 @@ void wsp_ggml_metal_graph_compute(
WSP_GGML_ASSERT(ne00 == ne10);
WSP_GGML_ASSERT(ne03 == ne13);

-const uint gqa = ne12/ne02;
+const unsigned int gqa = ne12/ne02;

// find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel
@@ -1739,3 +1743,9 @@ void wsp_ggml_backend_metal_set_n_cb(wsp_ggml_backend_t backend, int n_cb) {

wsp_ggml_metal_set_n_cb(ctx, n_cb);
}

+bool wsp_ggml_backend_metal_supports_family(wsp_ggml_backend_t backend, int family) {
+struct wsp_ggml_metal_context * ctx = (struct wsp_ggml_metal_context *)backend->context;
+
+return wsp_ggml_metal_supports_family(ctx, family);
+}
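
A sketch of the backend-level wrapper in use. The wsp_ggml_backend_metal_init() and wsp_ggml_backend_free() calls are assumed from the ggml-backend API and are not part of this diff; treat the whole snippet as illustrative.

    /* Illustrative only; the init/free calls are assumptions, not shown in this diff. */
    #include <stdio.h>
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    int main(void) {
        wsp_ggml_backend_t backend = wsp_ggml_backend_metal_init();
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize the Metal backend\n");
            return 1;
        }

        /* probe which Apple GPU families the device reports */
        for (int family = 1; family <= 8; ++family) {
            printf("MTLGPUFamilyApple%d: %s\n", family,
                   wsp_ggml_backend_metal_supports_family(backend, family) ? "supported" : "not supported");
        }

        wsp_ggml_backend_free(backend);
        return 0;
    }
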
5 changes: 5 additions & 0 deletions cpp/ggml-quants.c
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
float max = x[0];
float sum_w = weights[0];
float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+for (volatile int i = 1; i < n; ++i) {
+#else
for (int i = 1; i < n; ++i) {
+#endif
if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i];
float w = weights[i];
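
The workaround above is opt-in: it only takes effect when the translation unit is compiled with HAVE_BUGGY_APPLE_LINKER defined, and how the build sets that is outside this diff. A minimal, self-contained demonstration of the same guard pattern, compiled for example with cc -DHAVE_BUGGY_APPLE_LINKER demo.c:

    /* Stand-alone demonstration of the guard (not part of ggml-quants.c). */
    #include <stdio.h>

    int main(void) {
        const float x[4] = { 3.0f, 1.0f, 4.0f, 1.5f };
        float min = x[0];
        float max = x[0];
    #ifdef HAVE_BUGGY_APPLE_LINKER
        /* 'volatile' keeps the compiler from unrolling the loop, sidestepping
         * the Apple ld64 1015.7 issue mentioned in the diff above */
        for (volatile int i = 1; i < 4; ++i) {
    #else
        for (int i = 1; i < 4; ++i) {
    #endif
            if (x[i] < min) min = x[i];
            if (x[i] > max) max = x[i];
        }
        printf("min=%.2f max=%.2f\n", min, max);
        return 0;
    }
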
