Skip to content

Commit

Permalink
feat: sync whisper.cpp (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
jhen0409 committed Nov 15, 2023
1 parent 3c74201 commit f1b290b
Show file tree
Hide file tree
Showing 14 changed files with 1,526 additions and 1,719 deletions.
2 changes: 1 addition & 1 deletion cpp/coreml/whisper-encoder-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v

/**
Make a prediction using the convenience interface
@param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
@param logmel_data as 1 × n_mel × 3000 3-dimensional array of floats:
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as whisper_encoder_implOutput
*/
Expand Down
4 changes: 4 additions & 0 deletions cpp/coreml/whisper-encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Code is derived from the work of Github user @wangchou
// ref: https://github.com/wangchou/callCoreMLFromCpp

#include <stdint.h>

#if __cplusplus
extern "C" {
#endif
Expand All @@ -14,6 +16,8 @@ void whisper_coreml_free(struct whisper_coreml_context * ctx);

// Encode a mel spectrogram with the Core ML encoder held by `ctx`.
//
//   ctx   - Core ML context (released with whisper_coreml_free)
//   n_ctx - size of the innermost (contiguous) dimension of `mel`
//   n_mel - size of the middle dimension of `mel`
//   mel   - input floats; the implementation wraps this pointer, without a
//           deallocator, as a float32 array of shape [1, n_mel, n_ctx] with
//           strides [n_ctx*n_mel, n_ctx, 1], so it must hold at least
//           n_mel*n_ctx values
//   out   - output buffer for the encoder result
//           NOTE(review): required size is not visible from this declaration - confirm against the .mm
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
int64_t n_ctx,
int64_t n_mel,
float * mel,
float * out);

Expand Down
6 changes: 4 additions & 2 deletions cpp/coreml/whisper-encoder.mm
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,15 @@ void whisper_coreml_free(struct whisper_coreml_context * ctx) {

void whisper_coreml_encode(
const whisper_coreml_context * ctx,
int64_t n_ctx,
int64_t n_mel,
float * mel,
float * out) {
MLMultiArray * inMultiArray = [
[MLMultiArray alloc] initWithDataPointer: mel
shape: @[@1, @80, @3000]
shape: @[@1, @(n_mel), @(n_ctx)]
dataType: MLMultiArrayDataTypeFloat32
strides: @[@(240000), @(3000), @1]
strides: @[@(n_ctx*n_mel), @(n_ctx), @1]
deallocator: nil
error: nil
];
Expand Down
108 changes: 107 additions & 1 deletion cpp/ggml-metal-whisper.metal
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ kernel void kernel_mul_mv_f32_f32(
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
uint tiisg[[thread_index_in_simdgroup]]) {

const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F32_F32;
Expand Down Expand Up @@ -844,6 +844,79 @@ kernel void kernel_mul_mv_f32_f32(
}
}

#define N_F16_F16 4

// Matrix-vector style product of an F16 src0 with an F16 src1, F32 result.
//
// One threadgroup handles one src0 row (tgpig.x) against up to N_F16_F16
// consecutive src1 rows (starting at tgpig.y*N_F16_F16) for one matrix along
// dimension 2 (tgpig.z). Each lane accumulates a strided partial dot product
// and the lanes are combined with simd_sum; lane 0 writes the result.
//
//   src0, src1 - raw byte pointers; rows are located via the nb* byte strides
//   dst        - float output, indexed as [im][r1][r0] using ne1 and ne0
//   ne00       - number of elements in a row (the dot-product length)
//   ne11       - number of src1 rows; rows past this bound are skipped
//   ne02, ne12 - dimension-2 extents; one src0 matrix is reused for every
//                ne12/ne02 consecutive src1 matrices
//
// The lane stride of 32 assumes a 32-wide simdgroup.
kernel void kernel_mul_mv_f16_f16(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]]) {

    const int64_t row0       = tgpig.x;              // src0 row handled by this threadgroup
    const int64_t first_row1 = tgpig.y*N_F16_F16;    // first src1 row of this threadgroup's tile
    const int64_t i_mat      = tgpig.z;              // matrix index along dimension 2

    device const half  * a  = (device const half *) (src0 + row0*nb01 + i_mat/(ne12/ne02)*nb02);
    device const half4 * a4 = (device const half4 *) a;

    // Short rows use the scalar loop; rows of 128+ elements use half4 loads
    // plus a scalar tail for the ne00 % 4 leftover elements.
    const bool vectorized = !(ne00 < 128);

    for (int t = 0; t < N_F16_F16; ++t) {
        const int row1 = first_row1 + t;
        if (row1 >= ne11) {
            // uniform across the simdgroup: depends only on threadgroup position
            break;
        }

        device const half * b = (device const half *) (src1 + row1*nb11 + i_mat*nb12);

        float partial = 0;
        if (vectorized) {
            device const half4 * b4 = (device const half4 *) b;
            for (int i = tiisg; i < ne00/4; i += 32) {
                for (int k = 0; k < 4; ++k) {
                    partial += (half) a4[i][k] * b4[i][k];
                }
            }
        } else {
            for (int i = tiisg; i < ne00; i += 32) {
                partial += (half) a[i] * (half) b[i];
            }
        }

        float total = simd_sum(partial);

        if (tiisg == 0) {
            if (vectorized) {
                // elements not covered by the half4 loop
                for (int i = 4*(ne00/4); i < ne00; ++i) {
                    total += (half) a[i] * b[i];
                }
            }
            dst[i_mat*ne1*ne0 + row1*ne0 + row0] = total;
        }
    }
}

kernel void kernel_mul_mv_f16_f32_1row(
device const char * src0,
device const char * src1,
Expand Down Expand Up @@ -1229,6 +1302,39 @@ kernel void kernel_rope(
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;

// im2col: gathers patches of the float input `x` into the half output `dst`,
// writing zero for positions that fall in the padding.
//
// Each thread writes exactly one element of dst. The position is derived from
// the launch geometry: threadgroup position supplies (channel, out-row,
// out-col) and thread position supplies (batch, kernel-row, kernel-col) - this
// matches a dispatch of (IC, OH, OW) threadgroups with (N, KH, KW) threads.
//
//   ofs0, ofs1 - element offsets in x per tpitg[0] step and per tgpig[0] step
//   IW, IH     - input width / height used for the bounds check and row pitch
//   CHW        - length of one dst row
//   s0/s1, p0/p1, d0/d1 - stride, padding and dilation along width / height
kernel void kernel_im2col_f16(
        device const float * x,
        device        half * dst,
        constant   int32_t & ofs0,
        constant   int32_t & ofs1,
        constant   int32_t & IW,
        constant   int32_t & IH,
        constant   int32_t & CHW,
        constant   int32_t & s0,
        constant   int32_t & s1,
        constant   int32_t & p0,
        constant   int32_t & p1,
        constant   int32_t & d0,
        constant   int32_t & d1,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3  tgpg[[threadgroups_per_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    // Indices stay unsigned so the arithmetic below wraps the same way as
    // operating on tgpig/tpitg directly.
    const uint i_ch = tgpig[0];
    const uint i_oh = tgpig[1];
    const uint i_ow = tgpig[2];

    const uint i_n  = tpitg[0];
    const uint i_kh = tpitg[1];
    const uint i_kw = tpitg[2];

    // Input coordinates sampled by this (output position, kernel tap) pair.
    const int32_t src_w = i_ow * s0 + i_kw * d0 - p0;
    const int32_t src_h = i_oh * s1 + i_kh * d1 - p1;

    // dst is addressed as [row = (n, oh, ow)][col = (ch, kh, kw)], row length CHW.
    const uint dst_row = (i_n * tgpg[1] * tgpg[2] + i_oh * tgpg[2] + i_ow) * CHW;
    const uint dst_col = (i_ch * (ntg[1] * ntg[2]) + i_kh * ntg[2] + i_kw);
    const int32_t dst_idx = dst_row + dst_col;

    const bool inside = !(src_h < 0 || src_h >= IH || src_w < 0 || src_w >= IW);

    if (inside) {
        const int32_t src_base = i_n * ofs0 + i_ch * ofs1;
        dst[dst_idx] = x[src_base + src_h * IW + src_w];
    } else {
        // padding region
        dst[dst_idx] = 0.0f;
    }
}

kernel void kernel_cpy_f16_f16(
device const half * src0,
device half * dst,
Expand Down
2 changes: 1 addition & 1 deletion cpp/ggml-metal.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define WSP_GGML_METAL_MAX_BUFFERS 16
#define WSP_GGML_METAL_MAX_BUFFERS 64
#define WSP_GGML_METAL_MAX_COMMAND_BUFFERS 32

struct wsp_ggml_tensor;
Expand Down
89 changes: 79 additions & 10 deletions cpp/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
WSP_GGML_METAL_DECL_KERNEL(rms_norm);
WSP_GGML_METAL_DECL_KERNEL(norm);
WSP_GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
WSP_GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
WSP_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
WSP_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
WSP_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
Expand Down Expand Up @@ -114,6 +115,7 @@
WSP_GGML_METAL_DECL_KERNEL(rope_f32);
WSP_GGML_METAL_DECL_KERNEL(rope_f16);
WSP_GGML_METAL_DECL_KERNEL(alibi_f32);
WSP_GGML_METAL_DECL_KERNEL(im2col_f16);
WSP_GGML_METAL_DECL_KERNEL(cpy_f32_f16);
WSP_GGML_METAL_DECL_KERNEL(cpy_f32_f32);
WSP_GGML_METAL_DECL_KERNEL(cpy_f16_f16);
Expand All @@ -126,7 +128,7 @@
// MSL code
// TODO: move the contents here when ready
// for now it is easier to work in a separate file
static NSString * const msl_library_source = @"see metal.metal";
//static NSString * const msl_library_source = @"see metal.metal";

// Here to assist with NSBundle Path Hack
@interface WSPGGMLMetalClass : NSObject
Expand All @@ -142,7 +144,8 @@ void wsp_ggml_metal_log_set_callback(wsp_ggml_log_callback log_callback, void *
wsp_ggml_metal_log_user_data = user_data;
}

static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char* format, ...){
WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char * format, ...){
if (wsp_ggml_metal_log_callback != NULL) {
va_list args;
va_start(args, format);
Expand Down Expand Up @@ -287,6 +290,7 @@ static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char* format
WSP_GGML_METAL_ADD_KERNEL(rms_norm);
WSP_GGML_METAL_ADD_KERNEL(norm);
WSP_GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
WSP_GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
WSP_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
WSP_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
WSP_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
Expand Down Expand Up @@ -317,6 +321,7 @@ static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char* format
WSP_GGML_METAL_ADD_KERNEL(rope_f32);
WSP_GGML_METAL_ADD_KERNEL(rope_f16);
WSP_GGML_METAL_ADD_KERNEL(alibi_f32);
WSP_GGML_METAL_ADD_KERNEL(im2col_f16);
WSP_GGML_METAL_ADD_KERNEL(cpy_f32_f16);
WSP_GGML_METAL_ADD_KERNEL(cpy_f32_f32);
WSP_GGML_METAL_ADD_KERNEL(cpy_f16_f16);
Expand All @@ -335,7 +340,7 @@ static void wsp_ggml_metal_log(enum wsp_ggml_log_level level, const char* format
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
if ([ctx->device supportsFamily:i]) {
WSP_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
WSP_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
break;
}
}
Expand Down Expand Up @@ -384,6 +389,7 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
WSP_GGML_METAL_DEL_KERNEL(rms_norm);
WSP_GGML_METAL_DEL_KERNEL(norm);
WSP_GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
WSP_GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
WSP_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
WSP_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
WSP_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
Expand Down Expand Up @@ -414,6 +420,7 @@ void wsp_ggml_metal_free(struct wsp_ggml_metal_context * ctx) {
WSP_GGML_METAL_DEL_KERNEL(rope_f32);
WSP_GGML_METAL_DEL_KERNEL(rope_f16);
WSP_GGML_METAL_DEL_KERNEL(alibi_f32);
WSP_GGML_METAL_DEL_KERNEL(im2col_f16);
WSP_GGML_METAL_DEL_KERNEL(cpy_f32_f16);
WSP_GGML_METAL_DEL_KERNEL(cpy_f32_f32);
WSP_GGML_METAL_DEL_KERNEL(cpy_f16_f16);
Expand Down Expand Up @@ -461,6 +468,10 @@ int wsp_ggml_metal_if_optimized(struct wsp_ggml_metal_context * ctx) {

const int64_t tsize = wsp_ggml_nbytes(t);

if (t->buffer && t->buffer->backend && t->buffer->backend->context) {
ctx = t->buffer->backend->context;
}

// find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
Expand Down Expand Up @@ -561,7 +572,7 @@ bool wsp_ggml_metal_add_buffer(
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
WSP_GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
WSP_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else {
WSP_GGML_METAL_LOG_INFO("\n");
}
Expand Down Expand Up @@ -1127,20 +1138,26 @@ void wsp_ggml_metal_graph_compute(
switch (src0t) {
case WSP_GGML_TYPE_F32:
{
WSP_GGML_ASSERT(src1t == WSP_GGML_TYPE_F32);
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
nrows = 4;
} break;
case WSP_GGML_TYPE_F16:
{
nth0 = 32;
nth1 = 1;
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
nrows = ne11;
if (src1t == WSP_GGML_TYPE_F32) {
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
nrows = ne11;
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
nrows = 4;
}
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16];
nrows = 4;
}
} break;
Expand Down Expand Up @@ -1452,6 +1469,58 @@ void wsp_ggml_metal_graph_compute(

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
// IM2COL: encode a dispatch of the im2col_f16 pipeline.
// src0 contributes only its shape (kernel width/height); the data read by the
// kernel comes from src1, and the result is written to dst.
case WSP_GGML_OP_IM2COL:
{
// Only the F16 kernel shape / F32 input / F16 output combination is supported.
WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F16);

// op_params layout: [0..1] s0,s1  [2..3] p0,p1  [4..5] d0,d1  [6] 1 => 2D
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;

// Input dimensions from src1; in the 1D case the height is fixed to 1 and
// the batch/channel axes sit one position lower.
const int32_t N = src1->ne[is_2D ? 3 : 2];
const int32_t IC = src1->ne[is_2D ? 2 : 1];
const int32_t IH = is_2D ? src1->ne[1] : 1;
const int32_t IW = src1->ne[0];

// Kernel dimensions from src0.
const int32_t KH = is_2D ? src0->ne[1] : 1;
const int32_t KW = src0->ne[0];

// Output dimensions from dst.
const int32_t OH = is_2D ? dst->ne[2] : 1;
const int32_t OW = dst->ne[1];

// Length of one dst row as used by the kernel.
const int32_t CHW = IC * KH * KW;

// Byte strides of src1 converted to element offsets; the divisor 4 is
// presumably sizeof(float), consistent with the F32 assert on src1 above.
const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;

// NOTE(review): src0->type was asserted to be F16 above, so the F32 and
// default arms look unreachable if WSP_GGML_ASSERT aborts - confirm.
switch (src0->type) {
case WSP_GGML_TYPE_F32: WSP_GGML_ASSERT(false && "not implemented"); break;
case WSP_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break;
default: WSP_GGML_ASSERT(false);
};

// Argument indices must match the parameter order of kernel_im2col_f16:
// 0 = input (src1), 1 = output (dst), 2..12 = scalar constants.
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2];
[encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3];
[encoder setBytes:&IW length:sizeof( int32_t) atIndex:4];
[encoder setBytes:&IH length:sizeof( int32_t) atIndex:5];
[encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6];
[encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7];
[encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8];
[encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9];
[encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10];
[encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11];
[encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12];

// One threadgroup per (channel, out-row, out-col); one thread per
// (batch, kernel-row, kernel-col).
// NOTE(review): N*KH*KW is used directly as the threadgroup size with no
// check against the pipeline's maximum threads per threadgroup - confirm
// callers keep it within the device limit.
[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
} break;
case WSP_GGML_OP_DUP:
case WSP_GGML_OP_CPY:
case WSP_GGML_OP_CONT:
Expand Down

0 comments on commit f1b290b

Please sign in to comment.