Merge branch 'master' of https://github.com/nomic-ai/llama.cpp into nomic-mpt
nomic-ai · Oct 19, 2023 · a8ed8c8 · a8ed8c8
2 parents 34a3fae + ffe96e1
commit a8ed8c8
Show file tree

Hide file tree

Showing 9 changed files with 46 additions and 675 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -463,6 +463,7 @@ if (LLAMA_KOMPUTE)
 
     if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         message(STATUS "Kompute found")
+        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
         add_subdirectory(kompute)
 
         # Compile our shaders
@@ -479,12 +480,7 @@ if (LLAMA_KOMPUTE)
           kompute/op_norm.comp
           kompute/op_rmsnorm.comp
           kompute/op_diagmask.comp
-          kompute/op_mul_mat_mat_f16.comp
           kompute/op_mul_mat_mat_f32.comp
-          kompute/op_mul_mat_mat_q4_0.comp
-          kompute/op_mul_mat_mat_q4_1.comp
-          kompute/op_mul_mat_mat_q8_0.comp
-          kompute/op_mul_mat_mat_q6_k.comp
           kompute/op_mul_mat_f16.comp
           kompute/op_mul_mat_q8_0.comp
           kompute/op_mul_mat_q4_0.comp
@@ -515,12 +511,7 @@ if (LLAMA_KOMPUTE)
           shaderop_norm.h
           shaderop_rmsnorm.h
           shaderop_diagmask.h
-          shaderop_mul_mat_mat_f16.h
           shaderop_mul_mat_mat_f32.h
-          shaderop_mul_mat_mat_q4_0.h
-          shaderop_mul_mat_mat_q4_1.h
-          shaderop_mul_mat_mat_q8_0.h
-          shaderop_mul_mat_mat_q6_k.h
           shaderop_mul_mat_f16.h
           shaderop_mul_mat_q8_0.h
           shaderop_mul_mat_q4_0.h

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp
diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp
@@ -14,7 +14,8 @@
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
 
-// layout(local_size_x = 8) in;
+// device subgroup size
+layout (local_size_x_id = 0) in;
 
 layout(binding = 0) readonly buffer tensorInA { float inA[]; };
 layout(binding = 1) readonly buffer tensorInB { float inB[]; };
@@ -40,14 +41,20 @@ pcs;
 
 
 void main() {
-  uvec3 gid = gl_GlobalInvocationID;
+  uvec3 gid = gl_WorkGroupID;
 
-  const uint x = (gid.x*pcs.nb01 + gid.z/(pcs.ne12/pcs.ne02)*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
-  const uint y = (gid.y*pcs.nb11 + gid.z/(pcs.ne02/pcs.ne12)*pcs.nb12) / 4 + pcs.inBOff; // based from inB
+  uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
+  uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
+
+  const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
+  const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
   float sum = 0.0f;
-  for (uint i = 0; i < pcs.ne00; i ++) {
+  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
       sum += float(inA[x+i]) * float(inB[y+i]);
   }
 
-  out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum;
-}
+  const float all_sum = subgroupAdd(sum);
+  if (subgroupElect()) {
+    out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
+  }
+}
diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp
diff --git a/kompute/op_mul_mat_mat_q4_1.comp b/kompute/op_mul_mat_mat_q4_1.comp
diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp